Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 1 | /************************************************* |
| 2 | * Perl-Compatible Regular Expressions * |
| 3 | *************************************************/ |
| 4 | |
| 5 | /* PCRE is a library of functions to support regular expressions whose syntax |
| 6 | and semantics are as close as possible to those of the Perl 5 language. |
| 7 | |
| 8 | Written by Philip Hazel |
| 9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 10 | New API code Copyright (c) 2016-2022 University of Cambridge |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 11 | |
| 12 | ----------------------------------------------------------------------------- |
| 13 | Redistribution and use in source and binary forms, with or without |
| 14 | modification, are permitted provided that the following conditions are met: |
| 15 | |
| 16 | * Redistributions of source code must retain the above copyright notice, |
| 17 | this list of conditions and the following disclaimer. |
| 18 | |
| 19 | * Redistributions in binary form must reproduce the above copyright |
| 20 | notice, this list of conditions and the following disclaimer in the |
| 21 | documentation and/or other materials provided with the distribution. |
| 22 | |
| 23 | * Neither the name of the University of Cambridge nor the names of its |
| 24 | contributors may be used to endorse or promote products derived from |
| 25 | this software without specific prior written permission. |
| 26 | |
| 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 37 | POSSIBILITY OF SUCH DAMAGE. |
| 38 | ----------------------------------------------------------------------------- |
| 39 | */ |
| 40 | |
| 41 | |
| 42 | #ifdef HAVE_CONFIG_H |
| 43 | #include "config.h" |
| 44 | #endif |
| 45 | |
| 46 | #define NLBLOCK cb /* Block containing newline information */ |
| 47 | #define PSSTART start_pattern /* Field containing processed string start */ |
| 48 | #define PSEND end_pattern /* Field containing processed string end */ |
| 49 | |
| 50 | #include "pcre2_internal.h" |
| 51 | |
| 52 | /* In rare error cases debugging might require calling pcre2_printint(). */ |
| 53 | |
| 54 | #if 0 |
| 55 | #ifdef EBCDIC |
| 56 | #define PRINTABLE(c) ((c) >= 64 && (c) < 255) |
| 57 | #else |
| 58 | #define PRINTABLE(c) ((c) >= 32 && (c) < 127) |
| 59 | #endif |
| 60 | #include "pcre2_printint.c" |
| 61 | #define DEBUG_CALL_PRINTINT |
| 62 | #endif |
| 63 | |
| 64 | /* Other debugging code can be enabled by these defines. */ |
| 65 | |
| 66 | /* #define DEBUG_SHOW_CAPTURES */ |
| 67 | /* #define DEBUG_SHOW_PARSED */ |
| 68 | |
| 69 | /* There are a few things that vary with different code unit sizes. Handle them |
| 70 | by defining macros in order to minimize #if usage. */ |
| 71 | |
| 72 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 73 | #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5 |
| 74 | #define XDIGIT(c) xdigitab[c] |
| 75 | |
| 76 | #else /* Either 16-bit or 32-bit */ |
| 77 | #define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff) |
| 78 | |
| 79 | #if PCRE2_CODE_UNIT_WIDTH == 16 |
| 80 | #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6 |
| 81 | |
| 82 | #else /* 32-bit */ |
| 83 | #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6 |
| 84 | #endif |
| 85 | #endif |
| 86 | |
| 87 | /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which |
| 88 | consists of uint32_t elements. Assume that if uint32_t can't hold it, two of |
| 89 | them will be able to (i.e. assume a 64-bit world). */ |
| 90 | |
/* Every macro argument is parenthesized so that an expression argument (for
example a conditional or an arithmetic expression) expands with the intended
precedence. In the 64-bit variant the multi-statement macros are brace blocks
and must therefore be used as complete statements. Note that the arguments are
evaluated more than once in the 64-bit variant, so they must not have side
effects. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) *(p)++ = (s)
#define GETOFFSET(s,p) (s) = *(p)++
#define GETPLUSOFFSET(s,p) (s) = *(++(p))
#define READPLUSOFFSET(s,p) (s) = (p)[1]
#define SKIPOFFSET(p) (p)++
#define SIZEOFFSET 1
#else
/* High-order 32 bits are stored first, then the low-order 32 bits. */
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((s) >> 32); *(p)++ = (uint32_t)((s) & 0xffffffff); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) (p) += 2
#define SIZEOFFSET 2
#endif
| 110 | |
| 111 | /* Macros for manipulating elements of the parsed pattern vector. */ |
| 112 | |
/* META_CODE extracts the item code held in the top 16 bits of an element,
META_DATA extracts the 16 bits of additional data, and META_DIFF gives the
distance between two codes as a small integer. The arguments are parenthesized
so that expression arguments such as (a | b) are handled correctly. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
| 116 | |
| 117 | /* Function definitions to allow mutual recursion */ |
| 118 | |
| 119 | #ifdef SUPPORT_UNICODE |
| 120 | static unsigned int |
| 121 | add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, |
| 122 | compile_block *, const uint32_t *, unsigned int); |
| 123 | #endif |
| 124 | |
| 125 | static int |
| 126 | compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t, |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 127 | uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *, |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 128 | compile_block *, PCRE2_SIZE *); |
| 129 | |
| 130 | static int |
| 131 | get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *, |
| 132 | compile_block *); |
| 133 | |
| 134 | static BOOL |
| 135 | set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *, |
| 136 | compile_block *); |
| 137 | |
| 138 | static int |
| 139 | check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *, |
| 140 | compile_block *, int *); |
| 141 | |
| 142 | |
| 143 | /************************************************* |
| 144 | * Code parameters and static tables * |
| 145 | *************************************************/ |
| 146 | |
| 147 | #define MAX_GROUP_NUMBER 65535u |
| 148 | #define MAX_REPEAT_COUNT 65535u |
| 149 | #define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1) |
| 150 | |
| 151 | /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in |
| 152 | different ways in the different pattern scans. The parsing and group- |
| 153 | identifying pre-scan uses it to handle nesting, and needs it to be 16-bit |
| 154 | aligned for this. Having defined the size in code units, we set up |
| 155 | C16_WORK_SIZE as the number of elements in the 16-bit vector. |
| 156 | |
| 157 | During the first compiling phase, when determining how much memory is required, |
| 158 | the regex is partly compiled into this space, but the compiled parts are |
| 159 | discarded as soon as they can be, so that hopefully there will never be an |
| 160 | overrun. The code does, however, check for an overrun, which can occur for |
| 161 | pathological patterns. The size of the workspace depends on LINK_SIZE because |
| 162 | the length of compiled items varies with this. |
| 163 | |
| 164 | In the real compile phase, this workspace is not currently used. */ |
| 165 | |
| 166 | #define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */ |
| 167 | |
| 168 | #define C16_WORK_SIZE \ |
| 169 | ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t)) |
| 170 | |
| 171 | /* A uint32_t vector is used for caching information about the size of |
| 172 | capturing groups, to improve performance. A default is created on the stack of |
| 173 | this size. */ |
| 174 | |
| 175 | #define GROUPINFO_DEFAULT_SIZE 256 |
| 176 | |
| 177 | /* The overrun tests check for a slightly smaller size so that they detect the |
| 178 | overrun before it actually does run off the end of the data block. */ |
| 179 | |
| 180 | #define WORK_SIZE_SAFETY_MARGIN (100) |
| 181 | |
| 182 | /* This value determines the size of the initial vector that is used for |
| 183 | remembering named groups during the pre-compile. It is allocated on the stack, |
| 184 | but if it is too small, it is expanded, in a similar way to the workspace. The |
| 185 | value is the number of slots in the list. */ |
| 186 | |
| 187 | #define NAMED_GROUP_LIST_SIZE 20 |
| 188 | |
| 189 | /* The pre-compiling pass over the pattern creates a parsed pattern in a vector |
| 190 | of uint32_t. For short patterns this lives on the stack, with this size. Heap |
| 191 | memory is used for longer patterns. */ |
| 192 | |
| 193 | #define PARSED_PATTERN_DEFAULT_SIZE 1024 |
| 194 | |
| 195 | /* Maximum length value to check against when making sure that the variable |
| 196 | that holds the compiled pattern length does not overflow. We make it a bit less |
| 197 | than INT_MAX to allow for adding in group terminating code units, so that we |
| 198 | don't have to check them every time. */ |
| 199 | |
| 200 | #define OFLOW_MAX (INT_MAX - 20) |
| 201 | |
| 202 | /* Code values for parsed patterns, which are stored in a vector of 32-bit |
| 203 | unsigned ints. Values less than META_END are literal data values. The coding |
| 204 | for identifying the item is in the top 16-bits, leaving 16 bits for the |
| 205 | additional data that some of them need. The META_CODE, META_DATA, and META_DIFF |
| 206 | macros are used to manipulate parsed pattern elements. |
| 207 | |
| 208 | NOTE: When these definitions are changed, the table of extra lengths for each |
| 209 | code (meta_extra_lengths, just below) must be updated to remain in step. */ |
| 210 | |
| 211 | #define META_END 0x80000000u /* End of pattern */ |
| 212 | |
| 213 | #define META_ALT 0x80010000u /* alternation */ |
| 214 | #define META_ATOMIC 0x80020000u /* atomic group */ |
| 215 | #define META_BACKREF 0x80030000u /* Back ref */ |
| 216 | #define META_BACKREF_BYNAME 0x80040000u /* \k'name' */ |
| 217 | #define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */ |
| 218 | #define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */ |
| 219 | #define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */ |
| 220 | #define META_CAPTURE 0x80080000u /* Capturing parenthesis */ |
| 221 | #define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */ |
| 222 | #define META_CLASS 0x800a0000u /* start non-empty class */ |
| 223 | #define META_CLASS_EMPTY 0x800b0000u /* empty class */ |
| 224 | #define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */ |
| 225 | #define META_CLASS_END 0x800d0000u /* end of non-empty class */ |
| 226 | #define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */ |
| 227 | #define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */ |
| 228 | #define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */ |
| 229 | #define META_COND_NAME 0x80110000u /* (?(<name>)... */ |
| 230 | #define META_COND_NUMBER 0x80120000u /* (?(digits)... */ |
| 231 | #define META_COND_RNAME 0x80130000u /* (?(R&name)... */ |
| 232 | #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ |
| 233 | #define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */ |
| 234 | #define META_DOLLAR 0x80160000u /* $ metacharacter */ |
| 235 | #define META_DOT 0x80170000u /* . metacharacter */ |
| 236 | #define META_ESCAPE 0x80180000u /* \d and friends */ |
| 237 | #define META_KET 0x80190000u /* closing parenthesis */ |
| 238 | #define META_NOCAPTURE 0x801a0000u /* no capture parens */ |
| 239 | #define META_OPTIONS 0x801b0000u /* (?i) and friends */ |
| 240 | #define META_POSIX 0x801c0000u /* POSIX class item */ |
| 241 | #define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */ |
| 242 | #define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */ |
| 243 | #define META_RANGE_LITERAL 0x801f0000u /* range defined literally */ |
| 244 | #define META_RECURSE 0x80200000u /* Recursion */ |
| 245 | #define META_RECURSE_BYNAME 0x80210000u /* (?&name) */ |
| 246 | #define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */ |
| 247 | |
| 248 | /* These must be kept together to make it easy to check that an assertion |
| 249 | is present where expected in a conditional group. */ |
| 250 | |
| 251 | #define META_LOOKAHEAD 0x80230000u /* (?= */ |
| 252 | #define META_LOOKAHEADNOT 0x80240000u /* (?! */ |
| 253 | #define META_LOOKBEHIND 0x80250000u /* (?<= */ |
| 254 | #define META_LOOKBEHINDNOT 0x80260000u /* (?<! */ |
| 255 | |
| 256 | /* These cannot be conditions */ |
| 257 | |
| 258 | #define META_LOOKAHEAD_NA 0x80270000u /* (*napla: */ |
| 259 | #define META_LOOKBEHIND_NA 0x80280000u /* (*naplb: */ |
| 260 | |
| 261 | /* These must be kept in this order, with consecutive values, and the _ARG |
| 262 | versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument |
| 263 | versions. */ |
| 264 | |
| 265 | #define META_MARK 0x80290000u /* (*MARK) */ |
| 266 | #define META_ACCEPT 0x802a0000u /* (*ACCEPT) */ |
| 267 | #define META_FAIL 0x802b0000u /* (*FAIL) */ |
| 268 | #define META_COMMIT 0x802c0000u /* These */ |
| 269 | #define META_COMMIT_ARG 0x802d0000u /* pairs */ |
| 270 | #define META_PRUNE 0x802e0000u /* must */ |
| 271 | #define META_PRUNE_ARG 0x802f0000u /* be */ |
| 272 | #define META_SKIP 0x80300000u /* kept */ |
| 273 | #define META_SKIP_ARG 0x80310000u /* in */ |
| 274 | #define META_THEN 0x80320000u /* this */ |
| 275 | #define META_THEN_ARG 0x80330000u /* order */ |
| 276 | |
| 277 | /* These must be kept in groups of adjacent 3 values, and all together. */ |
| 278 | |
| 279 | #define META_ASTERISK 0x80340000u /* * */ |
| 280 | #define META_ASTERISK_PLUS 0x80350000u /* *+ */ |
| 281 | #define META_ASTERISK_QUERY 0x80360000u /* *? */ |
| 282 | #define META_PLUS 0x80370000u /* + */ |
| 283 | #define META_PLUS_PLUS 0x80380000u /* ++ */ |
| 284 | #define META_PLUS_QUERY 0x80390000u /* +? */ |
| 285 | #define META_QUERY 0x803a0000u /* ? */ |
| 286 | #define META_QUERY_PLUS 0x803b0000u /* ?+ */ |
| 287 | #define META_QUERY_QUERY 0x803c0000u /* ?? */ |
| 288 | #define META_MINMAX 0x803d0000u /* {n,m} repeat */ |
| 289 | #define META_MINMAX_PLUS 0x803e0000u /* {n,m}+ repeat */ |
| 290 | #define META_MINMAX_QUERY 0x803f0000u /* {n,m}? repeat */ |
| 291 | |
| 292 | #define META_FIRST_QUANTIFIER META_ASTERISK |
| 293 | #define META_LAST_QUANTIFIER META_MINMAX_QUERY |
| 294 | |
/* This is a special "meta code" that is used only to distinguish (*asr: from
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
therefore no need for it to have a length entry, so use a high value. */
| 299 | |
| 300 | #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u |
| 301 | |
| 302 | /* Table of extra lengths for each of the meta codes. Must be kept in step with |
| 303 | the definitions above. For some items these values are a basic length to which |
| 304 | a variable amount has to be added. */ |
| 305 | |
| 306 | static unsigned char meta_extra_lengths[] = { |
| 307 | 0, /* META_END */ |
| 308 | 0, /* META_ALT */ |
| 309 | 0, /* META_ATOMIC */ |
| 310 | 0, /* META_BACKREF - more if group is >= 10 */ |
| 311 | 1+SIZEOFFSET, /* META_BACKREF_BYNAME */ |
| 312 | 1, /* META_BIGVALUE */ |
| 313 | 3, /* META_CALLOUT_NUMBER */ |
| 314 | 3+SIZEOFFSET, /* META_CALLOUT_STRING */ |
| 315 | 0, /* META_CAPTURE */ |
| 316 | 0, /* META_CIRCUMFLEX */ |
| 317 | 0, /* META_CLASS */ |
| 318 | 0, /* META_CLASS_EMPTY */ |
| 319 | 0, /* META_CLASS_EMPTY_NOT */ |
| 320 | 0, /* META_CLASS_END */ |
| 321 | 0, /* META_CLASS_NOT */ |
| 322 | 0, /* META_COND_ASSERT */ |
| 323 | SIZEOFFSET, /* META_COND_DEFINE */ |
| 324 | 1+SIZEOFFSET, /* META_COND_NAME */ |
| 325 | 1+SIZEOFFSET, /* META_COND_NUMBER */ |
| 326 | 1+SIZEOFFSET, /* META_COND_RNAME */ |
| 327 | 1+SIZEOFFSET, /* META_COND_RNUMBER */ |
| 328 | 3, /* META_COND_VERSION */ |
| 329 | 0, /* META_DOLLAR */ |
| 330 | 0, /* META_DOT */ |
| 331 | 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */ |
| 332 | 0, /* META_KET */ |
| 333 | 0, /* META_NOCAPTURE */ |
| 334 | 1, /* META_OPTIONS */ |
| 335 | 1, /* META_POSIX */ |
| 336 | 1, /* META_POSIX_NEG */ |
| 337 | 0, /* META_RANGE_ESCAPED */ |
| 338 | 0, /* META_RANGE_LITERAL */ |
| 339 | SIZEOFFSET, /* META_RECURSE */ |
| 340 | 1+SIZEOFFSET, /* META_RECURSE_BYNAME */ |
| 341 | 0, /* META_SCRIPT_RUN */ |
| 342 | 0, /* META_LOOKAHEAD */ |
| 343 | 0, /* META_LOOKAHEADNOT */ |
| 344 | SIZEOFFSET, /* META_LOOKBEHIND */ |
| 345 | SIZEOFFSET, /* META_LOOKBEHINDNOT */ |
| 346 | 0, /* META_LOOKAHEAD_NA */ |
| 347 | SIZEOFFSET, /* META_LOOKBEHIND_NA */ |
| 348 | 1, /* META_MARK - plus the string length */ |
| 349 | 0, /* META_ACCEPT */ |
| 350 | 0, /* META_FAIL */ |
| 351 | 0, /* META_COMMIT */ |
| 352 | 1, /* META_COMMIT_ARG - plus the string length */ |
| 353 | 0, /* META_PRUNE */ |
| 354 | 1, /* META_PRUNE_ARG - plus the string length */ |
| 355 | 0, /* META_SKIP */ |
| 356 | 1, /* META_SKIP_ARG - plus the string length */ |
| 357 | 0, /* META_THEN */ |
| 358 | 1, /* META_THEN_ARG - plus the string length */ |
| 359 | 0, /* META_ASTERISK */ |
| 360 | 0, /* META_ASTERISK_PLUS */ |
| 361 | 0, /* META_ASTERISK_QUERY */ |
| 362 | 0, /* META_PLUS */ |
| 363 | 0, /* META_PLUS_PLUS */ |
| 364 | 0, /* META_PLUS_QUERY */ |
| 365 | 0, /* META_QUERY */ |
| 366 | 0, /* META_QUERY_PLUS */ |
| 367 | 0, /* META_QUERY_QUERY */ |
| 368 | 2, /* META_MINMAX */ |
| 369 | 2, /* META_MINMAX_PLUS */ |
| 370 | 2 /* META_MINMAX_QUERY */ |
| 371 | }; |
| 372 | |
| 373 | /* Types for skipping parts of a parsed pattern. */ |
| 374 | |
/* The three values are consecutive, starting at zero. */

enum {
  PSKIP_ALT = 0,
  PSKIP_CLASS,
  PSKIP_KET
};
| 376 | |
| 377 | /* Macro for setting individual bits in class bitmaps. It took some |
| 378 | experimenting to figure out how to stop gcc 5.3.0 from warning with |
| 379 | -Wconversion. This version gets a warning: |
| 380 | |
| 381 | #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7)) |
| 382 | |
| 383 | Let's hope the apparently less efficient version isn't actually so bad if the |
| 384 | compiler is clever with identical subexpressions. */ |
| 385 | |
/* Set bit number b in the byte-array bitmap a. The bitmap argument is
parenthesized so that a pointer expression such as (map + n) can be passed.
Both arguments are evaluated more than once, so they must not have side
effects. */

#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
| 387 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 388 | /* Values and flags for the unsigned xxcuflags variables that accompany xxcu |
| 389 | variables, which are concerned with first and required code units. A value |
| 390 | greater than or equal to REQ_NONE means "no code unit set"; otherwise the |
| 391 | matching xxcu variable is set, and the low valued bits are relevant. */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 392 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 393 | #define REQ_UNSET 0xffffffffu /* Not yet found anything */ |
| 394 | #define REQ_NONE 0xfffffffeu /* Found not fixed character */ |
| 395 | #define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */ |
| 396 | #define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 397 | |
| 398 | /* These flags are used in the groupinfo vector. */ |
| 399 | |
| 400 | #define GI_SET_FIXED_LENGTH 0x80000000u |
| 401 | #define GI_NOT_FIXED_LENGTH 0x40000000u |
| 402 | #define GI_FIXED_LENGTH_MASK 0x0000ffffu |
| 403 | |
| 404 | /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC |
| 405 | and is fast (a good compiler can turn it into a subtraction and unsigned |
| 406 | comparison). */ |
| 407 | |
| 408 | #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) |
| 409 | |
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
| 417 | |
| 418 | /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in |
| 419 | UTF-8 mode. */ |
| 420 | |
| 421 | #ifndef EBCDIC |
/* Sixteen entries per row; the comment on each row gives the code of its
first entry. Only '0'-'9' (0x30-0x39), 'A'-'F' (0x41-0x46) and 'a'-'f'
(0x61-0x66) have a value other than 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* 0 - 7 */
             0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,  /* 8 - ? */
  /* 0x40 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* @ - G */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* ` - g */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };
| 456 | |
| 457 | #else |
| 458 | |
| 459 | /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ |
| 460 | |
/* Sixteen entries per row; the comment on each row gives the code of its
first entry. Only 0x81-0x86 (a-f), 0xc1-0xc6 (A-F) and 0xf0-0xf9 (0-9) have a
value other than 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* a - f */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* A - F */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* 0 - 7 */
             0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff   /* 8 - 9 */
  };
| 495 | #endif /* EBCDIC */ |
| 496 | |
| 497 | |
| 498 | /* Table for handling alphanumeric escaped characters. Positive returns are |
| 499 | simple data values; negative values are for special things like \d and so on. |
| 500 | Zero means further processing is needed (for things like \x), or the escape is |
| 501 | invalid. */ |
| 502 | |
| 503 | /* This is the "normal" table for ASCII systems or for EBCDIC systems running |
| 504 | in UTF-8 mode. It runs from '0' to 'z'. */ |
| 505 | |
| 506 | #ifndef EBCDIC |
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters covered by
the escapes table. UPPER_CASE subtracts 32 from its argument, which is
parenthesized so that an expression argument is converted as a whole. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
| 510 | |
| 511 | static const short int escapes[] = { |
| 512 | 0, 0, |
| 513 | 0, 0, |
| 514 | 0, 0, |
| 515 | 0, 0, |
| 516 | 0, 0, |
| 517 | CHAR_COLON, CHAR_SEMICOLON, |
| 518 | CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, |
| 519 | CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, |
| 520 | CHAR_COMMERCIAL_AT, -ESC_A, |
| 521 | -ESC_B, -ESC_C, |
| 522 | -ESC_D, -ESC_E, |
| 523 | 0, -ESC_G, |
| 524 | -ESC_H, 0, |
| 525 | 0, -ESC_K, |
| 526 | 0, 0, |
| 527 | -ESC_N, 0, |
| 528 | -ESC_P, -ESC_Q, |
| 529 | -ESC_R, -ESC_S, |
| 530 | 0, 0, |
| 531 | -ESC_V, -ESC_W, |
| 532 | -ESC_X, 0, |
| 533 | -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, |
| 534 | CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, |
| 535 | CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, |
| 536 | CHAR_GRAVE_ACCENT, CHAR_BEL, |
| 537 | -ESC_b, 0, |
| 538 | -ESC_d, CHAR_ESC, |
| 539 | CHAR_FF, 0, |
| 540 | -ESC_h, 0, |
| 541 | 0, -ESC_k, |
| 542 | 0, 0, |
| 543 | CHAR_LF, 0, |
| 544 | -ESC_p, 0, |
| 545 | CHAR_CR, -ESC_s, |
| 546 | CHAR_HT, 0, |
| 547 | -ESC_v, -ESC_w, |
| 548 | 0, 0, |
| 549 | -ESC_z |
| 550 | }; |
| 551 | |
| 552 | #else |
| 553 | |
| 554 | /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. |
| 555 | It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code |
| 556 | is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a |
| 557 | because it is defined as 'a', which of course picks up the ASCII value. */ |
| 558 | |
/* In both variants the argument of UPPER_CASE is parenthesized so that an
expression argument is converted as a whole. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
| 568 | |
| 569 | static const short int escapes[] = { |
| 570 | /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0, |
| 571 | /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0, |
| 572 | /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p, |
| 573 | /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0, |
| 574 | /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0, |
| 575 | /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0, |
| 576 | /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| 577 | /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', |
| 578 | /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, |
| 579 | /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0, |
| 580 | /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P, |
| 581 | /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0, |
| 582 | /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X, |
| 583 | /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0, |
| 584 | /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| 585 | /* F8 */ 0, 0 |
| 586 | }; |
| 587 | |
| 588 | /* We also need a table of characters that may follow \c in an EBCDIC |
| 589 | environment for characters 0-31. */ |
| 590 | |
/* This table is only read, so it is const. It holds 32 characters followed by
the string's terminating zero. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
| 592 | |
| 593 | #endif /* EBCDIC */ |
| 594 | |
| 595 | |
| 596 | /* Table of special "verbs" like (*PRUNE). This is a short table, so it is |
| 597 | searched linearly. Put all the names into a single string, in order to reduce |
| 598 | the number of relocations when a shared library is dynamically linked. The |
| 599 | string is built from string macros so that it works in UTF-8 mode on EBCDIC |
| 600 | platforms. */ |
| 601 | |
| 602 | typedef struct verbitem { |
| 603 | unsigned int len; /* Length of verb name */ |
| 604 | uint32_t meta; /* Base META_ code */ |
| 605 | int has_arg; /* Argument requirement */ |
| 606 | } verbitem; |
| 607 | |
| 608 | static const char verbnames[] = |
| 609 | "\0" /* Empty name is a shorthand for MARK */ |
| 610 | STRING_MARK0 |
| 611 | STRING_ACCEPT0 |
| 612 | STRING_F0 |
| 613 | STRING_FAIL0 |
| 614 | STRING_COMMIT0 |
| 615 | STRING_PRUNE0 |
| 616 | STRING_SKIP0 |
| 617 | STRING_THEN; |
| 618 | |
| 619 | static const verbitem verbs[] = { |
| 620 | { 0, META_MARK, +1 }, /* > 0 => must have an argument */ |
| 621 | { 4, META_MARK, +1 }, |
| 622 | { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */ |
| 623 | { 1, META_FAIL, -1 }, |
| 624 | { 4, META_FAIL, -1 }, |
| 625 | { 6, META_COMMIT, 0 }, |
| 626 | { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */ |
| 627 | { 4, META_SKIP, 0 }, |
| 628 | { 4, META_THEN, 0 } |
| 629 | }; |
| 630 | |
| 631 | static const int verbcount = sizeof(verbs)/sizeof(verbitem); |
| 632 | |
| 633 | /* Verb opcodes, indexed by their META code offset from META_MARK. */ |
| 634 | |
| 635 | static const uint32_t verbops[] = { |
| 636 | OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE, |
| 637 | OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG }; |
| 638 | |
| 639 | /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */ |
| 640 | |
| 641 | typedef struct alasitem { |
| 642 | unsigned int len; /* Length of name */ |
| 643 | uint32_t meta; /* Base META_ code */ |
| 644 | } alasitem; |
| 645 | |
| 646 | static const char alasnames[] = |
| 647 | STRING_pla0 |
| 648 | STRING_plb0 |
| 649 | STRING_napla0 |
| 650 | STRING_naplb0 |
| 651 | STRING_nla0 |
| 652 | STRING_nlb0 |
| 653 | STRING_positive_lookahead0 |
| 654 | STRING_positive_lookbehind0 |
| 655 | STRING_non_atomic_positive_lookahead0 |
| 656 | STRING_non_atomic_positive_lookbehind0 |
| 657 | STRING_negative_lookahead0 |
| 658 | STRING_negative_lookbehind0 |
| 659 | STRING_atomic0 |
| 660 | STRING_sr0 |
| 661 | STRING_asr0 |
| 662 | STRING_script_run0 |
| 663 | STRING_atomic_script_run; |
| 664 | |
| 665 | static const alasitem alasmeta[] = { |
| 666 | { 3, META_LOOKAHEAD }, |
| 667 | { 3, META_LOOKBEHIND }, |
| 668 | { 5, META_LOOKAHEAD_NA }, |
| 669 | { 5, META_LOOKBEHIND_NA }, |
| 670 | { 3, META_LOOKAHEADNOT }, |
| 671 | { 3, META_LOOKBEHINDNOT }, |
| 672 | { 18, META_LOOKAHEAD }, |
| 673 | { 19, META_LOOKBEHIND }, |
| 674 | { 29, META_LOOKAHEAD_NA }, |
| 675 | { 30, META_LOOKBEHIND_NA }, |
| 676 | { 18, META_LOOKAHEADNOT }, |
| 677 | { 19, META_LOOKBEHINDNOT }, |
| 678 | { 6, META_ATOMIC }, |
| 679 | { 2, META_SCRIPT_RUN }, /* sr = script run */ |
| 680 | { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */ |
| 681 | { 10, META_SCRIPT_RUN }, /* script run */ |
| 682 | { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */ |
| 683 | }; |
| 684 | |
| 685 | static const int alascount = sizeof(alasmeta)/sizeof(alasitem); |
| 686 | |
| 687 | /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */ |
| 688 | |
| 689 | static uint32_t chartypeoffset[] = { |
| 690 | OP_STAR - OP_STAR, OP_STARI - OP_STAR, |
| 691 | OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR }; |
| 692 | |
| 693 | /* Tables of names of POSIX character classes and their lengths. The names are |
| 694 | now all in a single string, to reduce the number of relocations when a shared |
| 695 | library is dynamically loaded. The list of lengths is terminated by a zero |
| 696 | length entry. The first three must be alpha, lower, upper, as this is assumed |
| 697 | for handling case independence. The indices for graph, print, and punct are |
| 698 | needed, so identify them. */ |
| 699 | |
| 700 | static const char posix_names[] = |
| 701 | STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 |
| 702 | STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 |
| 703 | STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 |
| 704 | STRING_word0 STRING_xdigit; |
| 705 | |
/* Length of each name in posix_names, in the same order. A zero entry marks
the end of the list. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5,    /* alpha, lower, upper, alnum */
  5, 5, 5, 5,    /* ascii, blank, cntrl, digit */
  5, 5, 5, 5,    /* graph, print, punct, space */
  4, 6,          /* word, xdigit */
  0 };           /* end of list */

/* Positions of graph, print, and punct in the list of POSIX class names. */

#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
| 712 | |
| 713 | /* Table of class bit maps for each POSIX class. Each class is formed from a |
| 714 | base map, with an optional addition or removal of another map. Then, for some |
| 715 | classes, there is some additional tweaking: for [:blank:] the vertical space |
| 716 | characters are removed, and for [:alpha:] and [:alnum:] the underscore |
| 717 | character is removed. The triples in the table consist of the base map offset, |
| 718 | second map offset or -1 if no second map, and a non-negative value for map |
| 719 | addition or a negative value for map subtraction (if there are two maps). The |
| 720 | absolute value of the third field has these meanings: 0 => no tweaking, 1 => |
| 721 | remove vertical space characters, 2 => remove underscore. */ |
| 722 | |
| 723 | static const int posix_class_maps[] = { |
| 724 | cbit_word, cbit_digit, -2, /* alpha */ |
| 725 | cbit_lower, -1, 0, /* lower */ |
| 726 | cbit_upper, -1, 0, /* upper */ |
| 727 | cbit_word, -1, 2, /* alnum - word without underscore */ |
| 728 | cbit_print, cbit_cntrl, 0, /* ascii */ |
| 729 | cbit_space, -1, 1, /* blank - a GNU extension */ |
| 730 | cbit_cntrl, -1, 0, /* cntrl */ |
| 731 | cbit_digit, -1, 0, /* digit */ |
| 732 | cbit_graph, -1, 0, /* graph */ |
| 733 | cbit_print, -1, 0, /* print */ |
| 734 | cbit_punct, -1, 0, /* punct */ |
| 735 | cbit_space, -1, 0, /* space */ |
| 736 | cbit_word, -1, 0, /* word - a Perl extension */ |
| 737 | cbit_xdigit,-1, 0 /* xdigit */ |
| 738 | }; |
| 739 | |
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names. There are two values per class. A
first value of -1 marks a class that has no property substitute (see the
per-row notes); otherwise the pair is a PT_ property type followed by its
value.

Note on "space": Xps is POSIX space, but from 8.34 Perl and POSIX space are
the same.

The table is never written to, so it is const. */

static const int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space */
  PT_WORD, 0,       /* word */
  -1, 0             /* xdigit, treat as non-UCP */
};

/* Number of classes (pairs) in the table. The divisor is taken from the
table's own element type rather than from an unrelated type. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
| 763 | |
/* Masks for checking option settings. The LITERAL masks list the only options
that are allowed when PCRE2_LITERAL is set; the full masks add the remaining
public options to those. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE| \
   PCRE2_LITERAL| \
   PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY)

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE| \
   PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
   PCRE2_EXTRA_ALT_BSUX| \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
| 789 | |
| 790 | /* Compile time error code numbers. They are given names so that they can more |
| 791 | easily be tracked. When a new number is added, the tables called eint1 and |
| 792 | eint2 in pcre2posix.c may need to be updated, and a new error text must be |
| 793 | added to compile_error_texts in pcre2_error.c. Also, the error codes in |
| 794 | pcre2.h.in must be updated - their values are exactly 100 greater than these |
| 795 | values. */ |
| 796 | |
| 797 | enum { ERR0 = COMPILE_ERROR_BASE, |
| 798 | ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, |
| 799 | ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, |
| 800 | ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, |
| 801 | ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, |
| 802 | ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, |
| 803 | ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, |
| 804 | ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, |
| 805 | ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, |
| 806 | ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90, |
| 807 | ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 }; |
| 808 | |
/* Types for the table of start-of-pattern options such as (*UTF) and settings
such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported.

Each table entry has a type, which says how its value is to be interpreted. */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD     /* Read integer value for depth limit */
};

typedef struct pso {
  const uint8_t *name;     /* Text of the item */
  uint16_t length;         /* Length of the text */
  uint16_t type;           /* One of the PSO_ values above */
  uint32_t value;          /* Interpreted according to type */
} pso;
| 828 | |
| 829 | /* NB: STRING_UTFn_RIGHTPAR contains the length as well */ |
| 830 | |
| 831 | static pso pso_list[] = { |
| 832 | { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, |
| 833 | { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, |
| 834 | { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, |
| 835 | { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, |
| 836 | { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, |
| 837 | { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, |
| 838 | { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, |
| 839 | { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, |
| 840 | { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, |
| 841 | { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 }, |
| 842 | { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, |
| 843 | { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 }, |
| 844 | { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 }, |
| 845 | { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, |
| 846 | { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, |
| 847 | { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF }, |
| 848 | { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, |
| 849 | { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL }, |
| 850 | { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, |
| 851 | { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, |
| 852 | { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } |
| 853 | }; |
| 854 | |
| 855 | /* This table is used when converting repeating opcodes into possessified |
| 856 | versions as a result of an explicit possessive quantifier such as ++. A zero |
| 857 | value means there is no possessified version - in those cases the item in |
| 858 | question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT |
| 859 | because all relevant opcodes are less than that. */ |
| 860 | |
| 861 | static const uint8_t opcode_possessify[] = { |
| 862 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ |
| 863 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ |
| 864 | |
| 865 | 0, /* NOTI */ |
| 866 | OP_POSSTAR, 0, /* STAR, MINSTAR */ |
| 867 | OP_POSPLUS, 0, /* PLUS, MINPLUS */ |
| 868 | OP_POSQUERY, 0, /* QUERY, MINQUERY */ |
| 869 | OP_POSUPTO, 0, /* UPTO, MINUPTO */ |
| 870 | 0, /* EXACT */ |
| 871 | 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ |
| 872 | |
| 873 | OP_POSSTARI, 0, /* STARI, MINSTARI */ |
| 874 | OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ |
| 875 | OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ |
| 876 | OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ |
| 877 | 0, /* EXACTI */ |
| 878 | 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ |
| 879 | |
| 880 | OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ |
| 881 | OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ |
| 882 | OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ |
| 883 | OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ |
| 884 | 0, /* NOTEXACT */ |
| 885 | 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ |
| 886 | |
| 887 | OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ |
| 888 | OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ |
| 889 | OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ |
| 890 | OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ |
| 891 | 0, /* NOTEXACTI */ |
| 892 | 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ |
| 893 | |
| 894 | OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ |
| 895 | OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */ |
| 896 | OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ |
| 897 | OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */ |
| 898 | 0, /* TYPEEXACT */ |
| 899 | 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ |
| 900 | |
| 901 | OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ |
| 902 | OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ |
| 903 | OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ |
| 904 | OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ |
| 905 | 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */ |
| 906 | |
| 907 | 0, 0, 0, /* CLASS, NCLASS, XCLASS */ |
| 908 | 0, 0, /* REF, REFI */ |
| 909 | 0, 0, /* DNREF, DNREFI */ |
| 910 | 0, 0 /* RECURSE, CALLOUT */ |
| 911 | }; |
| 912 | |
| 913 | |
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. It writes one line to stderr for each item in the parsed
pattern, starting with the item's index and its raw value in hex. It stops
after META_END, or when it meets a META code that it does not know about.

The conversion specifications match the argument types: %u for uint32_t
values and %zu for PCRE2_SIZE offsets (an unsigned size type), with an explicit
(int) cast for values printed with %c.

Argument:   cb    the compile block, whose parsed_pattern field is shown
Returns:    nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; only the printable ASCII
  ones are shown as characters. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)(*pptr));
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers less than 10 the offset is taken from the compile
    block instead of from the parsed pattern. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      /* The following item holds the property type in its top 16 bits and
      the property value in its bottom 16 bits. */

      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data items follow: two values describing the next pattern item,
    and then the callout number. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the number is stored after the offset, so it is picked up from
    pptr[SIZEOFFSET] before the offset is read, and stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data items follow: zero for "=" or non-zero for ">=", then the
    two parts of the version number. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that have an argument share the code that shows it: a length
    followed by that many code units. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }

/* Control never gets here: the loop is left only by the returns above. */
}
#endif  /* DEBUG_SHOW_PARSED */
| 1183 | |
| 1184 | |
| 1185 | |
| 1186 | /************************************************* |
| 1187 | * Copy compiled code * |
| 1188 | *************************************************/ |
| 1189 | |
| 1190 | /* Compiled JIT code cannot be copied, so the new compiled block has no |
| 1191 | associated JIT data. */ |
| 1192 | |
| 1193 | PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION |
| 1194 | pcre2_code_copy(const pcre2_code *code) |
| 1195 | { |
| 1196 | PCRE2_SIZE* ref_count; |
| 1197 | pcre2_code *newcode; |
| 1198 | |
| 1199 | if (code == NULL) return NULL; |
| 1200 | newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); |
| 1201 | if (newcode == NULL) return NULL; |
| 1202 | memcpy(newcode, code, code->blocksize); |
| 1203 | newcode->executable_jit = NULL; |
| 1204 | |
| 1205 | /* If the code is one that has been deserialized, increment the reference count |
| 1206 | in the decoded tables. */ |
| 1207 | |
| 1208 | if ((code->flags & PCRE2_DEREF_TABLES) != 0) |
| 1209 | { |
| 1210 | ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); |
| 1211 | (*ref_count)++; |
| 1212 | } |
| 1213 | |
| 1214 | return newcode; |
| 1215 | } |
| 1216 | |
| 1217 | |
| 1218 | |
| 1219 | /************************************************* |
| 1220 | * Copy compiled code and character tables * |
| 1221 | *************************************************/ |
| 1222 | |
| 1223 | /* Compiled JIT code cannot be copied, so the new compiled block has no |
| 1224 | associated JIT data. This version of code_copy also makes a separate copy of |
| 1225 | the character tables. */ |
| 1226 | |
| 1227 | PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION |
| 1228 | pcre2_code_copy_with_tables(const pcre2_code *code) |
| 1229 | { |
| 1230 | PCRE2_SIZE* ref_count; |
| 1231 | pcre2_code *newcode; |
| 1232 | uint8_t *newtables; |
| 1233 | |
| 1234 | if (code == NULL) return NULL; |
| 1235 | newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); |
| 1236 | if (newcode == NULL) return NULL; |
| 1237 | memcpy(newcode, code, code->blocksize); |
| 1238 | newcode->executable_jit = NULL; |
| 1239 | |
| 1240 | newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), |
| 1241 | code->memctl.memory_data); |
| 1242 | if (newtables == NULL) |
| 1243 | { |
| 1244 | code->memctl.free((void *)newcode, code->memctl.memory_data); |
| 1245 | return NULL; |
| 1246 | } |
| 1247 | memcpy(newtables, code->tables, TABLES_LENGTH); |
| 1248 | ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH); |
| 1249 | *ref_count = 1; |
| 1250 | |
| 1251 | newcode->tables = newtables; |
| 1252 | newcode->flags |= PCRE2_DEREF_TABLES; |
| 1253 | return newcode; |
| 1254 | } |
| 1255 | |
| 1256 | |
| 1257 | |
| 1258 | /************************************************* |
| 1259 | * Free compiled code * |
| 1260 | *************************************************/ |
| 1261 | |
| 1262 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION |
| 1263 | pcre2_code_free(pcre2_code *code) |
| 1264 | { |
| 1265 | PCRE2_SIZE* ref_count; |
| 1266 | |
| 1267 | if (code != NULL) |
| 1268 | { |
| 1269 | if (code->executable_jit != NULL) |
| 1270 | PRIV(jit_free)(code->executable_jit, &code->memctl); |
| 1271 | |
| 1272 | if ((code->flags & PCRE2_DEREF_TABLES) != 0) |
| 1273 | { |
| 1274 | /* Decoded tables belong to the codes after deserialization, and they must |
| 1275 | be freed when there are no more references to them. The *ref_count should |
| 1276 | always be > 0. */ |
| 1277 | |
| 1278 | ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); |
| 1279 | if (*ref_count > 0) |
| 1280 | { |
| 1281 | (*ref_count)--; |
| 1282 | if (*ref_count == 0) |
| 1283 | code->memctl.free((void *)code->tables, code->memctl.memory_data); |
| 1284 | } |
| 1285 | } |
| 1286 | |
| 1287 | code->memctl.free(code, code->memctl.memory_data); |
| 1288 | } |
| 1289 | } |
| 1290 | |
| 1291 | |
| 1292 | |
| 1293 | /************************************************* |
| 1294 | * Read a number, possibly signed * |
| 1295 | *************************************************/ |
| 1296 | |
| 1297 | /* This function is used to read numbers in the pattern. The initial pointer |
| 1298 | must be the sign or first digit of the number. When relative values (introduced |
| 1299 | by + or -) are allowed, they are relative group numbers, and the result must be |
| 1300 | greater than zero. |
| 1301 | |
| 1302 | Arguments: |
| 1303 | ptrptr points to the character pointer variable |
| 1304 | ptrend points to the end of the input string |
| 1305 | allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this |
| 1306 | max_value the largest number allowed |
| 1307 | max_error the error to give for an over-large number |
| 1308 | intptr where to put the result |
| 1309 | errcodeptr where to put an error code |
| 1310 | |
| 1311 | Returns: TRUE - a number was read |
| 1312 | FALSE - errorcode == 0 => no number was found |
| 1313 | errorcode != 0 => an error occurred |
| 1314 | */ |
| 1315 | |
| 1316 | static BOOL |
| 1317 | read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign, |
| 1318 | uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr) |
| 1319 | { |
| 1320 | int sign = 0; |
| 1321 | uint32_t n = 0; |
| 1322 | PCRE2_SPTR ptr = *ptrptr; |
| 1323 | BOOL yield = FALSE; |
| 1324 | |
| 1325 | *errorcodeptr = 0; |
| 1326 | |
| 1327 | if (allow_sign >= 0 && ptr < ptrend) |
| 1328 | { |
| 1329 | if (*ptr == CHAR_PLUS) |
| 1330 | { |
| 1331 | sign = +1; |
| 1332 | max_value -= allow_sign; |
| 1333 | ptr++; |
| 1334 | } |
| 1335 | else if (*ptr == CHAR_MINUS) |
| 1336 | { |
| 1337 | sign = -1; |
| 1338 | ptr++; |
| 1339 | } |
| 1340 | } |
| 1341 | |
| 1342 | if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE; |
| 1343 | while (ptr < ptrend && IS_DIGIT(*ptr)) |
| 1344 | { |
| 1345 | n = n * 10 + *ptr++ - CHAR_0; |
| 1346 | if (n > max_value) |
| 1347 | { |
| 1348 | *errorcodeptr = max_error; |
| 1349 | goto EXIT; |
| 1350 | } |
| 1351 | } |
| 1352 | |
| 1353 | if (allow_sign >= 0 && sign != 0) |
| 1354 | { |
| 1355 | if (n == 0) |
| 1356 | { |
| 1357 | *errorcodeptr = ERR26; /* +0 and -0 are not allowed */ |
| 1358 | goto EXIT; |
| 1359 | } |
| 1360 | |
| 1361 | if (sign > 0) n += allow_sign; |
| 1362 | else if ((int)n > allow_sign) |
| 1363 | { |
| 1364 | *errorcodeptr = ERR15; /* Non-existent subpattern */ |
| 1365 | goto EXIT; |
| 1366 | } |
| 1367 | else n = allow_sign + 1 - n; |
| 1368 | } |
| 1369 | |
| 1370 | yield = TRUE; |
| 1371 | |
| 1372 | EXIT: |
| 1373 | *intptr = n; |
| 1374 | *ptrptr = ptr; |
| 1375 | return yield; |
| 1376 | } |
| 1377 | |
| 1378 | |
| 1379 | |
| 1380 | /************************************************* |
| 1381 | * Read repeat counts * |
| 1382 | *************************************************/ |
| 1383 | |
| 1384 | /* Read an item of the form {n,m} and return the values if non-NULL pointers |
| 1385 | are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a |
| 1386 | larger value is used for "unlimited". We have to use signed arguments for |
| 1387 | read_number() because it is capable of returning a signed value. |
| 1388 | |
| 1389 | Arguments: |
| 1390 | ptrptr points to pointer to character after'{' |
| 1391 | ptrend pointer to end of input |
| 1392 | minp if not NULL, pointer to int for min |
| 1393 | maxp if not NULL, pointer to int for max (-1 if no max) |
| 1394 | returned as -1 if no max |
| 1395 | errorcodeptr points to error code variable |
| 1396 | |
| 1397 | Returns: FALSE if not a repeat quantifier, errorcode set zero |
| 1398 | FALSE on error, with errorcode set non-zero |
| 1399 | TRUE on success, with pointer updated to point after '}' |
| 1400 | */ |
| 1401 | |
| 1402 | static BOOL |
| 1403 | read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp, |
| 1404 | uint32_t *maxp, int *errorcodeptr) |
| 1405 | { |
| 1406 | PCRE2_SPTR p; |
| 1407 | BOOL yield = FALSE; |
| 1408 | BOOL had_comma = FALSE; |
| 1409 | int32_t min = 0; |
| 1410 | int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */ |
| 1411 | |
| 1412 | /* Check the syntax */ |
| 1413 | |
| 1414 | *errorcodeptr = 0; |
| 1415 | for (p = *ptrptr;; p++) |
| 1416 | { |
| 1417 | uint32_t c; |
| 1418 | if (p >= ptrend) return FALSE; |
| 1419 | c = *p; |
| 1420 | if (IS_DIGIT(c)) continue; |
| 1421 | if (c == CHAR_RIGHT_CURLY_BRACKET) break; |
| 1422 | if (c == CHAR_COMMA) |
| 1423 | { |
| 1424 | if (had_comma) return FALSE; |
| 1425 | had_comma = TRUE; |
| 1426 | } |
| 1427 | else return FALSE; |
| 1428 | } |
| 1429 | |
| 1430 | /* The only error from read_number() is for a number that is too big. */ |
| 1431 | |
| 1432 | p = *ptrptr; |
| 1433 | if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr)) |
| 1434 | goto EXIT; |
| 1435 | |
| 1436 | if (*p == CHAR_RIGHT_CURLY_BRACKET) |
| 1437 | { |
| 1438 | p++; |
| 1439 | max = min; |
| 1440 | } |
| 1441 | else |
| 1442 | { |
| 1443 | if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) |
| 1444 | { |
| 1445 | if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, |
| 1446 | errorcodeptr)) |
| 1447 | goto EXIT; |
| 1448 | if (max < min) |
| 1449 | { |
| 1450 | *errorcodeptr = ERR4; |
| 1451 | goto EXIT; |
| 1452 | } |
| 1453 | } |
| 1454 | p++; |
| 1455 | } |
| 1456 | |
| 1457 | yield = TRUE; |
| 1458 | if (minp != NULL) *minp = (uint32_t)min; |
| 1459 | if (maxp != NULL) *maxp = (uint32_t)max; |
| 1460 | |
| 1461 | /* Update the pattern pointer */ |
| 1462 | |
| 1463 | EXIT: |
| 1464 | *ptrptr = p; |
| 1465 | return yield; |
| 1466 | } |
| 1467 | |
| 1468 | |
| 1469 | |
| 1470 | /************************************************* |
| 1471 | * Handle escapes * |
| 1472 | *************************************************/ |
| 1473 | |
| 1474 | /* This function is called when a \ has been encountered. It either returns a |
| 1475 | positive value for a simple escape such as \d, or 0 for a data character, which |
| 1476 | is placed in chptr. A backreference to group n is returned as negative n. On |
| 1477 | entry, ptr is pointing at the character after \. On exit, it points after the |
| 1478 | final code unit of the escape sequence. |
| 1479 | |
| 1480 | This function is also called from pcre2_substitute() to handle escape sequences |
| 1481 | in replacement strings. In this case, the cb argument is NULL, and in the case |
| 1482 | of escapes that have further processing, only sequences that define a data |
| 1483 | character are recognised. The isclass argument is not relevant; the options |
| 1484 | argument is the final value of the compiled pattern's options. |
| 1485 | |
| 1486 | Arguments: |
| 1487 | ptrptr points to the input position pointer |
| 1488 | ptrend points to the end of the input |
| 1489 | chptr points to a returned data character |
| 1490 | errorcodeptr points to the errorcode variable (containing zero) |
| 1491 | options the current options bits |
| 1492 | isclass TRUE if inside a character class |
| 1493 | cb compile data block or NULL when called from pcre2_substitute() |
| 1494 | |
| 1495 | Returns: zero => a data character |
| 1496 | positive => a special escape sequence |
| 1497 | negative => a numerical back reference |
| 1498 | on error, errorcodeptr is set non-zero |
| 1499 | */ |
| 1500 | |
/* Decode the escape sequence that follows a backslash. On the normal exit path
the input pointer is written back through ptrptr, the character value is stored
in *chptr, and the escape code (zero, a positive ESC_ value, or a negated group
number) is returned. Three paths return 0 early without storing *chptr: a
backslash at the very end of the input (ERR1), a sequence other than \c, \o or
\x when cb is NULL (ERR3), and an unrecognized alphanumeric (ERR3, with *ptrptr
set to the offending character). The extra_options argument is tested here for
PCRE2_EXTRA_ESCAPED_CR_IS_LF, PCRE2_EXTRA_ALT_BSUX and
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. */
| 1501 | int |
| 1502 | PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, |
| 1503 | int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass, |
| 1504 | compile_block *cb) |
| 1505 | { |
| 1506 | BOOL utf = (options & PCRE2_UTF) != 0; |
| 1507 | PCRE2_SPTR ptr = *ptrptr; |
| 1508 | uint32_t c, cc; |
| 1509 | int escape = 0; |
| 1510 | int i; |
| 1511 | |
| 1512 | /* If backslash is at the end of the string, it's an error. */ |
| 1513 | |
| 1514 | if (ptr >= ptrend) |
| 1515 | { |
| 1516 | *errorcodeptr = ERR1; |
| 1517 | return 0; |
| 1518 | } |
| 1519 | |
| 1520 | GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
| 1521 | *errorcodeptr = 0; /* Be optimistic */ |
| 1522 | |
| 1523 | /* Non-alphanumerics are literals, so we just leave the value in c. An initial |
| 1524 | value test saves a memory lookup for code points outside the alphanumeric |
| 1525 | range. */ |
| 1526 | |
| 1527 | if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ |
| 1528 | |
| 1529 | /* Otherwise, do a table lookup. Non-zero values need little processing here. A |
| 1530 | positive value is a literal value for something like \n. A negative value is |
| 1531 | the negation of one of the ESC_ macros that is passed back for handling by the |
| 1532 | calling function. Some extra checking is needed for \N because only \N{U+dddd} |
| 1533 | is supported. If the value is zero, further processing is handled below. */ |
| 1534 | |
| 1535 | else if ((i = escapes[c - ESCAPES_FIRST]) != 0) |
| 1536 | { |
| 1537 | if (i > 0) |
| 1538 | { |
| 1539 | c = (uint32_t)i; |
| 1540 | if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0) |
| 1541 | c = CHAR_LF; |
| 1542 | } |
| 1543 | else /* Negative table entry */ |
| 1544 | { |
| 1545 | escape = -i; /* Else return a special escape */ |
| 1546 | if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X)) |
| 1547 | cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ |
| 1548 | |
| 1549 | /* Perl supports \N{name} for character names and \N{U+dddd} for numerical |
| 1550 | Unicode code points, as well as plain \N for "not newline". PCRE does not |
| 1551 | support \N{name}. However, it does support quantification such as \N{2,3}, |
| 1552 | so if \N{ is not followed by U+dddd we check for a quantifier. */ |
| 1553 | |
| 1554 | if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) |
| 1555 | { |
| 1556 | PCRE2_SPTR p = ptr + 1; |
| 1557 | |
| 1558 | /* \N{U+ can be handled by the \x{ code. However, this construction is |
| 1559 | not valid in EBCDIC environments because it specifies a Unicode |
| 1560 | character, not a codepoint in the local code. For example \N{U+0041} |
| 1561 | must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode |
| 1562 | casing semantics for the entire pattern, so allow it only in UTF (i.e. |
| 1563 | Unicode) mode. */ |
| 1564 | |
| 1565 | if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS) |
| 1566 | { |
| 1567 | #ifdef EBCDIC |
| 1568 | *errorcodeptr = ERR93; |
| 1569 | #else |
| 1570 | if (utf) |
| 1571 | { |
| 1572 | ptr = p + 1; |
| 1573 | escape = 0; /* Not a fancy escape after all */ |
| 1574 | goto COME_FROM_NU; |
| 1575 | } |
| 1576 | else *errorcodeptr = ERR93; |
| 1577 | #endif |
| 1578 | } |
| 1579 | |
| 1580 | /* Give an error if what follows is not a quantifier, but don't override |
| 1581 | an error set by the quantifier reader (e.g. number overflow). */ |
| 1582 | |
| 1583 | else |
| 1584 | { |
| 1585 | if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) && |
| 1586 | *errorcodeptr == 0) |
| 1587 | *errorcodeptr = ERR37; |
| 1588 | } |
| 1589 | } |
| 1590 | } |
| 1591 | } |
| 1592 | |
| 1593 | /* Escapes that need further processing, including those that are unknown, have |
| 1594 | a zero entry in the lookup table. When called from pcre2_substitute(), only \c, |
| 1595 | \o, and \x are recognized (\u and \U can never appear as they are used for case |
| 1596 | forcing). */ |
| 1597 | |
| 1598 | else |
| 1599 | { |
| 1600 | int s; |
| 1601 | PCRE2_SPTR oldptr; |
| 1602 | BOOL overflow; |
| 1603 | BOOL alt_bsux = |
| 1604 | ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0; |
| 1605 | |
| 1606 | /* Filter calls from pcre2_substitute(). */ |
| 1607 | |
| 1608 | if (cb == NULL) |
| 1609 | { |
| 1610 | if (c != CHAR_c && c != CHAR_o && c != CHAR_x) |
| 1611 | { |
| 1612 | *errorcodeptr = ERR3; |
| 1613 | return 0; |
| 1614 | } |
| 1615 | alt_bsux = FALSE; /* Do not modify \x handling */ |
| 1616 | } |
| 1617 | |
| 1618 | switch (c) |
| 1619 | { |
| 1620 | /* A number of Perl escapes are not handled by PCRE. We give an explicit |
| 1621 | error. */ |
| 1622 | |
| 1623 | case CHAR_F: |
| 1624 | case CHAR_l: |
| 1625 | case CHAR_L: |
| 1626 | *errorcodeptr = ERR37; |
| 1627 | break; |
| 1628 | |
| 1629 | /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX |
| 1630 | is set. Otherwise, \u must be followed by exactly four hex digits or, if |
| 1631 | PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces. |
| 1632 | Otherwise it is a lowercase u letter. This gives some compatibility with |
| 1633 | ECMAScript (aka JavaScript). */ |
| 1634 | |
| 1635 | case CHAR_u: |
| 1636 | if (!alt_bsux) *errorcodeptr = ERR37; else |
| 1637 | { |
| 1638 | uint32_t xc; |
| 1639 | |
| 1640 | if (ptr >= ptrend) break; |
| 1641 | if (*ptr == CHAR_LEFT_CURLY_BRACKET && |
| 1642 | (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0) |
| 1643 | { |
| 1644 | PCRE2_SPTR hptr = ptr + 1; |
| 1645 | cc = 0; |
| 1646 | |
| 1647 | while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff) |
| 1648 | { |
| 1649 | if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */ |
| 1650 | { |
| 1651 | *errorcodeptr = ERR77; |
| 1652 | ptr = hptr; /* Show where */ |
| 1653 | break; /* *hptr != } will cause another break below */ |
| 1654 | } |
| 1655 | cc = (cc << 4) | xc; |
| 1656 | hptr++; |
| 1657 | } |
| 1658 | |
| 1659 | if (hptr == ptr + 1 || /* No hex digits */ |
| 1660 | hptr >= ptrend || /* Hit end of input */ |
| 1661 | *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */ |
| 1662 | break; /* Hex escape not recognized */ |
| 1663 | |
| 1664 | c = cc; /* Accept the code point */ |
| 1665 | ptr = hptr + 1; |
| 1666 | } |
| 1667 | |
| 1668 | else /* Must be exactly 4 hex digits */ |
| 1669 | { |
| 1670 | if (ptrend - ptr < 4) break; /* Less than 4 chars */ |
| 1671 | if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ |
| 1672 | if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ |
| 1673 | cc = (cc << 4) | xc; |
| 1674 | if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ |
| 1675 | cc = (cc << 4) | xc; |
| 1676 | if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ |
| 1677 | c = (cc << 4) | xc; |
| 1678 | ptr += 4; |
| 1679 | } |
| 1680 | |
| 1681 | if (utf) |
| 1682 | { |
| 1683 | if (c > 0x10ffffU) *errorcodeptr = ERR77; |
| 1684 | else |
| 1685 | if (c >= 0xd800 && c <= 0xdfff && |
| 1686 | (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) |
| 1687 | *errorcodeptr = ERR73; |
| 1688 | } |
| 1689 | else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; |
| 1690 | } |
| 1691 | break; |
| 1692 | |
| 1693 | /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, |
| 1694 | in which case it is an upper case letter. */ |
| 1695 | |
| 1696 | case CHAR_U: |
| 1697 | if (!alt_bsux) *errorcodeptr = ERR37; |
| 1698 | break; |
| 1699 | |
| 1700 | /* In a character class, \g is just a literal "g". Outside a character |
| 1701 | class, \g must be followed by one of a number of specific things: |
| 1702 | |
| 1703 | (1) A number, either plain or braced. If positive, it is an absolute |
| 1704 | backreference. If negative, it is a relative backreference. This is a Perl |
| 1705 | 5.10 feature. |
| 1706 | |
| 1707 | (2) Perl 5.10 also supports \g{name} as a reference to a named group. This |
| 1708 | is part of Perl's movement towards a unified syntax for back references. As |
| 1709 | this is synonymous with \k{name}, we fudge it up by pretending it really |
| 1710 | was \k{name}. |
| 1711 | |
| 1712 | (3) For Oniguruma compatibility we also support \g followed by a name or a |
| 1713 | number either in angle brackets or in single quotes. However, these are |
| 1714 | (possibly recursive) subroutine calls, _not_ backreferences. We return |
| 1715 | the ESC_g code. |
| 1716 | |
| 1717 | Summary: Return a negative number for a numerical back reference, ESC_k for |
| 1718 | a named back reference, and ESC_g for a named or numbered subroutine call. |
| 1719 | */ |
| 1720 | |
| 1721 | case CHAR_g: |
| 1722 | if (isclass) break; |
| 1723 | |
| 1724 | if (ptr >= ptrend) |
| 1725 | { |
| 1726 | *errorcodeptr = ERR57; |
| 1727 | break; |
| 1728 | } |
| 1729 | |
| 1730 | if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE) |
| 1731 | { |
| 1732 | escape = ESC_g; |
| 1733 | break; |
| 1734 | } |
| 1735 | |
| 1736 | /* If there is a brace delimiter, try to read a numerical reference. If |
| 1737 | there isn't one, assume we have a name and treat it as \k. */ |
| 1738 | |
| 1739 | if (*ptr == CHAR_LEFT_CURLY_BRACKET) |
| 1740 | { |
| 1741 | PCRE2_SPTR p = ptr + 1; |
| 1742 | if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, |
| 1743 | errorcodeptr)) |
| 1744 | { |
| 1745 | if (*errorcodeptr == 0) escape = ESC_k; /* No number found */ |
| 1746 | break; |
| 1747 | } |
| 1748 | if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) |
| 1749 | { |
| 1750 | *errorcodeptr = ERR57; |
| 1751 | break; |
| 1752 | } |
| 1753 | ptr = p + 1; |
| 1754 | } |
| 1755 | |
| 1756 | /* Read an undelimited number */ |
| 1757 | |
| 1758 | else |
| 1759 | { |
| 1760 | if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, |
| 1761 | errorcodeptr)) |
| 1762 | { |
| 1763 | if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */ |
| 1764 | break; |
| 1765 | } |
| 1766 | } |
| 1767 | |
| 1768 | if (s <= 0) |
| 1769 | { |
| 1770 | *errorcodeptr = ERR15; |
| 1771 | break; |
| 1772 | } |
| 1773 | |
| 1774 | escape = -s; |
| 1775 | break; |
| 1776 | |
| 1777 | /* The handling of escape sequences consisting of a string of digits |
| 1778 | starting with one that is not zero is not straightforward. Perl has changed |
| 1779 | over the years. Nowadays \g{} for backreferences and \o{} for octal are |
| 1780 | recommended to avoid the ambiguities in the old syntax. |
| 1781 | |
| 1782 | Outside a character class, the digits are read as a decimal number. If the |
| 1783 | number is less than 10, or if there are that many previous extracting left |
| 1784 | brackets, it is a back reference. Otherwise, up to three octal digits are |
| 1785 | read to form an escaped character code. Thus \123 is likely to be octal 123 |
| 1786 | (cf \0123, which is octal 012 followed by the literal 3). |
| 1787 | |
| 1788 | Inside a character class, \ followed by a digit is always either a literal |
| 1789 | 8 or 9 or an octal number. */ |
| 1790 | |
| 1791 | case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: |
| 1792 | case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: |
| 1793 | |
| 1794 | if (!isclass) |
| 1795 | { |
| 1796 | oldptr = ptr; |
| 1797 | ptr--; /* Back to the digit */ |
| 1798 | |
| 1799 | /* As we know we are at a digit, the only possible error from |
| 1800 | read_number() is a number that is too large to be a group number. In this |
| 1801 | case we fall through handle this as not a group reference. If we have |
| 1802 | read a small enough number, check for a back reference. |
| 1803 | |
| 1804 | \1 to \9 are always back references. \8x and \9x are too; \1x to \7x |
| 1805 | are octal escapes if there are not that many previous captures. */ |
| 1806 | |
| 1807 | if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) && |
| 1808 | (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)) |
| 1809 | { |
| 1810 | if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61; |
| 1811 | else escape = -s; /* Indicates a back reference */ |
| 1812 | break; |
| 1813 | } |
| 1814 | |
| 1815 | ptr = oldptr; /* Put the pointer back and fall through */ |
| 1816 | } |
| 1817 | |
| 1818 | /* Handle a digit following \ when the number is not a back reference, or |
| 1819 | we are within a character class. If the first digit is 8 or 9, Perl used to |
| 1820 | generate a binary zero and then treat the digit as a following literal. At |
| 1821 | least by Perl 5.18 this changed so as not to insert the binary zero. */ |
| 1822 | |
| 1823 | if (c >= CHAR_8) break; |
| 1824 | |
| 1825 | /* Fall through */ |
| 1826 | |
| 1827 | /* \0 always starts an octal number, but we may drop through to here with a |
| 1828 | larger first octal digit. The original code used just to take the least |
| 1829 | significant 8 bits of octal numbers (I think this is what early Perls used |
| 1830 | to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, |
| 1831 | but no more than 3 octal digits. */ |
| 1832 | |
| 1833 | case CHAR_0: |
| 1834 | c -= CHAR_0; |
/* The loop below relies on i being zero here: this switch is reached only when
the escapes[] lookup that assigned i yielded zero, and nothing on the paths
leading to this case changes i. Hence at most two further octal digits are
consumed after the first one. */
| 1835 | while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) |
| 1836 | c = c * 8 + *ptr++ - CHAR_0; |
| 1837 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 1838 | if (!utf && c > 0xff) *errorcodeptr = ERR51; |
| 1839 | #endif |
| 1840 | break; |
| 1841 | |
| 1842 | /* \o is a relatively new Perl feature, supporting a more general way of |
| 1843 | specifying character codes in octal. The only supported form is \o{ddd}. */ |
| 1844 | |
| 1845 | case CHAR_o: |
| 1846 | if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET) |
| 1847 | { |
| 1848 | ptr--; |
| 1849 | *errorcodeptr = ERR55; |
| 1850 | } |
| 1851 | else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) |
| 1852 | *errorcodeptr = ERR78; |
| 1853 | else |
| 1854 | { |
| 1855 | c = 0; |
| 1856 | overflow = FALSE; |
| 1857 | while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) |
| 1858 | { |
| 1859 | cc = *ptr++; |
| 1860 | if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ |
| 1861 | #if PCRE2_CODE_UNIT_WIDTH == 32 |
| 1862 | if (c >= 0x20000000l) { overflow = TRUE; break; } |
| 1863 | #endif |
| 1864 | c = (c << 3) + (cc - CHAR_0); |
| 1865 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 1866 | if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } |
| 1867 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| 1868 | if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } |
| 1869 | #elif PCRE2_CODE_UNIT_WIDTH == 32 |
| 1870 | if (utf && c > 0x10ffffU) { overflow = TRUE; break; } |
| 1871 | #endif |
| 1872 | } |
| 1873 | if (overflow) |
| 1874 | { |
| 1875 | while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; |
| 1876 | *errorcodeptr = ERR34; |
| 1877 | } |
| 1878 | else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) |
| 1879 | { |
| 1880 | if (utf && c >= 0xd800 && c <= 0xdfff && |
| 1881 | (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) |
| 1882 | { |
| 1883 | ptr--; |
| 1884 | *errorcodeptr = ERR73; |
| 1885 | } |
| 1886 | } |
| 1887 | else |
| 1888 | { |
| 1889 | ptr--; |
| 1890 | *errorcodeptr = ERR64; |
| 1891 | } |
| 1892 | } |
| 1893 | break; |
| 1894 | |
| 1895 | /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed |
| 1896 | by two hexadecimal digits. Otherwise it is a lowercase x letter. */ |
| 1897 | |
| 1898 | case CHAR_x: |
| 1899 | if (alt_bsux) |
| 1900 | { |
| 1901 | uint32_t xc; |
| 1902 | if (ptrend - ptr < 2) break; /* Less than 2 characters */ |
| 1903 | if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ |
| 1904 | if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ |
| 1905 | c = (cc << 4) | xc; |
| 1906 | ptr += 2; |
| 1907 | } |
| 1908 | |
| 1909 | /* Handle \x in Perl's style. \x{ddd} is a character code which can be |
| 1910 | greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex |
| 1911 | digits. If not, { used to be treated as a data character. However, Perl |
| 1912 | seems to read hex digits up to the first non-such, and ignore the rest, so |
| 1913 | that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE |
| 1914 | now gives an error. */ |
| 1915 | |
| 1916 | else |
| 1917 | { |
| 1918 | if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) |
| 1919 | { |
| 1920 | #ifndef EBCDIC |
/* The \N{U+ code above jumps to this label with ptr pointing at the '+', so
the ++ptr below steps onto the first character after "U+", just as it steps
past the '{' for \x{. The jump bypasses the initialization of alt_bsux, which
is not read again after this point. */
| 1921 | COME_FROM_NU: |
| 1922 | #endif |
| 1923 | if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) |
| 1924 | { |
| 1925 | *errorcodeptr = ERR78; |
| 1926 | break; |
| 1927 | } |
| 1928 | c = 0; |
| 1929 | overflow = FALSE; |
| 1930 | |
| 1931 | while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff) |
| 1932 | { |
| 1933 | ptr++; |
| 1934 | if (c == 0 && cc == 0) continue; /* Leading zeroes */ |
| 1935 | #if PCRE2_CODE_UNIT_WIDTH == 32 |
| 1936 | if (c >= 0x10000000l) { overflow = TRUE; break; } |
| 1937 | #endif |
| 1938 | c = (c << 4) | cc; |
| 1939 | if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) |
| 1940 | { |
| 1941 | overflow = TRUE; |
| 1942 | break; |
| 1943 | } |
| 1944 | } |
| 1945 | |
| 1946 | if (overflow) |
| 1947 | { |
| 1948 | while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++; |
| 1949 | *errorcodeptr = ERR34; |
| 1950 | } |
| 1951 | else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) |
| 1952 | { |
| 1953 | if (utf && c >= 0xd800 && c <= 0xdfff && |
| 1954 | (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) |
| 1955 | { |
| 1956 | ptr--; |
| 1957 | *errorcodeptr = ERR73; |
| 1958 | } |
| 1959 | } |
| 1960 | |
| 1961 | /* If the sequence of hex digits does not end with '}', give an error. |
| 1962 | We used just to recognize this construct and fall through to the normal |
| 1963 | \x handling, but nowadays Perl gives an error, which seems much more |
| 1964 | sensible, so we do too. */ |
| 1965 | |
| 1966 | else |
| 1967 | { |
| 1968 | ptr--; |
| 1969 | *errorcodeptr = ERR67; |
| 1970 | } |
| 1971 | } /* End of \x{} processing */ |
| 1972 | |
| 1973 | /* Read up to two hex digits after \x */ |
| 1974 | |
| 1975 | else |
| 1976 | { |
| 1977 | c = 0; |
| 1978 | if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ |
| 1979 | ptr++; |
| 1980 | c = cc; |
| 1981 | if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ |
| 1982 | ptr++; |
| 1983 | c = (c << 4) | cc; |
| 1984 | } /* End of \xdd handling */ |
| 1985 | } /* End of Perl-style \x handling */ |
| 1986 | break; |
| 1987 | |
| 1988 | /* The handling of \c is different in ASCII and EBCDIC environments. In an |
| 1989 | ASCII (or Unicode) environment, an error is given if the character |
| 1990 | following \c is not a printable ASCII character. Otherwise, the following |
| 1991 | character is upper-cased if it is a letter, and after that the 0x40 bit is |
| 1992 | flipped. The result is the value of the escape. |
| 1993 | |
| 1994 | In an EBCDIC environment the handling of \c is compatible with the |
| 1995 | specification in the perlebcdic document. The following character must be |
| 1996 | a letter or one of small number of special characters. These provide a |
| 1997 | means of defining the character values 0-31. |
| 1998 | |
| 1999 | For testing the EBCDIC handling of \c in an ASCII environment, recognize |
| 2000 | the EBCDIC value of 'c' explicitly. */ |
| 2001 | |
| 2002 | #if defined EBCDIC && 'a' != 0x81 |
| 2003 | case 0x83: |
| 2004 | #else |
| 2005 | case CHAR_c: |
| 2006 | #endif |
| 2007 | if (ptr >= ptrend) |
| 2008 | { |
| 2009 | *errorcodeptr = ERR2; |
| 2010 | break; |
| 2011 | } |
| 2012 | c = *ptr; |
| 2013 | if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); |
| 2014 | |
| 2015 | /* Handle \c in an ASCII/Unicode environment. */ |
| 2016 | |
| 2017 | #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| 2018 | if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ |
| 2019 | { |
| 2020 | *errorcodeptr = ERR68; |
| 2021 | break; |
| 2022 | } |
| 2023 | c ^= 0x40; |
| 2024 | |
| 2025 | /* Handle \c in an EBCDIC environment. The special case \c? is converted to |
| 2026 | 255 (0xff) or 95 (0x5f) if other characters suggest we are using the |
| 2027 | POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.) |
| 2028 | The other valid sequences correspond to a list of specific characters. */ |
| 2029 | |
| 2030 | #else |
| 2031 | if (c == CHAR_QUESTION_MARK) |
| 2032 | c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; |
| 2033 | else |
| 2034 | { |
| 2035 | for (i = 0; i < 32; i++) |
| 2036 | { |
| 2037 | if (c == ebcdic_escape_c[i]) break; |
| 2038 | } |
| 2039 | if (i < 32) c = i; else *errorcodeptr = ERR68; |
| 2040 | } |
| 2041 | #endif /* EBCDIC */ |
| 2042 | |
| 2043 | ptr++; |
| 2044 | break; |
| 2045 | |
| 2046 | /* Any other alphanumeric following \ is an error. Perl gives an error only |
| 2047 | if in warning mode, but PCRE doesn't have a warning mode. */ |
| 2048 | |
| 2049 | default: |
| 2050 | *errorcodeptr = ERR3; |
| 2051 | *ptrptr = ptr - 1; /* Point to the character at fault */ |
| 2052 | return 0; |
| 2053 | } |
| 2054 | } |
| 2055 | |
| 2056 | /* Set the pointer to the next character before returning. */ |
| 2057 | |
| 2058 | *ptrptr = ptr; |
| 2059 | *chptr = c; |
| 2060 | return escape; |
| 2061 | } |
| 2062 | |
| 2063 | |
| 2064 | |
| 2065 | #ifdef SUPPORT_UNICODE |
| 2066 | /************************************************* |
| 2067 | * Handle \P and \p * |
| 2068 | *************************************************/ |
| 2069 | |
| 2070 | /* This function is called after \P or \p has been encountered, provided that |
| 2071 | PCRE2 is compiled with support for UTF and Unicode properties. On entry, the |
| 2072 | contents of ptrptr are pointing after the P or p. On exit, it is left pointing |
| 2073 | after the final code unit of the escape sequence. |
| 2074 | |
| 2075 | Arguments: |
| 2076 | ptrptr the pattern position pointer |
| 2077 | negptr a boolean that is set TRUE for negation else FALSE |
| 2078 | ptypeptr an unsigned int that is set to the type value |
| 2079 | pdataptr an unsigned int that is set to the detailed property value |
| 2080 | errorcodeptr the error code variable |
| 2081 | cb the compile data |
| 2082 | |
| 2083 | Returns: TRUE if the type value was found, or FALSE for an invalid type |
| 2084 | */ |
| 2085 | |
| 2086 | static BOOL |
| 2087 | get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr, |
| 2088 | uint16_t *pdataptr, int *errorcodeptr, compile_block *cb) |
| 2089 | { |
| 2090 | PCRE2_UCHAR c; |
| 2091 | PCRE2_SIZE i, bot, top; |
| 2092 | PCRE2_SPTR ptr = *ptrptr; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2093 | PCRE2_UCHAR name[50]; |
| 2094 | PCRE2_UCHAR *vptr = NULL; |
| 2095 | uint16_t ptscript = PT_NOTSCRIPT; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2096 | |
| 2097 | if (ptr >= cb->end_pattern) goto ERROR_RETURN; |
| 2098 | c = *ptr++; |
| 2099 | *negptr = FALSE; |
| 2100 | |
| 2101 | /* \P or \p can be followed by a name in {}, optionally preceded by ^ for |
| 2102 | negation. */ |
| 2103 | |
| 2104 | if (c == CHAR_LEFT_CURLY_BRACKET) |
| 2105 | { |
| 2106 | if (ptr >= cb->end_pattern) goto ERROR_RETURN; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2107 | |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2108 | if (*ptr == CHAR_CIRCUMFLEX_ACCENT) |
| 2109 | { |
| 2110 | *negptr = TRUE; |
| 2111 | ptr++; |
| 2112 | } |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2113 | |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2114 | for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) |
| 2115 | { |
| 2116 | if (ptr >= cb->end_pattern) goto ERROR_RETURN; |
| 2117 | c = *ptr++; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2118 | while (c == '_' || c == '-' || isspace(c)) |
| 2119 | { |
| 2120 | if (ptr >= cb->end_pattern) goto ERROR_RETURN; |
| 2121 | c = *ptr++; |
| 2122 | } |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2123 | if (c == CHAR_NUL) goto ERROR_RETURN; |
| 2124 | if (c == CHAR_RIGHT_CURLY_BRACKET) break; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2125 | name[i] = tolower(c); |
| 2126 | if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2127 | } |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2128 | |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2129 | if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; |
| 2130 | name[i] = 0; |
| 2131 | } |
| 2132 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2133 | /* If { doesn't follow \p or \P there is just one following character, which |
| 2134 | must be an ASCII letter. */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2135 | |
| 2136 | else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) |
| 2137 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2138 | name[0] = tolower(c); |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2139 | name[1] = 0; |
| 2140 | } |
| 2141 | else goto ERROR_RETURN; |
| 2142 | |
| 2143 | *ptrptr = ptr; |
| 2144 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2145 | /* If the property contains ':' or '=' we have class name and value separately |
| 2146 | specified. The following are supported: |
| 2147 | |
| 2148 | . Bidi_Class (synonym bc), for which the property names are "bidi<name>". |
| 2149 | . Script (synonym sc) for which the property name is the script name |
| 2150 | . Script_Extensions (synonym scx), ditto |
| 2151 | |
| 2152 | As this is a small number, we currently just check the names directly. If this |
| 2153 | grows, a sorted table and a switch will be neater. |
| 2154 | |
| 2155 | For both the script properties, set a PT_xxx value so that (1) they can be |
| 2156 | distinguished and (2) invalid script names that happen to be the name of |
| 2157 | another property can be diagnosed. */ |
| 2158 | |
| 2159 | if (vptr != NULL) |
| 2160 | { |
| 2161 | int offset = 0; |
| 2162 | PCRE2_UCHAR sname[8]; |
| 2163 | |
| 2164 | *vptr = 0; /* Terminate property name */ |
| 2165 | if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 || |
| 2166 | PRIV(strcmp_c8)(name, STRING_bc) == 0) |
| 2167 | { |
| 2168 | offset = 4; |
| 2169 | sname[0] = CHAR_b; |
| 2170 | sname[1] = CHAR_i; /* There is no strcpy_c8 function */ |
| 2171 | sname[2] = CHAR_d; |
| 2172 | sname[3] = CHAR_i; |
| 2173 | } |
| 2174 | |
| 2175 | else if (PRIV(strcmp_c8)(name, STRING_script) == 0 || |
| 2176 | PRIV(strcmp_c8)(name, STRING_sc) == 0) |
| 2177 | ptscript = PT_SC; |
| 2178 | |
| 2179 | else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 || |
| 2180 | PRIV(strcmp_c8)(name, STRING_scx) == 0) |
| 2181 | ptscript = PT_SCX; |
| 2182 | |
| 2183 | else |
| 2184 | { |
| 2185 | *errorcodeptr = ERR47; |
| 2186 | return FALSE; |
| 2187 | } |
| 2188 | |
| 2189 | /* Adjust the string in name[] as needed */ |
| 2190 | |
| 2191 | memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR)); |
| 2192 | if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR)); |
| 2193 | } |
| 2194 | |
| 2195 | /* Search for a recognized property using binary chop. */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2196 | |
| 2197 | bot = 0; |
| 2198 | top = PRIV(utt_size); |
| 2199 | |
| 2200 | while (bot < top) |
| 2201 | { |
| 2202 | int r; |
| 2203 | i = (bot + top) >> 1; |
| 2204 | r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2205 | |
| 2206 | /* When a matching property is found, some extra checking is needed when the |
| 2207 | \p{xx:yy} syntax is used and xx is either sc or scx. */ |
| 2208 | |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2209 | if (r == 0) |
| 2210 | { |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2211 | *pdataptr = PRIV(utt)[i].value; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2212 | if (vptr == NULL || ptscript == PT_NOTSCRIPT) |
| 2213 | { |
| 2214 | *ptypeptr = PRIV(utt)[i].type; |
| 2215 | return TRUE; |
| 2216 | } |
| 2217 | |
| 2218 | switch (PRIV(utt)[i].type) |
| 2219 | { |
| 2220 | case PT_SC: |
| 2221 | *ptypeptr = PT_SC; |
| 2222 | return TRUE; |
| 2223 | |
| 2224 | case PT_SCX: |
| 2225 | *ptypeptr = ptscript; |
| 2226 | return TRUE; |
| 2227 | } |
| 2228 | |
| 2229 | break; /* Non-script found */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2230 | } |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2231 | |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2232 | if (r > 0) bot = i + 1; else top = i; |
| 2233 | } |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 2234 | |
| 2235 | *errorcodeptr = ERR47; /* Unrecognized property */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 2236 | return FALSE; |
| 2237 | |
| 2238 | ERROR_RETURN: /* Malformed \P or \p */ |
| 2239 | *errorcodeptr = ERR46; |
| 2240 | *ptrptr = ptr; |
| 2241 | return FALSE; |
| 2242 | } |
| 2243 | #endif |
| 2244 | |
| 2245 | |
| 2246 | |
| 2247 | /************************************************* |
| 2248 | * Check for POSIX class syntax * |
| 2249 | *************************************************/ |
| 2250 | |
| 2251 | /* This function is called when the sequence "[:" or "[." or "[=" is |
| 2252 | encountered in a character class. It checks whether this is followed by a |
| 2253 | sequence of characters terminated by a matching ":]" or ".]" or "=]". If we |
| 2254 | reach an unescaped ']' without the special preceding character, return FALSE. |
| 2255 | |
| 2256 | Originally, this function only recognized a sequence of letters between the |
| 2257 | terminators, but it seems that Perl recognizes any sequence of characters, |
| 2258 | though of course unknown POSIX names are subsequently rejected. Perl gives an |
| 2259 | "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE |
| 2260 | didn't consider this to be a POSIX class. Likewise for [:1234:]. |
| 2261 | |
| 2262 | The problem in trying to be exactly like Perl is in the handling of escapes. We |
| 2263 | have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX |
| 2264 | class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code |
| 2265 | below handles the special cases \\ and \], but does not try to do any other |
| 2266 | escape processing. This makes it different from Perl for cases such as |
| 2267 | [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does |
| 2268 | not recognize "l\ower". This is a lesser evil than not diagnosing bad classes |
| 2269 | when Perl does, I think. |
| 2270 | |
| 2271 | A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. |
| 2272 | It seems that the appearance of a nested POSIX class supersedes an apparent |
| 2273 | external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or |
| 2274 | a digit. This is handled by returning FALSE if the start of a new group with |
| 2275 | the same terminator is encountered, since the next closing sequence must close |
| 2276 | the nested group, not the outer one. |
| 2277 | |
| 2278 | In Perl, unescaped square brackets may also appear as part of class names. For |
| 2279 | example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for |
| 2280 | [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not |
| 2281 | seem right at all. PCRE does not allow closing square brackets in POSIX class |
| 2282 | names. |
| 2283 | |
| 2284 | Arguments: |
| 2285 | ptr pointer to the character after the initial [ (colon, dot, equals) |
| 2286 | ptrend pointer to the end of the pattern |
| 2287 | endptr where to return a pointer to the terminating ':', '.', or '=' |
| 2288 | |
| 2289 | Returns: TRUE or FALSE |
| 2290 | */ |
| 2291 | |
| 2292 | static BOOL |
| 2293 | check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr) |
| 2294 | { |
| 2295 | PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ |
| 2296 | terminator = *ptr++; /* compiler warns about "non-constant" initializer. */ |
| 2297 | |
| 2298 | for (; ptrend - ptr >= 2; ptr++) |
| 2299 | { |
| 2300 | if (*ptr == CHAR_BACKSLASH && |
| 2301 | (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH)) |
| 2302 | ptr++; |
| 2303 | |
| 2304 | else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || |
| 2305 | *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; |
| 2306 | |
| 2307 | else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
| 2308 | { |
| 2309 | *endptr = ptr; |
| 2310 | return TRUE; |
| 2311 | } |
| 2312 | } |
| 2313 | |
| 2314 | return FALSE; |
| 2315 | } |
| 2316 | |
| 2317 | |
| 2318 | |
| 2319 | /************************************************* |
| 2320 | * Check POSIX class name * |
| 2321 | *************************************************/ |
| 2322 | |
| 2323 | /* This function is called to check the name given in a POSIX-style class entry |
| 2324 | such as [:alnum:]. |
| 2325 | |
| 2326 | Arguments: |
| 2327 | ptr points to the first letter |
| 2328 | len the length of the name |
| 2329 | |
| 2330 | Returns: a value representing the name, or -1 if unknown |
| 2331 | */ |
| 2332 | |
| 2333 | static int |
| 2334 | check_posix_name(PCRE2_SPTR ptr, int len) |
| 2335 | { |
| 2336 | const char *pn = posix_names; |
| 2337 | int yield = 0; |
| 2338 | while (posix_name_lengths[yield] != 0) |
| 2339 | { |
| 2340 | if (len == posix_name_lengths[yield] && |
| 2341 | PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; |
| 2342 | pn += posix_name_lengths[yield] + 1; |
| 2343 | yield++; |
| 2344 | } |
| 2345 | return -1; |
| 2346 | } |
| 2347 | |
| 2348 | |
| 2349 | |
| 2350 | /************************************************* |
| 2351 | * Read a subpattern or VERB name * |
| 2352 | *************************************************/ |
| 2353 | |
| 2354 | /* This function is called from parse_regex() below whenever it needs to read |
| 2355 | the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial |
| 2356 | pointer must be to the character before the name. If that character is '*' we |
| 2357 | are reading a verb or alpha assertion name. The pointer is updated to point |
| 2358 | after the name, for a VERB or alpha assertion name, or after the name's |
| 2359 | terminator for a subpattern name. Returning both the offset and the name |
| 2360 | pointer is redundant information, but some callers use one and some the other, |
| 2361 | so it is simplest just to return both. |
| 2362 | |
| 2363 | Arguments: |
| 2364 | ptrptr points to the character pointer variable |
| 2365 | ptrend points to the end of the input string |
| 2366 | utf true if the input is UTF-encoded |
| 2367 | terminator the terminator of a subpattern name must be this |
| 2368 | offsetptr where to put the offset from the start of the pattern |
| 2369 | nameptr where to put a pointer to the name in the input |
| 2370 | namelenptr where to put the length of the name |
| 2371 | errorcodeptr where to put an error code |
| 2372 | cb pointer to the compile data block |
| 2373 | |
| 2374 | Returns: TRUE if a name was read |
| 2375 | FALSE otherwise, with error code set |
| 2376 | */ |
| 2377 | |
| 2378 | static BOOL |
| 2379 | read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator, |
| 2380 | PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr, |
| 2381 | int *errorcodeptr, compile_block *cb) |
| 2382 | { |
| 2383 | PCRE2_SPTR ptr = *ptrptr; |
| 2384 | BOOL is_group = (*ptr != CHAR_ASTERISK); |
| 2385 | |
| 2386 | if (++ptr >= ptrend) /* No characters in name */ |
| 2387 | { |
| 2388 | *errorcodeptr = is_group? ERR62: /* Subpattern name expected */ |
| 2389 | ERR60; /* Verb not recognized or malformed */ |
| 2390 | goto FAILED; |
| 2391 | } |
| 2392 | |
| 2393 | *nameptr = ptr; |
| 2394 | *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern); |
| 2395 | |
| 2396 | /* In UTF mode, a group name may contain letters and decimal digits as defined |
| 2397 | by Unicode properties, and underscores, but must not start with a digit. */ |
| 2398 | |
| 2399 | #ifdef SUPPORT_UNICODE |
| 2400 | if (utf && is_group) |
| 2401 | { |
| 2402 | uint32_t c, type; |
| 2403 | |
| 2404 | GETCHAR(c, ptr); |
| 2405 | type = UCD_CHARTYPE(c); |
| 2406 | |
| 2407 | if (type == ucp_Nd) |
| 2408 | { |
| 2409 | *errorcodeptr = ERR44; |
| 2410 | goto FAILED; |
| 2411 | } |
| 2412 | |
| 2413 | for(;;) |
| 2414 | { |
| 2415 | if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L && |
| 2416 | c != CHAR_UNDERSCORE) break; |
| 2417 | ptr++; |
| 2418 | FORWARDCHARTEST(ptr, ptrend); |
| 2419 | if (ptr >= ptrend) break; |
| 2420 | GETCHAR(c, ptr); |
| 2421 | type = UCD_CHARTYPE(c); |
| 2422 | } |
| 2423 | } |
| 2424 | else |
| 2425 | #else |
| 2426 | (void)utf; /* Avoid compiler warning */ |
| 2427 | #endif /* SUPPORT_UNICODE */ |
| 2428 | |
| 2429 | /* Handle non-group names and group names in non-UTF modes. A group name must |
| 2430 | not start with a digit. If either of the others start with a digit it just |
| 2431 | won't be recognized. */ |
| 2432 | |
| 2433 | { |
| 2434 | if (is_group && IS_DIGIT(*ptr)) |
| 2435 | { |
| 2436 | *errorcodeptr = ERR44; |
| 2437 | goto FAILED; |
| 2438 | } |
| 2439 | |
| 2440 | while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) |
| 2441 | { |
| 2442 | ptr++; |
| 2443 | } |
| 2444 | } |
| 2445 | |
| 2446 | /* Check name length */ |
| 2447 | |
| 2448 | if (ptr > *nameptr + MAX_NAME_SIZE) |
| 2449 | { |
| 2450 | *errorcodeptr = ERR48; |
| 2451 | goto FAILED; |
| 2452 | } |
| 2453 | *namelenptr = (uint32_t)(ptr - *nameptr); |
| 2454 | |
| 2455 | /* Subpattern names must not be empty, and their terminator is checked here. |
| 2456 | (What follows a verb or alpha assertion name is checked separately.) */ |
| 2457 | |
| 2458 | if (is_group) |
| 2459 | { |
| 2460 | if (ptr == *nameptr) |
| 2461 | { |
| 2462 | *errorcodeptr = ERR62; /* Subpattern name expected */ |
| 2463 | goto FAILED; |
| 2464 | } |
| 2465 | if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator) |
| 2466 | { |
| 2467 | *errorcodeptr = ERR42; |
| 2468 | goto FAILED; |
| 2469 | } |
| 2470 | ptr++; |
| 2471 | } |
| 2472 | |
| 2473 | *ptrptr = ptr; |
| 2474 | return TRUE; |
| 2475 | |
| 2476 | FAILED: |
| 2477 | *ptrptr = ptr; |
| 2478 | return FALSE; |
| 2479 | } |
| 2480 | |
| 2481 | |
| 2482 | |
| 2483 | /************************************************* |
| 2484 | * Manage callouts at start of cycle * |
| 2485 | *************************************************/ |
| 2486 | |
| 2487 | /* At the start of a new item in parse_regex() we are able to record the |
| 2488 | details of the previous item in a prior callout, and also to set up an |
| 2489 | automatic callout if enabled. Avoid having two adjacent automatic callouts, |
| 2490 | which would otherwise happen for items such as \Q that contribute nothing to |
| 2491 | the parsed pattern. |
| 2492 | |
| 2493 | Arguments: |
| 2494 | ptr current pattern pointer |
| 2495 | pcalloutptr points to a pointer to previous callout, or NULL |
| 2496 | auto_callout TRUE if auto_callouts are enabled |
| 2497 | parsed_pattern the parsed pattern pointer |
| 2498 | cb compile block |
| 2499 | |
| 2500 | Returns: possibly updated parsed_pattern pointer. |
| 2501 | */ |
| 2502 | |
| 2503 | static uint32_t * |
| 2504 | manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout, |
| 2505 | uint32_t *parsed_pattern, compile_block *cb) |
| 2506 | { |
| 2507 | uint32_t *previous_callout = *pcalloutptr; |
| 2508 | |
| 2509 | if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr - |
| 2510 | cb->start_pattern - (PCRE2_SIZE)previous_callout[1]); |
| 2511 | |
| 2512 | if (!auto_callout) previous_callout = NULL; else |
| 2513 | { |
| 2514 | if (previous_callout == NULL || |
| 2515 | previous_callout != parsed_pattern - 4 || |
| 2516 | previous_callout[3] != 255) |
| 2517 | { |
| 2518 | previous_callout = parsed_pattern; /* Set up new automatic callout */ |
| 2519 | parsed_pattern += 4; |
| 2520 | previous_callout[0] = META_CALLOUT_NUMBER; |
| 2521 | previous_callout[2] = 0; |
| 2522 | previous_callout[3] = 255; |
| 2523 | } |
| 2524 | previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); |
| 2525 | } |
| 2526 | |
| 2527 | *pcalloutptr = previous_callout; |
| 2528 | return parsed_pattern; |
| 2529 | } |
| 2530 | |
| 2531 | |
| 2532 | |
| 2533 | /************************************************* |
| 2534 | * Parse regex and identify named groups * |
| 2535 | *************************************************/ |
| 2536 | |
| 2537 | /* This function is called first of all. It scans the pattern and does two |
| 2538 | things: (1) It identifies capturing groups and makes a table of named capturing |
| 2539 | groups so that information about them is fully available to both the compiling |
| 2540 | scans. (2) It writes a parsed version of the pattern with comments omitted and |
| 2541 | escapes processed into the parsed_pattern vector. |
| 2542 | |
| 2543 | Arguments: |
| 2544 | ptr points to the start of the pattern |
| 2545 | options compiling dynamic options (may change during the scan) |
| 2546 | has_lookbehind points to a boolean, set TRUE if a lookbehind is found |
| 2547 | cb pointer to the compile data block |
| 2548 | |
| 2549 | Returns: zero on success or a non-zero error code, with the |
| 2550 | error offset placed in the cb field |
| 2551 | */ |
| 2552 | |
| 2553 | /* A structure and some flags for dealing with nested groups. */ |
| 2554 | |
struct nest_save {
  uint16_t nest_depth;
  uint16_t reset_group;
  uint16_t max_group;
  uint16_t flags;      /* presumably a combination of the NSF_ bits below */
  uint32_t options;
};

typedef struct nest_save nest_save;

/* Flag bits; each is a distinct single bit of unsigned type. */

#define NSF_RESET       (1u << 0)
#define NSF_CONDASSERT  (1u << 1)
#define NSF_ATOMICSR    (1u << 2)
| 2566 | |
| 2567 | /* Options that are changeable within the pattern must be tracked during |
| 2568 | parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing, |
| 2569 | but all must be tracked so that META_OPTIONS items set the correct values for |
| 2570 | the main compiling phase. */ |
| 2571 | |
#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS        | \
  PCRE2_DOTALL          | \
  PCRE2_DUPNAMES        | \
  PCRE2_EXTENDED        | \
  PCRE2_EXTENDED_MORE   | \
  PCRE2_MULTILINE       | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)
| 2575 | |
| 2576 | /* States used for analyzing ranges in character classes. The two OK values |
| 2577 | must be last. */ |
| 2578 | |
enum {
  RANGE_NO         = 0,
  RANGE_STARTED    = 1,
  RANGE_OK_ESCAPED = 2,   /* The two OK states must keep the highest values */
  RANGE_OK_LITERAL = 3
};
| 2580 | |
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

PARSED_LITERAL(c, p) stores the literal c at *p, advances p, and sets the
variable okquantifier (which must be in scope at the point of use) to TRUE. In
32-bit mode a value of META_END or above is preceded by META_BIGVALUE, so up to
two elements may be written. Both forms are wrapped in do { } while (0) so that
the macro behaves as a single statement: it must be followed by a semicolon and
is safe as the unbraced body of an if or else. The arguments are parenthesized;
note that c is evaluated twice in 32-bit mode and p must be a modifiable
lvalue, so neither should have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do \
    { \
    if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#else
#define PARSED_LITERAL(c, p) \
  do \
    { \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#endif
| 2595 | |
| 2596 | /* Here's the actual function. */ |
| 2597 | |
| 2598 | static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind, |
| 2599 | compile_block *cb) |
| 2600 | { |
| 2601 | uint32_t c; |
| 2602 | uint32_t delimiter; |
| 2603 | uint32_t namelen; |
| 2604 | uint32_t class_range_state; |
| 2605 | uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */ |
| 2606 | uint32_t *verbstartptr = NULL; |
| 2607 | uint32_t *previous_callout = NULL; |
| 2608 | uint32_t *parsed_pattern = cb->parsed_pattern; |
| 2609 | uint32_t *parsed_pattern_end = cb->parsed_pattern_end; |
| 2610 | uint32_t meta_quantifier = 0; |
| 2611 | uint32_t add_after_mark = 0; |
| 2612 | uint32_t extra_options = cb->cx->extra_options; |
| 2613 | uint16_t nest_depth = 0; |
| 2614 | int after_manual_callout = 0; |
| 2615 | int expect_cond_assert = 0; |
| 2616 | int errorcode = 0; |
| 2617 | int escape; |
| 2618 | int i; |
| 2619 | BOOL inescq = FALSE; |
| 2620 | BOOL inverbname = FALSE; |
| 2621 | BOOL utf = (options & PCRE2_UTF) != 0; |
| 2622 | BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0; |
| 2623 | BOOL isdupname; |
| 2624 | BOOL negate_class; |
| 2625 | BOOL okquantifier = FALSE; |
| 2626 | PCRE2_SPTR thisptr; |
| 2627 | PCRE2_SPTR name; |
| 2628 | PCRE2_SPTR ptrend = cb->end_pattern; |
| 2629 | PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ |
| 2630 | named_group *ng; |
| 2631 | nest_save *top_nest, *end_nests; |
| 2632 | |
| 2633 | /* Insert leading items for word and line matching (features provided for the |
| 2634 | benefit of pcre2grep). */ |
| 2635 | |
| 2636 | if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) |
| 2637 | { |
| 2638 | *parsed_pattern++ = META_CIRCUMFLEX; |
| 2639 | *parsed_pattern++ = META_NOCAPTURE; |
| 2640 | } |
| 2641 | else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) |
| 2642 | { |
| 2643 | *parsed_pattern++ = META_ESCAPE + ESC_b; |
| 2644 | *parsed_pattern++ = META_NOCAPTURE; |
| 2645 | } |
| 2646 | |
| 2647 | /* If the pattern is actually a literal string, process it separately to avoid |
| 2648 | cluttering up the main loop. */ |
| 2649 | |
| 2650 | if ((options & PCRE2_LITERAL) != 0) |
| 2651 | { |
| 2652 | while (ptr < ptrend) |
| 2653 | { |
| 2654 | if (parsed_pattern >= parsed_pattern_end) |
| 2655 | { |
| 2656 | errorcode = ERR63; /* Internal error (parsed pattern overflow) */ |
| 2657 | goto FAILED; |
| 2658 | } |
| 2659 | thisptr = ptr; |
| 2660 | GETCHARINCTEST(c, ptr); |
| 2661 | if (auto_callout) |
| 2662 | parsed_pattern = manage_callouts(thisptr, &previous_callout, |
| 2663 | auto_callout, parsed_pattern, cb); |
| 2664 | PARSED_LITERAL(c, parsed_pattern); |
| 2665 | } |
| 2666 | goto PARSED_END; |
| 2667 | } |
| 2668 | |
| 2669 | /* Process a real regex which may contain meta-characters. */ |
| 2670 | |
| 2671 | top_nest = NULL; |
| 2672 | end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); |
| 2673 | |
| 2674 | /* The size of the nest_save structure might not be a factor of the size of the |
| 2675 | workspace. Therefore we must round down end_nests so as to correctly avoid |
| 2676 | creating a nest_save that spans the end of the workspace. */ |
| 2677 | |
| 2678 | end_nests = (nest_save *)((char *)end_nests - |
| 2679 | ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save))); |
| 2680 | |
| 2681 | /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */ |
| 2682 | |
| 2683 | if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED; |
| 2684 | |
| 2685 | /* Now scan the pattern */ |
| 2686 | |
| 2687 | while (ptr < ptrend) |
| 2688 | { |
| 2689 | int prev_expect_cond_assert; |
| 2690 | uint32_t min_repeat, max_repeat; |
| 2691 | uint32_t set, unset, *optset; |
| 2692 | uint32_t terminator; |
| 2693 | uint32_t prev_meta_quantifier; |
| 2694 | BOOL prev_okquantifier; |
| 2695 | PCRE2_SPTR tempptr; |
| 2696 | PCRE2_SIZE offset; |
| 2697 | |
| 2698 | if (parsed_pattern >= parsed_pattern_end) |
| 2699 | { |
| 2700 | errorcode = ERR63; /* Internal error (parsed pattern overflow) */ |
| 2701 | goto FAILED; |
| 2702 | } |
| 2703 | |
| 2704 | if (nest_depth > cb->cx->parens_nest_limit) |
| 2705 | { |
| 2706 | errorcode = ERR19; |
| 2707 | goto FAILED; /* Parentheses too deeply nested */ |
| 2708 | } |
| 2709 | |
| 2710 | /* Get next input character, save its position for callout handling. */ |
| 2711 | |
| 2712 | thisptr = ptr; |
| 2713 | GETCHARINCTEST(c, ptr); |
| 2714 | |
| 2715 | /* Copy quoted literals until \E, allowing for the possibility of automatic |
| 2716 | callouts, except when processing a (*VERB) "name". */ |
| 2717 | |
| 2718 | if (inescq) |
| 2719 | { |
| 2720 | if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) |
| 2721 | { |
| 2722 | inescq = FALSE; |
| 2723 | ptr++; /* Skip E */ |
| 2724 | } |
| 2725 | else |
| 2726 | { |
| 2727 | if (expect_cond_assert > 0) /* A literal is not allowed if we are */ |
| 2728 | { /* expecting a conditional assertion, */ |
| 2729 | ptr--; /* but an empty \Q\E sequence is OK. */ |
| 2730 | errorcode = ERR28; |
| 2731 | goto FAILED; |
| 2732 | } |
| 2733 | if (inverbname) |
| 2734 | { /* Don't use PARSED_LITERAL() because it */ |
| 2735 | #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */ |
| 2736 | if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; |
| 2737 | #endif |
| 2738 | *parsed_pattern++ = c; |
| 2739 | } |
| 2740 | else |
| 2741 | { |
| 2742 | if (after_manual_callout-- <= 0) |
| 2743 | parsed_pattern = manage_callouts(thisptr, &previous_callout, |
| 2744 | auto_callout, parsed_pattern, cb); |
| 2745 | PARSED_LITERAL(c, parsed_pattern); |
| 2746 | } |
| 2747 | meta_quantifier = 0; |
| 2748 | } |
| 2749 | continue; /* Next character */ |
| 2750 | } |
| 2751 | |
| 2752 | /* If we are processing the "name" part of a (*VERB:NAME) item, all |
| 2753 | characters up to the closing parenthesis are literals except when |
| 2754 | PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q |
| 2755 | and \E and escaped characters are allowed (no character types such as \d). If |
| 2756 | PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do |
| 2757 | this by not entering the special (*VERB:NAME) processing - they are then |
| 2758 | picked up below. Note that c is a character, not a code unit, so we must not |
| 2759 | use MAX_255 to test its size because MAX_255 tests code units and is assumed |
| 2760 | TRUE in 8-bit mode. */ |
| 2761 | |
| 2762 | if (inverbname && |
| 2763 | ( |
| 2764 | /* EITHER: not both options set */ |
| 2765 | ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) != |
| 2766 | (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) || |
| 2767 | #ifdef SUPPORT_UNICODE |
| 2768 | /* OR: character > 255 AND not Unicode Pattern White Space */ |
| 2769 | (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) || |
| 2770 | #endif |
| 2771 | /* OR: not a # comment or isspace() white space */ |
| 2772 | (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0 |
| 2773 | #ifdef SUPPORT_UNICODE |
| 2774 | /* and not CHAR_NEL when Unicode is supported */ |
| 2775 | && c != CHAR_NEL |
| 2776 | #endif |
| 2777 | ))) |
| 2778 | { |
| 2779 | PCRE2_SIZE verbnamelength; |
| 2780 | |
| 2781 | switch(c) |
| 2782 | { |
| 2783 | default: /* Don't use PARSED_LITERAL() because it */ |
| 2784 | #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */ |
| 2785 | if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; |
| 2786 | #endif |
| 2787 | *parsed_pattern++ = c; |
| 2788 | break; |
| 2789 | |
| 2790 | case CHAR_RIGHT_PARENTHESIS: |
| 2791 | inverbname = FALSE; |
| 2792 | /* This is the length in characters */ |
| 2793 | verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1); |
| 2794 | /* But the limit on the length is in code units */ |
| 2795 | if (ptr - verbnamestart - 1 > (int)MAX_MARK) |
| 2796 | { |
| 2797 | ptr--; |
| 2798 | errorcode = ERR76; |
| 2799 | goto FAILED; |
| 2800 | } |
| 2801 | *verblengthptr = (uint32_t)verbnamelength; |
| 2802 | |
| 2803 | /* If this name was on a verb such as (*ACCEPT) which does not continue, |
| 2804 | a (*MARK) was generated for the name. We now add the original verb as the |
| 2805 | next item. */ |
| 2806 | |
| 2807 | if (add_after_mark != 0) |
| 2808 | { |
| 2809 | *parsed_pattern++ = add_after_mark; |
| 2810 | add_after_mark = 0; |
| 2811 | } |
| 2812 | break; |
| 2813 | |
| 2814 | case CHAR_BACKSLASH: |
| 2815 | if ((options & PCRE2_ALT_VERBNAMES) != 0) |
| 2816 | { |
| 2817 | escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, |
| 2818 | cb->cx->extra_options, FALSE, cb); |
| 2819 | if (errorcode != 0) goto FAILED; |
| 2820 | } |
| 2821 | else escape = 0; /* Treat all as literal */ |
| 2822 | |
| 2823 | switch(escape) |
| 2824 | { |
| 2825 | case 0: /* Don't use PARSED_LITERAL() because it */ |
| 2826 | #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */ |
| 2827 | if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; |
| 2828 | #endif |
| 2829 | *parsed_pattern++ = c; |
| 2830 | break; |
| 2831 | |
| 2832 | case ESC_Q: |
| 2833 | inescq = TRUE; |
| 2834 | break; |
| 2835 | |
| 2836 | case ESC_E: /* Ignore */ |
| 2837 | break; |
| 2838 | |
| 2839 | default: |
| 2840 | errorcode = ERR40; /* Invalid in verb name */ |
| 2841 | goto FAILED; |
| 2842 | } |
| 2843 | } |
| 2844 | continue; /* Next character in pattern */ |
| 2845 | } |
| 2846 | |
| 2847 | /* Not a verb name character. At this point we must process everything that |
| 2848 | must not change the quantification state. This is mainly comments, but we |
| 2849 | handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as |
| 2850 | A+, as in Perl. An isolated \E is ignored. */ |
| 2851 | |
| 2852 | if (c == CHAR_BACKSLASH && ptr < ptrend) |
| 2853 | { |
| 2854 | if (*ptr == CHAR_Q || *ptr == CHAR_E) |
| 2855 | { |
| 2856 | inescq = *ptr == CHAR_Q; |
| 2857 | ptr++; |
| 2858 | continue; |
| 2859 | } |
| 2860 | } |
| 2861 | |
| 2862 | /* Skip over whitespace and # comments in extended mode. Note that c is a |
| 2863 | character, not a code unit, so we must not use MAX_255 to test its size |
| 2864 | because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The |
| 2865 | whitespace characters are those designated as "Pattern White Space" by |
| 2866 | Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is |
| 2867 | U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a |
| 2868 | subset of space characters that match \h and \v. */ |
| 2869 | |
| 2870 | if ((options & PCRE2_EXTENDED) != 0) |
| 2871 | { |
| 2872 | if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue; |
| 2873 | #ifdef SUPPORT_UNICODE |
| 2874 | if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue; |
| 2875 | #endif |
| 2876 | if (c == CHAR_NUMBER_SIGN) |
| 2877 | { |
| 2878 | while (ptr < ptrend) |
| 2879 | { |
| 2880 | if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ |
| 2881 | { /* IS_NEWLINE sets cb->nllen. */ |
| 2882 | ptr += cb->nllen; |
| 2883 | break; |
| 2884 | } |
| 2885 | ptr++; |
| 2886 | #ifdef SUPPORT_UNICODE |
| 2887 | if (utf) FORWARDCHARTEST(ptr, ptrend); |
| 2888 | #endif |
| 2889 | } |
| 2890 | continue; /* Next character in pattern */ |
| 2891 | } |
| 2892 | } |
| 2893 | |
| 2894 | /* Skip over bracketed comments */ |
| 2895 | |
| 2896 | if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 && |
| 2897 | ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) |
| 2898 | { |
| 2899 | while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS); |
| 2900 | if (ptr >= ptrend) |
| 2901 | { |
| 2902 | errorcode = ERR18; /* A special error for missing ) in a comment */ |
| 2903 | goto FAILED; /* to make it easier to debug. */ |
| 2904 | } |
| 2905 | ptr++; |
| 2906 | continue; /* Next character in pattern */ |
| 2907 | } |
| 2908 | |
| 2909 | /* If the next item is not a quantifier, fill in length of any previous |
| 2910 | callout and create an auto callout if required. */ |
| 2911 | |
| 2912 | if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK && |
| 2913 | (c != CHAR_LEFT_CURLY_BRACKET || |
| 2914 | (tempptr = ptr, |
| 2915 | !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode)))) |
| 2916 | { |
| 2917 | if (after_manual_callout-- <= 0) |
| 2918 | parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout, |
| 2919 | parsed_pattern, cb); |
| 2920 | } |
| 2921 | |
| 2922 | /* If expect_cond_assert is 2, we have just passed (?( and are expecting an |
| 2923 | assertion, possibly preceded by a callout. If the value is 1, we have just |
| 2924 | had the callout and expect an assertion. There must be at least 3 more |
| 2925 | characters in all cases. When expect_cond_assert is 2, we know that the |
| 2926 | current character is an opening parenthesis, as otherwise we wouldn't be |
| 2927 | here. However, when it is 1, we need to check, and it's easiest just to check |
| 2928 | always. Note that expect_cond_assert may be negative, since all callouts just |
| 2929 | decrement it. */ |
| 2930 | |
| 2931 | if (expect_cond_assert > 0) |
| 2932 | { |
| 2933 | BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 && |
| 2934 | (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK); |
| 2935 | if (ok) |
| 2936 | { |
| 2937 | if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */ |
| 2938 | { |
| 2939 | ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0; |
| 2940 | } |
| 2941 | else switch(ptr[1]) /* Traditional symbolic format */ |
| 2942 | { |
| 2943 | case CHAR_C: |
| 2944 | ok = expect_cond_assert == 2; |
| 2945 | break; |
| 2946 | |
| 2947 | case CHAR_EQUALS_SIGN: |
| 2948 | case CHAR_EXCLAMATION_MARK: |
| 2949 | break; |
| 2950 | |
| 2951 | case CHAR_LESS_THAN_SIGN: |
| 2952 | ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK; |
| 2953 | break; |
| 2954 | |
| 2955 | default: |
| 2956 | ok = FALSE; |
| 2957 | } |
| 2958 | } |
| 2959 | |
| 2960 | if (!ok) |
| 2961 | { |
| 2962 | ptr--; /* Adjust error offset */ |
| 2963 | errorcode = ERR28; |
| 2964 | goto FAILED; |
| 2965 | } |
| 2966 | } |
| 2967 | |
| 2968 | /* Remember whether we are expecting a conditional assertion, and set the |
| 2969 | default for this item. */ |
| 2970 | |
| 2971 | prev_expect_cond_assert = expect_cond_assert; |
| 2972 | expect_cond_assert = 0; |
| 2973 | |
| 2974 | /* Remember quantification status for the previous significant item, then set |
| 2975 | default for this item. */ |
| 2976 | |
| 2977 | prev_okquantifier = okquantifier; |
| 2978 | prev_meta_quantifier = meta_quantifier; |
| 2979 | okquantifier = FALSE; |
| 2980 | meta_quantifier = 0; |
| 2981 | |
| 2982 | /* If the previous significant item was a quantifier, adjust the parsed code |
| 2983 | if there is a following modifier. The base meta value is always followed by |
| 2984 | the PLUS and QUERY values, in that order. We do this here rather than after |
| 2985 | reading a quantifier so that intervening comments and /x whitespace can be |
| 2986 | ignored without having to replicate code. */ |
| 2987 | |
| 2988 | if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS)) |
| 2989 | { |
| 2990 | parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] = |
| 2991 | prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)? |
| 2992 | 0x00020000u : 0x00010000u); |
| 2993 | continue; /* Next character in pattern */ |
| 2994 | } |
| 2995 | |
| 2996 | |
| 2997 | /* Process the next item in the main part of a pattern. */ |
| 2998 | |
| 2999 | switch(c) |
| 3000 | { |
| 3001 | default: /* Non-special character */ |
| 3002 | PARSED_LITERAL(c, parsed_pattern); |
| 3003 | break; |
| 3004 | |
| 3005 | |
| 3006 | /* ---- Escape sequence ---- */ |
| 3007 | |
| 3008 | case CHAR_BACKSLASH: |
| 3009 | tempptr = ptr; |
| 3010 | escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, |
| 3011 | cb->cx->extra_options, FALSE, cb); |
| 3012 | if (errorcode != 0) |
| 3013 | { |
| 3014 | ESCAPE_FAILED: |
| 3015 | if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) |
| 3016 | goto FAILED; |
| 3017 | ptr = tempptr; |
| 3018 | if (ptr >= ptrend) c = CHAR_BACKSLASH; else |
| 3019 | { |
| 3020 | GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
| 3021 | } |
| 3022 | escape = 0; /* Treat as literal character */ |
| 3023 | } |
| 3024 | |
| 3025 | /* The escape was a data escape or literal character. */ |
| 3026 | |
| 3027 | if (escape == 0) |
| 3028 | { |
| 3029 | PARSED_LITERAL(c, parsed_pattern); |
| 3030 | } |
| 3031 | |
| 3032 | /* The escape was a back (or forward) reference. We keep the offset in |
| 3033 | order to give a more useful diagnostic for a bad forward reference. For |
| 3034 | references to groups numbered less than 10 we can't use more than two items |
| 3035 | in parsed_pattern because they may be just two characters in the input (and |
| 3036 | in a 64-bit world an offset may need two elements). So for them, the offset |
| 3037 | of the first occurrence is held in a special vector. */ |
| 3038 | |
| 3039 | else if (escape < 0) |
| 3040 | { |
| 3041 | offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1); |
| 3042 | escape = -escape; |
| 3043 | *parsed_pattern++ = META_BACKREF | (uint32_t)escape; |
| 3044 | if (escape < 10) |
| 3045 | { |
| 3046 | if (cb->small_ref_offset[escape] == PCRE2_UNSET) |
| 3047 | cb->small_ref_offset[escape] = offset; |
| 3048 | } |
| 3049 | else |
| 3050 | { |
| 3051 | PUTOFFSET(offset, parsed_pattern); |
| 3052 | } |
| 3053 | okquantifier = TRUE; |
| 3054 | } |
| 3055 | |
| 3056 | /* The escape was a character class such as \d etc. or other special |
| 3057 | escape indicator such as \A or \X. Most of them generate just a single |
| 3058 | parsed item, but \P and \p are followed by a 16-bit type and a 16-bit |
| 3059 | value. They are supported only when Unicode is available. The type and |
| 3060 | value are packed into a single 32-bit value so that the whole sequences |
| 3061 | uses only two elements in the parsed_vector. This is because the same |
| 3062 | coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is |
| 3063 | set. |
| 3064 | |
| 3065 | There are also some cases where the escape sequence is followed by a name: |
| 3066 | \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name> |
| 3067 | and \g'name' are subroutine calls by name; \g{name} is a synonym for |
| 3068 | \k{name}. Note that \g<number> and \g'number' are handled by check_escape() |
| 3069 | and returned as a negative value (handled above). A name is coded as an |
| 3070 | offset into the pattern and a length. */ |
| 3071 | |
| 3072 | else switch (escape) |
| 3073 | { |
| 3074 | case ESC_C: |
| 3075 | #ifdef NEVER_BACKSLASH_C |
| 3076 | errorcode = ERR85; |
| 3077 | goto ESCAPE_FAILED; |
| 3078 | #else |
| 3079 | if ((options & PCRE2_NEVER_BACKSLASH_C) != 0) |
| 3080 | { |
| 3081 | errorcode = ERR83; |
| 3082 | goto ESCAPE_FAILED; |
| 3083 | } |
| 3084 | #endif |
| 3085 | okquantifier = TRUE; |
| 3086 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3087 | break; |
| 3088 | |
| 3089 | case ESC_X: |
| 3090 | #ifndef SUPPORT_UNICODE |
| 3091 | errorcode = ERR45; /* Supported only with Unicode support */ |
| 3092 | goto ESCAPE_FAILED; |
| 3093 | #endif |
| 3094 | case ESC_H: |
| 3095 | case ESC_h: |
| 3096 | case ESC_N: |
| 3097 | case ESC_R: |
| 3098 | case ESC_V: |
| 3099 | case ESC_v: |
| 3100 | okquantifier = TRUE; |
| 3101 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3102 | break; |
| 3103 | |
| 3104 | default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */ |
| 3105 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3106 | break; |
| 3107 | |
| 3108 | /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set |
| 3109 | without Unicode support because it is checked when pcre2_compile() is |
| 3110 | called. */ |
| 3111 | |
| 3112 | case ESC_d: |
| 3113 | case ESC_D: |
| 3114 | case ESC_s: |
| 3115 | case ESC_S: |
| 3116 | case ESC_w: |
| 3117 | case ESC_W: |
| 3118 | okquantifier = TRUE; |
| 3119 | if ((options & PCRE2_UCP) == 0) |
| 3120 | { |
| 3121 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3122 | } |
| 3123 | else |
| 3124 | { |
| 3125 | *parsed_pattern++ = META_ESCAPE + |
| 3126 | ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? |
| 3127 | ESC_p : ESC_P); |
| 3128 | switch(escape) |
| 3129 | { |
| 3130 | case ESC_d: |
| 3131 | case ESC_D: |
| 3132 | *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; |
| 3133 | break; |
| 3134 | |
| 3135 | case ESC_s: |
| 3136 | case ESC_S: |
| 3137 | *parsed_pattern++ = PT_SPACE << 16; |
| 3138 | break; |
| 3139 | |
| 3140 | case ESC_w: |
| 3141 | case ESC_W: |
| 3142 | *parsed_pattern++ = PT_WORD << 16; |
| 3143 | break; |
| 3144 | } |
| 3145 | } |
| 3146 | break; |
| 3147 | |
| 3148 | /* Unicode property matching */ |
| 3149 | |
| 3150 | case ESC_P: |
| 3151 | case ESC_p: |
| 3152 | #ifdef SUPPORT_UNICODE |
| 3153 | { |
| 3154 | BOOL negated; |
| 3155 | uint16_t ptype = 0, pdata = 0; |
| 3156 | if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) |
| 3157 | goto ESCAPE_FAILED; |
| 3158 | if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; |
| 3159 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3160 | *parsed_pattern++ = (ptype << 16) | pdata; |
| 3161 | okquantifier = TRUE; |
| 3162 | } |
| 3163 | #else |
| 3164 | errorcode = ERR45; |
| 3165 | goto ESCAPE_FAILED; |
| 3166 | #endif |
| 3167 | break; /* End \P and \p */ |
| 3168 | |
| 3169 | /* When \g is used with quotes or angle brackets as delimiters, it is a |
| 3170 | numerical or named subroutine call, and control comes here. When used |
| 3171 | with brace delimiters it is a numerical back reference and does not come |
| 3172 | here because check_escape() returns it directly as a reference. \k is |
| 3173 | always a named back reference. */ |
| 3174 | |
| 3175 | case ESC_g: |
| 3176 | case ESC_k: |
| 3177 | if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET && |
| 3178 | *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE)) |
| 3179 | { |
| 3180 | errorcode = (escape == ESC_g)? ERR57 : ERR69; |
| 3181 | goto ESCAPE_FAILED; |
| 3182 | } |
| 3183 | terminator = (*ptr == CHAR_LESS_THAN_SIGN)? |
| 3184 | CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? |
| 3185 | CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; |
| 3186 | |
| 3187 | /* For a non-braced \g, check for a numerical recursion. */ |
| 3188 | |
| 3189 | if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET) |
| 3190 | { |
| 3191 | PCRE2_SPTR p = ptr + 1; |
| 3192 | |
| 3193 | if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, |
| 3194 | &errorcode)) |
| 3195 | { |
| 3196 | if (p >= ptrend || *p != terminator) |
| 3197 | { |
| 3198 | errorcode = ERR57; |
| 3199 | goto ESCAPE_FAILED; |
| 3200 | } |
| 3201 | ptr = p; |
| 3202 | goto SET_RECURSION; |
| 3203 | } |
| 3204 | if (errorcode != 0) goto ESCAPE_FAILED; |
| 3205 | } |
| 3206 | |
| 3207 | /* Not a numerical recursion */ |
| 3208 | |
| 3209 | if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, |
| 3210 | &errorcode, cb)) goto ESCAPE_FAILED; |
| 3211 | |
| 3212 | /* \k and \g when used with braces are back references, whereas \g used |
| 3213 | with quotes or angle brackets is a recursion */ |
| 3214 | |
| 3215 | *parsed_pattern++ = |
| 3216 | (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)? |
| 3217 | META_BACKREF_BYNAME : META_RECURSE_BYNAME; |
| 3218 | *parsed_pattern++ = namelen; |
| 3219 | |
| 3220 | PUTOFFSET(offset, parsed_pattern); |
| 3221 | okquantifier = TRUE; |
| 3222 | break; /* End special escape processing */ |
| 3223 | } |
| 3224 | break; /* End escape sequence processing */ |
| 3225 | |
| 3226 | |
| 3227 | /* ---- Single-character special items ---- */ |
| 3228 | |
| 3229 | case CHAR_CIRCUMFLEX_ACCENT: |
| 3230 | *parsed_pattern++ = META_CIRCUMFLEX; |
| 3231 | break; |
| 3232 | |
| 3233 | case CHAR_DOLLAR_SIGN: |
| 3234 | *parsed_pattern++ = META_DOLLAR; |
| 3235 | break; |
| 3236 | |
| 3237 | case CHAR_DOT: |
| 3238 | *parsed_pattern++ = META_DOT; |
| 3239 | okquantifier = TRUE; |
| 3240 | break; |
| 3241 | |
| 3242 | |
| 3243 | /* ---- Single-character quantifiers ---- */ |
| 3244 | |
| 3245 | case CHAR_ASTERISK: |
| 3246 | meta_quantifier = META_ASTERISK; |
| 3247 | goto CHECK_QUANTIFIER; |
| 3248 | |
| 3249 | case CHAR_PLUS: |
| 3250 | meta_quantifier = META_PLUS; |
| 3251 | goto CHECK_QUANTIFIER; |
| 3252 | |
| 3253 | case CHAR_QUESTION_MARK: |
| 3254 | meta_quantifier = META_QUERY; |
| 3255 | goto CHECK_QUANTIFIER; |
| 3256 | |
| 3257 | |
| 3258 | /* ---- Potential {n,m} quantifier ---- */ |
| 3259 | |
| 3260 | case CHAR_LEFT_CURLY_BRACKET: |
| 3261 | if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat, |
| 3262 | &errorcode)) |
| 3263 | { |
| 3264 | if (errorcode != 0) goto FAILED; /* Error in quantifier. */ |
| 3265 | PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */ |
| 3266 | break; /* No more quantifier processing */ |
| 3267 | } |
| 3268 | meta_quantifier = META_MINMAX; |
| 3269 | /* Fall through */ |
| 3270 | |
| 3271 | |
| 3272 | /* ---- Quantifier post-processing ---- */ |
| 3273 | |
| 3274 | /* Check that a quantifier is allowed after the previous item. */ |
| 3275 | |
| 3276 | CHECK_QUANTIFIER: |
| 3277 | if (!prev_okquantifier) |
| 3278 | { |
| 3279 | errorcode = ERR9; |
| 3280 | goto FAILED_BACK; |
| 3281 | } |
| 3282 | |
| 3283 | /* Most (*VERB)s are not allowed to be quantified, but an ungreedy |
| 3284 | quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a |
| 3285 | sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by |
| 3286 | wrapping it in non-capturing brackets, but we have to allow for a preceding |
| 3287 | (*MARK) for when (*ACCEPT) has an argument. */ |
| 3288 | |
| 3289 | if (parsed_pattern[-1] == META_ACCEPT) |
| 3290 | { |
| 3291 | uint32_t *p; |
| 3292 | for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0]; |
| 3293 | *verbstartptr = META_NOCAPTURE; |
| 3294 | parsed_pattern[1] = META_KET; |
| 3295 | parsed_pattern += 2; |
| 3296 | } |
| 3297 | |
| 3298 | /* Now we can put the quantifier into the parsed pattern vector. At this |
| 3299 | stage, we have only the basic quantifier. The check for a following + or ? |
| 3300 | modifier happens at the top of the loop, after any intervening comments |
| 3301 | have been removed. */ |
| 3302 | |
| 3303 | *parsed_pattern++ = meta_quantifier; |
| 3304 | if (c == CHAR_LEFT_CURLY_BRACKET) |
| 3305 | { |
| 3306 | *parsed_pattern++ = min_repeat; |
| 3307 | *parsed_pattern++ = max_repeat; |
| 3308 | } |
| 3309 | break; |
| 3310 | |
| 3311 | |
| 3312 | /* ---- Character class ---- */ |
| 3313 | |
| 3314 | case CHAR_LEFT_SQUARE_BRACKET: |
| 3315 | okquantifier = TRUE; |
| 3316 | |
| 3317 | /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is |
| 3318 | used for "start of word" and "end of word". As these are otherwise illegal |
| 3319 | sequences, we don't break anything by recognizing them. They are replaced |
| 3320 | by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are |
| 3321 | erroneous and are handled by the normal code below. */ |
| 3322 | |
| 3323 | if (ptrend - ptr >= 6 && |
| 3324 | (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 || |
| 3325 | PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0)) |
| 3326 | { |
| 3327 | *parsed_pattern++ = META_ESCAPE + ESC_b; |
| 3328 | |
| 3329 | if (ptr[2] == CHAR_LESS_THAN_SIGN) |
| 3330 | { |
| 3331 | *parsed_pattern++ = META_LOOKAHEAD; |
| 3332 | } |
| 3333 | else |
| 3334 | { |
| 3335 | *parsed_pattern++ = META_LOOKBEHIND; |
| 3336 | *has_lookbehind = TRUE; |
| 3337 | |
| 3338 | /* The offset is used only for the "non-fixed length" error; this won't |
| 3339 | occur here, so just store zero. */ |
| 3340 | |
| 3341 | PUTOFFSET((PCRE2_SIZE)0, parsed_pattern); |
| 3342 | } |
| 3343 | |
| 3344 | if ((options & PCRE2_UCP) == 0) |
| 3345 | *parsed_pattern++ = META_ESCAPE + ESC_w; |
| 3346 | else |
| 3347 | { |
| 3348 | *parsed_pattern++ = META_ESCAPE + ESC_p; |
| 3349 | *parsed_pattern++ = PT_WORD << 16; |
| 3350 | } |
| 3351 | *parsed_pattern++ = META_KET; |
| 3352 | ptr += 6; |
| 3353 | break; |
| 3354 | } |
| 3355 | |
| 3356 | /* PCRE supports POSIX class stuff inside a class. Perl gives an error if |
| 3357 | they are encountered at the top level, so we'll do that too. */ |
| 3358 | |
| 3359 | if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT || |
| 3360 | *ptr == CHAR_EQUALS_SIGN) && |
| 3361 | check_posix_syntax(ptr, ptrend, &tempptr)) |
| 3362 | { |
| 3363 | errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13; |
| 3364 | goto FAILED; |
| 3365 | } |
| 3366 | |
| 3367 | /* Process a regular character class. If the first character is '^', set |
| 3368 | the negation flag. If the first few characters (either before or after ^) |
| 3369 | are \Q\E or \E or space or tab in extended-more mode, we skip them too. |
| 3370 | This makes for compatibility with Perl. */ |
| 3371 | |
| 3372 | negate_class = FALSE; |
| 3373 | while (ptr < ptrend) |
| 3374 | { |
| 3375 | GETCHARINCTEST(c, ptr); |
| 3376 | if (c == CHAR_BACKSLASH) |
| 3377 | { |
| 3378 | if (ptr < ptrend && *ptr == CHAR_E) ptr++; |
| 3379 | else if (ptrend - ptr >= 3 && |
| 3380 | PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0) |
| 3381 | ptr += 3; |
| 3382 | else |
| 3383 | break; |
| 3384 | } |
| 3385 | else if ((options & PCRE2_EXTENDED_MORE) != 0 && |
| 3386 | (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */ |
| 3387 | continue; |
| 3388 | else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) |
| 3389 | negate_class = TRUE; |
| 3390 | else break; |
| 3391 | } |
| 3392 | |
| 3393 | /* Now the real contents of the class; c has the first "real" character. |
| 3394 | Empty classes are permitted only if the option is set. */ |
| 3395 | |
| 3396 | if (c == CHAR_RIGHT_SQUARE_BRACKET && |
| 3397 | (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) |
| 3398 | { |
| 3399 | *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY; |
| 3400 | break; /* End of class processing */ |
| 3401 | } |
| 3402 | |
| 3403 | /* Process a non-empty class. */ |
| 3404 | |
| 3405 | *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS; |
| 3406 | class_range_state = RANGE_NO; |
| 3407 | |
| 3408 | /* In an EBCDIC environment, Perl treats alphabetic ranges specially |
| 3409 | because there are holes in the encoding, and simply using the range A-Z |
| 3410 | (for example) would include the characters in the holes. This applies only |
| 3411 | to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z] |
| 3412 | in this respect. In order to accommodate this, we keep track of whether |
| 3413 | character values are literal or not, and a state variable for handling |
| 3414 | ranges. */ |
| 3415 | |
| 3416 | /* Loop for the contents of the class */ |
| 3417 | |
| 3418 | for (;;) |
| 3419 | { |
| 3420 | BOOL char_is_literal = TRUE; |
| 3421 | |
| 3422 | /* Inside \Q...\E everything is literal except \E */ |
| 3423 | |
| 3424 | if (inescq) |
| 3425 | { |
| 3426 | if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) |
| 3427 | { |
| 3428 | inescq = FALSE; /* Reset literal state */ |
| 3429 | ptr++; /* Skip the 'E' */ |
| 3430 | goto CLASS_CONTINUE; |
| 3431 | } |
| 3432 | goto CLASS_LITERAL; |
| 3433 | } |
| 3434 | |
| 3435 | /* Skip over space and tab (only) in extended-more mode. */ |
| 3436 | |
| 3437 | if ((options & PCRE2_EXTENDED_MORE) != 0 && |
| 3438 | (c == CHAR_SPACE || c == CHAR_HT)) |
| 3439 | goto CLASS_CONTINUE; |
| 3440 | |
| 3441 | /* Handle POSIX class names. Perl allows a negation extension of the |
| 3442 | form [:^name:]. A square bracket that doesn't match the syntax is |
| 3443 | treated as a literal. We also recognize the POSIX constructions |
| 3444 | [.ch.] and [=ch=] ("collating elements") and fault them, as Perl |
| 3445 | 5.6 and 5.8 do. */ |
| 3446 | |
| 3447 | if (c == CHAR_LEFT_SQUARE_BRACKET && |
| 3448 | ptrend - ptr >= 3 && |
| 3449 | (*ptr == CHAR_COLON || *ptr == CHAR_DOT || |
| 3450 | *ptr == CHAR_EQUALS_SIGN) && |
| 3451 | check_posix_syntax(ptr, ptrend, &tempptr)) |
| 3452 | { |
| 3453 | BOOL posix_negate = FALSE; |
| 3454 | int posix_class; |
| 3455 | |
| 3456 | /* Perl treats a hyphen before a POSIX class as a literal, not the |
| 3457 | start of a range. However, it gives a warning in its warning mode. PCRE |
| 3458 | does not have a warning mode, so we give an error, because this is |
| 3459 | likely an error on the user's part. */ |
| 3460 | |
| 3461 | if (class_range_state == RANGE_STARTED) |
| 3462 | { |
| 3463 | errorcode = ERR50; |
| 3464 | goto FAILED; |
| 3465 | } |
| 3466 | |
| 3467 | if (*ptr != CHAR_COLON) |
| 3468 | { |
| 3469 | errorcode = ERR13; |
| 3470 | goto FAILED_BACK; |
| 3471 | } |
| 3472 | |
| 3473 | if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT) |
| 3474 | { |
| 3475 | posix_negate = TRUE; |
| 3476 | ptr++; |
| 3477 | } |
| 3478 | |
| 3479 | posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); |
| 3480 | if (posix_class < 0) |
| 3481 | { |
| 3482 | errorcode = ERR30; |
| 3483 | goto FAILED; |
| 3484 | } |
| 3485 | ptr = tempptr + 2; |
| 3486 | |
| 3487 | /* Perl treats a hyphen after a POSIX class as a literal, not the |
| 3488 | start of a range. However, it gives a warning in its warning mode |
| 3489 | unless the hyphen is the last character in the class. PCRE does not |
| 3490 | have a warning mode, so we give an error, because this is likely an |
| 3491 | error on the user's part. */ |
| 3492 | |
| 3493 | if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && |
| 3494 | ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) |
| 3495 | { |
| 3496 | errorcode = ERR50; |
| 3497 | goto FAILED; |
| 3498 | } |
| 3499 | |
| 3500 | /* Set "a hyphen is not the start of a range" for the -] case, and also |
| 3501 | in case the POSIX class is followed by \E or \Q\E (possibly repeated - |
| 3502 | fuzzers do that kind of thing) and *then* a hyphen. This causes that |
| 3503 | hyphen to be treated as a literal. I don't think it's worth setting up |
| 3504 | special apparatus to do otherwise. */ |
| 3505 | |
| 3506 | class_range_state = RANGE_NO; |
| 3507 | |
| 3508 | /* When PCRE2_UCP is set, some of the POSIX classes are converted to |
| 3509 | use Unicode properties \p or \P or, in one case, \h or \H. The |
| 3510 | substitutes table has two values per class, containing the type and |
| 3511 | value of a \p or \P item. The special cases are specified with a |
| 3512 | negative type: a non-zero value causes \h or \H to be used, and a zero |
| 3513 | value falls through to behave like a non-UCP POSIX class. */ |
| 3514 | |
| 3515 | #ifdef SUPPORT_UNICODE |
| 3516 | if ((options & PCRE2_UCP) != 0) |
| 3517 | { |
| 3518 | int ptype = posix_substitutes[2*posix_class]; |
| 3519 | int pvalue = posix_substitutes[2*posix_class + 1]; |
| 3520 | if (ptype >= 0) |
| 3521 | { |
| 3522 | *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p); |
| 3523 | *parsed_pattern++ = (ptype << 16) | pvalue; |
| 3524 | goto CLASS_CONTINUE; |
| 3525 | } |
| 3526 | |
| 3527 | if (pvalue != 0) |
| 3528 | { |
| 3529 | *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h); |
| 3530 | goto CLASS_CONTINUE; |
| 3531 | } |
| 3532 | |
| 3533 | /* Fall through */ |
| 3534 | } |
| 3535 | #endif /* SUPPORT_UNICODE */ |
| 3536 | |
| 3537 | /* Non-UCP POSIX class */ |
| 3538 | |
| 3539 | *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX; |
| 3540 | *parsed_pattern++ = posix_class; |
| 3541 | } |
| 3542 | |
| 3543 | /* Handle potential start of range */ |
| 3544 | |
| 3545 | else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED) |
| 3546 | { |
| 3547 | *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)? |
| 3548 | META_RANGE_LITERAL : META_RANGE_ESCAPED; |
| 3549 | class_range_state = RANGE_STARTED; |
| 3550 | } |
| 3551 | |
| 3552 | /* Handle a literal character */ |
| 3553 | |
| 3554 | else if (c != CHAR_BACKSLASH) |
| 3555 | { |
| 3556 | CLASS_LITERAL: |
| 3557 | if (class_range_state == RANGE_STARTED) |
| 3558 | { |
| 3559 | if (c == parsed_pattern[-2]) /* Optimize one-char range */ |
| 3560 | parsed_pattern--; |
| 3561 | else if (parsed_pattern[-2] > c) /* Check range is in order */ |
| 3562 | { |
| 3563 | errorcode = ERR8; |
| 3564 | goto FAILED_BACK; |
| 3565 | } |
| 3566 | else |
| 3567 | { |
| 3568 | if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL) |
| 3569 | parsed_pattern[-1] = META_RANGE_ESCAPED; |
| 3570 | PARSED_LITERAL(c, parsed_pattern); |
| 3571 | } |
| 3572 | class_range_state = RANGE_NO; |
| 3573 | } |
| 3574 | else /* Potential start of range */ |
| 3575 | { |
| 3576 | class_range_state = char_is_literal? |
| 3577 | RANGE_OK_LITERAL : RANGE_OK_ESCAPED; |
| 3578 | PARSED_LITERAL(c, parsed_pattern); |
| 3579 | } |
| 3580 | } |
| 3581 | |
| 3582 | /* Handle escapes in a class */ |
| 3583 | |
| 3584 | else |
| 3585 | { |
| 3586 | tempptr = ptr; |
| 3587 | escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, |
| 3588 | cb->cx->extra_options, TRUE, cb); |
| 3589 | |
| 3590 | if (errorcode != 0) |
| 3591 | { |
| 3592 | if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) |
| 3593 | goto FAILED; |
| 3594 | ptr = tempptr; |
| 3595 | if (ptr >= ptrend) c = CHAR_BACKSLASH; else |
| 3596 | { |
| 3597 | GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
| 3598 | } |
| 3599 | escape = 0; /* Treat as literal character */ |
| 3600 | } |
| 3601 | |
| 3602 | switch(escape) |
| 3603 | { |
| 3604 | case 0: /* Escaped character code point is in c */ |
| 3605 | char_is_literal = FALSE; |
| 3606 | goto CLASS_LITERAL; |
| 3607 | |
| 3608 | case ESC_b: |
| 3609 | c = CHAR_BS; /* \b is backspace in a class */ |
| 3610 | char_is_literal = FALSE; |
| 3611 | goto CLASS_LITERAL; |
| 3612 | |
| 3613 | case ESC_Q: |
| 3614 | inescq = TRUE; /* Enter literal mode */ |
| 3615 | goto CLASS_CONTINUE; |
| 3616 | |
| 3617 | case ESC_E: /* Ignore orphan \E */ |
| 3618 | goto CLASS_CONTINUE; |
| 3619 | |
| 3620 | case ESC_B: /* Always an error in a class */ |
| 3621 | case ESC_R: |
| 3622 | case ESC_X: |
| 3623 | errorcode = ERR7; |
| 3624 | ptr--; |
| 3625 | goto FAILED; |
| 3626 | } |
| 3627 | |
| 3628 | /* The second part of a range can be a single-character escape |
| 3629 | sequence (detected above), but not any of the other escapes. Perl |
| 3630 | treats a hyphen as a literal in such circumstances. However, in Perl's |
| 3631 | warning mode, a warning is given, so PCRE now faults it, as it is |
| 3632 | almost certainly a mistake on the user's part. */ |
| 3633 | |
| 3634 | if (class_range_state == RANGE_STARTED) |
| 3635 | { |
| 3636 | errorcode = ERR50; |
| 3637 | goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */ |
| 3638 | } |
| 3639 | |
| 3640 | /* Of the remaining escapes, only those that define characters are |
| 3641 | allowed in a class. None may start a range. */ |
| 3642 | |
| 3643 | class_range_state = RANGE_NO; |
| 3644 | switch(escape) |
| 3645 | { |
| 3646 | case ESC_N: |
| 3647 | errorcode = ERR71; |
| 3648 | goto FAILED; |
| 3649 | |
| 3650 | case ESC_H: |
| 3651 | case ESC_h: |
| 3652 | case ESC_V: |
| 3653 | case ESC_v: |
| 3654 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3655 | break; |
| 3656 | |
| 3657 | /* These escapes are converted to Unicode property tests when |
| 3658 | PCRE2_UCP is set. */ |
| 3659 | |
| 3660 | case ESC_d: |
| 3661 | case ESC_D: |
| 3662 | case ESC_s: |
| 3663 | case ESC_S: |
| 3664 | case ESC_w: |
| 3665 | case ESC_W: |
| 3666 | if ((options & PCRE2_UCP) == 0) |
| 3667 | { |
| 3668 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3669 | } |
| 3670 | else |
| 3671 | { |
| 3672 | *parsed_pattern++ = META_ESCAPE + |
| 3673 | ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? |
| 3674 | ESC_p : ESC_P); |
| 3675 | switch(escape) |
| 3676 | { |
| 3677 | case ESC_d: |
| 3678 | case ESC_D: |
| 3679 | *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; |
| 3680 | break; |
| 3681 | |
| 3682 | case ESC_s: |
| 3683 | case ESC_S: |
| 3684 | *parsed_pattern++ = PT_SPACE << 16; |
| 3685 | break; |
| 3686 | |
| 3687 | case ESC_w: |
| 3688 | case ESC_W: |
| 3689 | *parsed_pattern++ = PT_WORD << 16; |
| 3690 | break; |
| 3691 | } |
| 3692 | } |
| 3693 | break; |
| 3694 | |
| 3695 | /* Explicit Unicode property matching */ |
| 3696 | |
| 3697 | case ESC_P: |
| 3698 | case ESC_p: |
| 3699 | #ifdef SUPPORT_UNICODE |
| 3700 | { |
| 3701 | BOOL negated; |
| 3702 | uint16_t ptype = 0, pdata = 0; |
| 3703 | if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) |
| 3704 | goto FAILED; |
| 3705 | if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; |
| 3706 | *parsed_pattern++ = META_ESCAPE + escape; |
| 3707 | *parsed_pattern++ = (ptype << 16) | pdata; |
| 3708 | } |
| 3709 | #else |
| 3710 | errorcode = ERR45; |
| 3711 | goto FAILED; |
| 3712 | #endif |
| 3713 | break; /* End \P and \p */ |
| 3714 | |
| 3715 | default: /* All others are not allowed in a class */ |
| 3716 | errorcode = ERR7; |
| 3717 | ptr--; |
| 3718 | goto FAILED; |
| 3719 | } |
| 3720 | |
| 3721 | /* Perl gives a warning unless a following hyphen is the last character |
| 3722 | in the class. PCRE throws an error. */ |
| 3723 | |
| 3724 | if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && |
| 3725 | ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) |
| 3726 | { |
| 3727 | errorcode = ERR50; |
| 3728 | goto FAILED; |
| 3729 | } |
| 3730 | } |
| 3731 | |
| 3732 | /* Proceed to next thing in the class. */ |
| 3733 | |
| 3734 | CLASS_CONTINUE: |
| 3735 | if (ptr >= ptrend) |
| 3736 | { |
| 3737 | errorcode = ERR6; /* Missing terminating ']' */ |
| 3738 | goto FAILED; |
| 3739 | } |
| 3740 | GETCHARINCTEST(c, ptr); |
| 3741 | if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; |
| 3742 | } /* End of class-processing loop */ |
| 3743 | |
| 3744 | /* -] at the end of a class is a literal '-' */ |
| 3745 | |
| 3746 | if (class_range_state == RANGE_STARTED) |
| 3747 | { |
| 3748 | parsed_pattern[-1] = CHAR_MINUS; |
| 3749 | class_range_state = RANGE_NO; |
| 3750 | } |
| 3751 | |
| 3752 | *parsed_pattern++ = META_CLASS_END; |
| 3753 | break; /* End of character class */ |
| 3754 | |
| 3755 | |
| 3756 | /* ---- Opening parenthesis ---- */ |
| 3757 | |
| 3758 | case CHAR_LEFT_PARENTHESIS: |
| 3759 | if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; |
| 3760 | |
| 3761 | /* If ( is not followed by ? it is either a capture or a special verb or an |
| 3762 | alpha assertion or a positive non-atomic lookahead. */ |
| 3763 | |
| 3764 | if (*ptr != CHAR_QUESTION_MARK) |
| 3765 | { |
| 3766 | const char *vn; |
| 3767 | |
| 3768 | /* Handle capturing brackets (or non-capturing if auto-capture is turned |
| 3769 | off). */ |
| 3770 | |
| 3771 | if (*ptr != CHAR_ASTERISK) |
| 3772 | { |
| 3773 | nest_depth++; |
| 3774 | if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) |
| 3775 | { |
| 3776 | if (cb->bracount >= MAX_GROUP_NUMBER) |
| 3777 | { |
| 3778 | errorcode = ERR97; |
| 3779 | goto FAILED; |
| 3780 | } |
| 3781 | cb->bracount++; |
| 3782 | *parsed_pattern++ = META_CAPTURE | cb->bracount; |
| 3783 | } |
| 3784 | else *parsed_pattern++ = META_NOCAPTURE; |
| 3785 | } |
| 3786 | |
| 3787 | /* Do nothing for (* followed by end of pattern or ) so it gives a "bad |
| 3788 | quantifier" error rather than "(*MARK) must have an argument". */ |
| 3789 | |
| 3790 | else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS) |
| 3791 | break; |
| 3792 | |
| 3793 | /* Handle "alpha assertions" such as (*pla:...). Most of these are |
| 3794 | synonyms for the historical symbolic assertions, but the script run and |
| 3795 | non-atomic lookaround ones are new. They are distinguished by starting |
| 3796 | with a lower case letter. Checking both ends of the alphabet makes this |
| 3797 | work in all character codes. */ |
| 3798 | |
| 3799 | else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0) |
| 3800 | { |
| 3801 | uint32_t meta; |
| 3802 | |
| 3803 | vn = alasnames; |
| 3804 | if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, |
| 3805 | &errorcode, cb)) goto FAILED; |
| 3806 | if (ptr >= ptrend || *ptr != CHAR_COLON) |
| 3807 | { |
| 3808 | errorcode = ERR95; /* Malformed */ |
| 3809 | goto FAILED; |
| 3810 | } |
| 3811 | |
| 3812 | /* Scan the table of alpha assertion names */ |
| 3813 | |
| 3814 | for (i = 0; i < alascount; i++) |
| 3815 | { |
| 3816 | if (namelen == alasmeta[i].len && |
| 3817 | PRIV(strncmp_c8)(name, vn, namelen) == 0) |
| 3818 | break; |
| 3819 | vn += alasmeta[i].len + 1; |
| 3820 | } |
| 3821 | |
| 3822 | if (i >= alascount) |
| 3823 | { |
| 3824 | errorcode = ERR95; /* Alpha assertion not recognized */ |
| 3825 | goto FAILED; |
| 3826 | } |
| 3827 | |
| 3828 | /* Check for expecting an assertion condition. If so, only atomic |
| 3829 | lookaround assertions are valid. */ |
| 3830 | |
| 3831 | meta = alasmeta[i].meta; |
| 3832 | if (prev_expect_cond_assert > 0 && |
| 3833 | (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT)) |
| 3834 | { |
| 3835 | errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)? |
| 3836 | ERR98 : ERR28; /* (Atomic) assertion expected */ |
| 3837 | goto FAILED; |
| 3838 | } |
| 3839 | |
| 3840 | /* The lookaround alphabetic synonyms can mostly be handled by jumping |
| 3841 | to the code that handles the traditional symbolic forms. */ |
| 3842 | |
| 3843 | switch(meta) |
| 3844 | { |
| 3845 | default: |
| 3846 | errorcode = ERR89; /* Unknown code; should never occur because */ |
| 3847 | goto FAILED; /* the meta values come from a table above. */ |
| 3848 | |
| 3849 | case META_ATOMIC: |
| 3850 | goto ATOMIC_GROUP; |
| 3851 | |
| 3852 | case META_LOOKAHEAD: |
| 3853 | goto POSITIVE_LOOK_AHEAD; |
| 3854 | |
| 3855 | case META_LOOKAHEAD_NA: |
| 3856 | goto POSITIVE_NONATOMIC_LOOK_AHEAD; |
| 3857 | |
| 3858 | case META_LOOKAHEADNOT: |
| 3859 | goto NEGATIVE_LOOK_AHEAD; |
| 3860 | |
| 3861 | case META_LOOKBEHIND: |
| 3862 | case META_LOOKBEHINDNOT: |
| 3863 | case META_LOOKBEHIND_NA: |
| 3864 | *parsed_pattern++ = meta; |
| 3865 | ptr--; |
| 3866 | goto POST_LOOKBEHIND; |
| 3867 | |
| 3868 | /* The script run facilities are handled here. Unicode support is |
| 3869 | required (give an error if not, as this is a security issue). Always |
| 3870 | record a META_SCRIPT_RUN item. Then, for the atomic version, insert |
| 3871 | META_ATOMIC and remember that we need two META_KETs at the end. */ |
| 3872 | |
| 3873 | case META_SCRIPT_RUN: |
| 3874 | case META_ATOMIC_SCRIPT_RUN: |
| 3875 | #ifdef SUPPORT_UNICODE |
| 3876 | *parsed_pattern++ = META_SCRIPT_RUN; |
| 3877 | nest_depth++; |
| 3878 | ptr++; |
| 3879 | if (meta == META_ATOMIC_SCRIPT_RUN) |
| 3880 | { |
| 3881 | *parsed_pattern++ = META_ATOMIC; |
| 3882 | if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); |
| 3883 | else if (++top_nest >= end_nests) |
| 3884 | { |
| 3885 | errorcode = ERR84; |
| 3886 | goto FAILED; |
| 3887 | } |
| 3888 | top_nest->nest_depth = nest_depth; |
| 3889 | top_nest->flags = NSF_ATOMICSR; |
| 3890 | top_nest->options = options & PARSE_TRACKED_OPTIONS; |
| 3891 | } |
| 3892 | break; |
| 3893 | #else /* SUPPORT_UNICODE */ |
| 3894 | errorcode = ERR96; |
| 3895 | goto FAILED; |
| 3896 | #endif |
| 3897 | } |
| 3898 | } |
| 3899 | |
| 3900 | |
| 3901 | /* ---- Handle (*VERB) and (*VERB:NAME) ---- */ |
| 3902 | |
| 3903 | else |
| 3904 | { |
| 3905 | vn = verbnames; |
| 3906 | if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, |
| 3907 | &errorcode, cb)) goto FAILED; |
| 3908 | if (ptr >= ptrend || (*ptr != CHAR_COLON && |
| 3909 | *ptr != CHAR_RIGHT_PARENTHESIS)) |
| 3910 | { |
| 3911 | errorcode = ERR60; /* Malformed */ |
| 3912 | goto FAILED; |
| 3913 | } |
| 3914 | |
| 3915 | /* Scan the table of verb names */ |
| 3916 | |
| 3917 | for (i = 0; i < verbcount; i++) |
| 3918 | { |
| 3919 | if (namelen == verbs[i].len && |
| 3920 | PRIV(strncmp_c8)(name, vn, namelen) == 0) |
| 3921 | break; |
| 3922 | vn += verbs[i].len + 1; |
| 3923 | } |
| 3924 | |
| 3925 | if (i >= verbcount) |
| 3926 | { |
| 3927 | errorcode = ERR60; /* Verb not recognized */ |
| 3928 | goto FAILED; |
| 3929 | } |
| 3930 | |
| 3931 | /* An empty argument is treated as no argument. */ |
| 3932 | |
| 3933 | if (*ptr == CHAR_COLON && ptr + 1 < ptrend && |
| 3934 | ptr[1] == CHAR_RIGHT_PARENTHESIS) |
| 3935 | ptr++; /* Advance to the closing parens */ |
| 3936 | |
| 3937 | /* Check for mandatory non-empty argument; this is (*MARK) */ |
| 3938 | |
| 3939 | if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON) |
| 3940 | { |
| 3941 | errorcode = ERR66; |
| 3942 | goto FAILED; |
| 3943 | } |
| 3944 | |
| 3945 | /* Remember where this verb, possibly with a preceding (*MARK), starts, |
| 3946 | for handling quantified (*ACCEPT). */ |
| 3947 | |
| 3948 | verbstartptr = parsed_pattern; |
| 3949 | okquantifier = (verbs[i].meta == META_ACCEPT); |
| 3950 | |
| 3951 | /* It appears that Perl allows any characters whatsoever, other than a |
| 3952 | closing parenthesis, to appear in arguments ("names"), so we no longer |
| 3953 | insist on letters, digits, and underscores. Perl does not, however, do |
| 3954 | any interpretation within arguments, and has no means of including a |
| 3955 | closing parenthesis. PCRE supports escape processing but only when it |
| 3956 | is requested by an option. We set inverbname TRUE here, and let the |
| 3957 | main loop take care of this so that escape and \x processing is done by |
| 3958 | the main code above. */ |
| 3959 | |
| 3960 | if (*ptr++ == CHAR_COLON) /* Skip past : or ) */ |
| 3961 | { |
| 3962 | /* Some optional arguments can be treated as a preceding (*MARK) */ |
| 3963 | |
| 3964 | if (verbs[i].has_arg < 0) |
| 3965 | { |
| 3966 | add_after_mark = verbs[i].meta; |
| 3967 | *parsed_pattern++ = META_MARK; |
| 3968 | } |
| 3969 | |
| 3970 | /* The remaining verbs with arguments (except *MARK) need a different |
| 3971 | opcode. */ |
| 3972 | |
| 3973 | else |
| 3974 | { |
| 3975 | *parsed_pattern++ = verbs[i].meta + |
| 3976 | ((verbs[i].meta != META_MARK)? 0x00010000u:0); |
| 3977 | } |
| 3978 | |
| 3979 | /* Set up for reading the name in the main loop. */ |
| 3980 | |
| 3981 | verblengthptr = parsed_pattern++; |
| 3982 | verbnamestart = ptr; |
| 3983 | inverbname = TRUE; |
| 3984 | } |
| 3985 | else /* No verb "name" argument */ |
| 3986 | { |
| 3987 | *parsed_pattern++ = verbs[i].meta; |
| 3988 | } |
| 3989 | } /* End of (*VERB) handling */ |
| 3990 | break; /* Done with this parenthesis */ |
| 3991 | } /* End of groups that don't start with (? */ |
| 3992 | |
| 3993 | |
| 3994 | /* ---- Items starting (? ---- */ |
| 3995 | |
| 3996 | /* The type of item is determined by what follows (?. Handle (?| and option |
| 3997 | changes under "default" because both need a new block on the nest stack. |
| 3998 | Comments starting with (?# are handled above. Note that there is some |
| 3999 | ambiguity about the sequence (?- because if a digit follows it's a relative |
| 4000 | recursion or subroutine call whereas otherwise it's an option unsetting. */ |
| 4001 | |
| 4002 | if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; |
| 4003 | |
| 4004 | switch(*ptr) |
| 4005 | { |
| 4006 | default: |
| 4007 | if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1])) |
| 4008 | goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */ |
| 4009 | |
| 4010 | /* We now have either (?| or a (possibly empty) option setting, |
| 4011 | optionally followed by a non-capturing group. */ |
| 4012 | |
| 4013 | nest_depth++; |
| 4014 | if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); |
| 4015 | else if (++top_nest >= end_nests) |
| 4016 | { |
| 4017 | errorcode = ERR84; |
| 4018 | goto FAILED; |
| 4019 | } |
| 4020 | top_nest->nest_depth = nest_depth; |
| 4021 | top_nest->flags = 0; |
| 4022 | top_nest->options = options & PARSE_TRACKED_OPTIONS; |
| 4023 | |
| 4024 | /* Start of non-capturing group that resets the capture count for each |
| 4025 | branch. */ |
| 4026 | |
| 4027 | if (*ptr == CHAR_VERTICAL_LINE) |
| 4028 | { |
| 4029 | top_nest->reset_group = (uint16_t)cb->bracount; |
| 4030 | top_nest->max_group = (uint16_t)cb->bracount; |
| 4031 | top_nest->flags |= NSF_RESET; |
| 4032 | cb->external_flags |= PCRE2_DUPCAPUSED; |
| 4033 | *parsed_pattern++ = META_NOCAPTURE; |
| 4034 | ptr++; |
| 4035 | } |
| 4036 | |
| 4037 | /* Scan for options imnsxJU to be set or unset. */ |
| 4038 | |
| 4039 | else |
| 4040 | { |
| 4041 | BOOL hyphenok = TRUE; |
| 4042 | uint32_t oldoptions = options; |
| 4043 | |
| 4044 | top_nest->reset_group = 0; |
| 4045 | top_nest->max_group = 0; |
| 4046 | set = unset = 0; |
| 4047 | optset = &set; |
| 4048 | |
| 4049 | /* ^ at the start unsets imnsx and disables the subsequent use of - */ |
| 4050 | |
| 4051 | if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT) |
| 4052 | { |
| 4053 | options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| |
| 4054 | PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE); |
| 4055 | hyphenok = FALSE; |
| 4056 | ptr++; |
| 4057 | } |
| 4058 | |
| 4059 | while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS && |
| 4060 | *ptr != CHAR_COLON) |
| 4061 | { |
| 4062 | switch (*ptr++) |
| 4063 | { |
| 4064 | case CHAR_MINUS: |
| 4065 | if (!hyphenok) |
| 4066 | { |
| 4067 | errorcode = ERR94; |
| 4068 | ptr--; /* Correct the offset */ |
| 4069 | goto FAILED; |
| 4070 | } |
| 4071 | optset = &unset; |
| 4072 | hyphenok = FALSE; |
| 4073 | break; |
| 4074 | |
| 4075 | case CHAR_J: /* Record that it changed in the external options */ |
| 4076 | *optset |= PCRE2_DUPNAMES; |
| 4077 | cb->external_flags |= PCRE2_JCHANGED; |
| 4078 | break; |
| 4079 | |
| 4080 | case CHAR_i: *optset |= PCRE2_CASELESS; break; |
| 4081 | case CHAR_m: *optset |= PCRE2_MULTILINE; break; |
| 4082 | case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break; |
| 4083 | case CHAR_s: *optset |= PCRE2_DOTALL; break; |
| 4084 | case CHAR_U: *optset |= PCRE2_UNGREEDY; break; |
| 4085 | |
| 4086 | /* If x appears twice it sets the extended extended option. */ |
| 4087 | |
| 4088 | case CHAR_x: |
| 4089 | *optset |= PCRE2_EXTENDED; |
| 4090 | if (ptr < ptrend && *ptr == CHAR_x) |
| 4091 | { |
| 4092 | *optset |= PCRE2_EXTENDED_MORE; |
| 4093 | ptr++; |
| 4094 | } |
| 4095 | break; |
| 4096 | |
| 4097 | default: |
| 4098 | errorcode = ERR11; |
| 4099 | ptr--; /* Correct the offset */ |
| 4100 | goto FAILED; |
| 4101 | } |
| 4102 | } |
| 4103 | |
| 4104 | /* If we are setting extended without extended-more, ensure that any |
| 4105 | existing extended-more gets unset. Also, unsetting extended must also |
| 4106 | unset extended-more. */ |
| 4107 | |
| 4108 | if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED || |
| 4109 | (unset & PCRE2_EXTENDED) != 0) |
| 4110 | unset |= PCRE2_EXTENDED_MORE; |
| 4111 | |
| 4112 | options = (options | set) & (~unset); |
| 4113 | |
| 4114 | /* If the options ended with ')' this is not the start of a nested |
| 4115 | group with option changes, so the options change at this level. |
| 4116 | In this case, if the previous level set up a nest block, discard the |
| 4117 | one we have just created. Otherwise adjust it for the previous level. |
| 4118 | If the options ended with ':' we are starting a non-capturing group, |
| 4119 | possibly with an options setting. */ |
| 4120 | |
| 4121 | if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; |
| 4122 | if (*ptr++ == CHAR_RIGHT_PARENTHESIS) |
| 4123 | { |
| 4124 | nest_depth--; /* This is not a nested group after all. */ |
| 4125 | if (top_nest > (nest_save *)(cb->start_workspace) && |
| 4126 | (top_nest-1)->nest_depth == nest_depth) top_nest--; |
| 4127 | else top_nest->nest_depth = nest_depth; |
| 4128 | } |
| 4129 | else *parsed_pattern++ = META_NOCAPTURE; |
| 4130 | |
| 4131 | /* If nothing changed, no need to record. */ |
| 4132 | |
| 4133 | if (options != oldoptions) |
| 4134 | { |
| 4135 | *parsed_pattern++ = META_OPTIONS; |
| 4136 | *parsed_pattern++ = options; |
| 4137 | } |
| 4138 | } /* End options processing */ |
| 4139 | break; /* End default case after (? */ |
| 4140 | |
| 4141 | |
| 4142 | /* ---- Python syntax support ---- */ |
| 4143 | |
| 4144 | case CHAR_P: |
| 4145 | if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; |
| 4146 | |
| 4147 | /* (?P<name> is the same as (?<name>, which defines a named group. */ |
| 4148 | |
| 4149 | if (*ptr == CHAR_LESS_THAN_SIGN) |
| 4150 | { |
| 4151 | terminator = CHAR_GREATER_THAN_SIGN; |
| 4152 | goto DEFINE_NAME; |
| 4153 | } |
| 4154 | |
| 4155 | /* (?P>name) is the same as (?&name), which is a recursion or subroutine |
| 4156 | call. */ |
| 4157 | |
| 4158 | if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME; |
| 4159 | |
| 4160 | /* (?P=name) is the same as \k<name>, a back reference by name. Anything |
| 4161 | else after (?P is an error. */ |
| 4162 | |
| 4163 | if (*ptr != CHAR_EQUALS_SIGN) |
| 4164 | { |
| 4165 | errorcode = ERR41; |
| 4166 | goto FAILED; |
| 4167 | } |
| 4168 | if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, |
| 4169 | &namelen, &errorcode, cb)) goto FAILED; |
| 4170 | *parsed_pattern++ = META_BACKREF_BYNAME; |
| 4171 | *parsed_pattern++ = namelen; |
| 4172 | PUTOFFSET(offset, parsed_pattern); |
| 4173 | okquantifier = TRUE; |
| 4174 | break; /* End of (?P processing */ |
| 4175 | |
| 4176 | |
| 4177 | /* ---- Recursion/subroutine calls by number ---- */ |
| 4178 | |
| 4179 | case CHAR_R: |
| 4180 | i = 0; /* (?R) == (?R0) */ |
| 4181 | ptr++; |
| 4182 | if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) |
| 4183 | { |
| 4184 | errorcode = ERR58; |
| 4185 | goto FAILED; |
| 4186 | } |
| 4187 | goto SET_RECURSION; |
| 4188 | |
| 4189 | /* An item starting (?- followed by a digit comes here via the "default" |
| 4190 | case because (?- followed by a non-digit is an options setting. */ |
| 4191 | |
| 4192 | case CHAR_PLUS: |
| 4193 | if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1])) |
| 4194 | { |
| 4195 | errorcode = ERR29; /* Missing number */ |
| 4196 | goto FAILED; |
| 4197 | } |
| 4198 | /* Fall through */ |
| 4199 | |
| 4200 | case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: |
| 4201 | case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: |
| 4202 | RECURSION_BYNUMBER: |
| 4203 | if (!read_number(&ptr, ptrend, |
| 4204 | (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */ |
| 4205 | MAX_GROUP_NUMBER, ERR61, |
| 4206 | &i, &errorcode)) goto FAILED; |
| 4207 | if (i < 0) /* NB (?0) is permitted */ |
| 4208 | { |
| 4209 | errorcode = ERR15; /* Unknown group */ |
| 4210 | goto FAILED_BACK; |
| 4211 | } |
| 4212 | if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) |
| 4213 | goto UNCLOSED_PARENTHESIS; |
| 4214 | |
| 4215 | SET_RECURSION: |
| 4216 | *parsed_pattern++ = META_RECURSE | (uint32_t)i; |
| 4217 | offset = (PCRE2_SIZE)(ptr - cb->start_pattern); |
| 4218 | ptr++; |
| 4219 | PUTOFFSET(offset, parsed_pattern); |
| 4220 | okquantifier = TRUE; |
| 4221 | break; /* End of recursive call by number handling */ |
| 4222 | |
| 4223 | |
| 4224 | /* ---- Recursion/subroutine calls by name ---- */ |
| 4225 | |
| 4226 | case CHAR_AMPERSAND: |
| 4227 | RECURSE_BY_NAME: |
| 4228 | if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, |
| 4229 | &namelen, &errorcode, cb)) goto FAILED; |
| 4230 | *parsed_pattern++ = META_RECURSE_BYNAME; |
| 4231 | *parsed_pattern++ = namelen; |
| 4232 | PUTOFFSET(offset, parsed_pattern); |
| 4233 | okquantifier = TRUE; |
| 4234 | break; |
| 4235 | |
| 4236 | /* ---- Callout with numerical or string argument ---- */ |
| 4237 | |
| 4238 | case CHAR_C: |
| 4239 | if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; |
| 4240 | |
| 4241 | /* If the previous item was a condition starting (?(? an assertion, |
| 4242 | optionally preceded by a callout, is expected. This is checked later on, |
| 4243 | during actual compilation. However we need to identify this kind of |
| 4244 | assertion in this pass because it must not be quantified. The value of |
| 4245 | expect_cond_assert is set to 2 after (?(? is processed. We decrement it |
| 4246 | for a callout - still leaving a positive value that identifies the |
| 4247 | assertion. Multiple callouts or any other items will make it zero or |
| 4248 | less, which doesn't matter because they will cause an error later. */ |
| 4249 | |
| 4250 | expect_cond_assert = prev_expect_cond_assert - 1; |
| 4251 | |
| 4252 | /* If previous_callout is not NULL, it means this follows a previous |
| 4253 | callout. If it was a manual callout, do nothing; this means its "length |
| 4254 | of next pattern item" field will remain zero. If it was an automatic |
| 4255 | callout, abolish it. */ |
| 4256 | |
| 4257 | if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 && |
| 4258 | previous_callout == parsed_pattern - 4 && |
| 4259 | parsed_pattern[-1] == 255) |
| 4260 | parsed_pattern = previous_callout; |
| 4261 | |
| 4262 | /* Save for updating next pattern item length, and skip one item before |
| 4263 | completing. */ |
| 4264 | |
| 4265 | previous_callout = parsed_pattern; |
| 4266 | after_manual_callout = 1; |
| 4267 | |
| 4268 | /* Handle a string argument; specific delimiter is required. */ |
| 4269 | |
| 4270 | if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr)) |
| 4271 | { |
| 4272 | PCRE2_SIZE calloutlength; |
| 4273 | PCRE2_SPTR startptr = ptr; |
| 4274 | |
| 4275 | delimiter = 0; |
| 4276 | for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) |
| 4277 | { |
| 4278 | if (*ptr == PRIV(callout_start_delims)[i]) |
| 4279 | { |
| 4280 | delimiter = PRIV(callout_end_delims)[i]; |
| 4281 | break; |
| 4282 | } |
| 4283 | } |
| 4284 | if (delimiter == 0) |
| 4285 | { |
| 4286 | errorcode = ERR82; |
| 4287 | goto FAILED; |
| 4288 | } |
| 4289 | |
| 4290 | *parsed_pattern = META_CALLOUT_STRING; |
| 4291 | parsed_pattern += 3; /* Skip pattern info */ |
| 4292 | |
| 4293 | for (;;) |
| 4294 | { |
| 4295 | if (++ptr >= ptrend) |
| 4296 | { |
| 4297 | errorcode = ERR81; |
| 4298 | ptr = startptr; /* To give a more useful message */ |
| 4299 | goto FAILED; |
| 4300 | } |
| 4301 | if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter)) |
| 4302 | break; |
| 4303 | } |
| 4304 | |
| 4305 | calloutlength = (PCRE2_SIZE)(ptr - startptr); |
| 4306 | if (calloutlength > UINT32_MAX) |
| 4307 | { |
| 4308 | errorcode = ERR72; |
| 4309 | goto FAILED; |
| 4310 | } |
| 4311 | *parsed_pattern++ = (uint32_t)calloutlength; |
| 4312 | offset = (PCRE2_SIZE)(startptr - cb->start_pattern); |
| 4313 | PUTOFFSET(offset, parsed_pattern); |
| 4314 | } |
| 4315 | |
| 4316 | /* Handle a callout with an optional numerical argument, which must be |
| 4317 | less than or equal to 255. A missing argument gives 0. */ |
| 4318 | |
| 4319 | else |
| 4320 | { |
| 4321 | int n = 0; |
| 4322 | *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */ |
| 4323 | parsed_pattern += 3; /* Skip pattern info */ |
| 4324 | while (ptr < ptrend && IS_DIGIT(*ptr)) |
| 4325 | { |
| 4326 | n = n * 10 + *ptr++ - CHAR_0; |
| 4327 | if (n > 255) |
| 4328 | { |
| 4329 | errorcode = ERR38; |
| 4330 | goto FAILED; |
| 4331 | } |
| 4332 | } |
| 4333 | *parsed_pattern++ = n; |
| 4334 | } |
| 4335 | |
| 4336 | /* Both formats must have a closing parenthesis */ |
| 4337 | |
| 4338 | if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) |
| 4339 | { |
| 4340 | errorcode = ERR39; |
| 4341 | goto FAILED; |
| 4342 | } |
| 4343 | ptr++; |
| 4344 | |
| 4345 | /* Remember the offset to the next item in the pattern, and set a default |
| 4346 | length. This should get updated after the next item is read. */ |
| 4347 | |
| 4348 | previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); |
| 4349 | previous_callout[2] = 0; |
| 4350 | break; /* End callout */ |
| 4351 | |
| 4352 | |
| 4353 | /* ---- Conditional group ---- */ |
| 4354 | |
| 4355 | /* A condition can be an assertion, a number (referring to a numbered |
| 4356 | group's having been set), a name (referring to a named group), or 'R', |
| 4357 | referring to overall recursion. R<digits> and R&name are also permitted |
| 4358 | for recursion state tests. Numbers may be preceded by + or - to specify a |
| 4359 | relative group number. |
| 4360 | |
| 4361 | There are several syntaxes for testing a named group: (?(name)) is used |
| 4362 | by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')). |
| 4363 | |
| 4364 | There are two unfortunate ambiguities. 'R' can be the recursive thing or |
| 4365 | the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be |
| 4366 | the Perl DEFINE feature or the Python named test. We look for a name |
| 4367 | first; if not found, we try the other case. |
| 4368 | |
| 4369 | For compatibility with auto-callouts, we allow a callout to be specified |
| 4370 | before a condition that is an assertion. */ |
| 4371 | |
| 4372 | case CHAR_LEFT_PARENTHESIS: |
| 4373 | if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; |
| 4374 | nest_depth++; |
| 4375 | |
| 4376 | /* If the next character is ? or * there must be an assertion next |
| 4377 | (optionally preceded by a callout). We do not check this here, but |
| 4378 | instead we set expect_cond_assert to 2. If this is still greater than |
| 4379 | zero (callouts decrement it) when the next assertion is read, it will be |
| 4380 | marked as a condition that must not be repeated. A value greater than |
| 4381 | zero also causes checking that an assertion (possibly with callout) |
| 4382 | follows. */ |
| 4383 | |
| 4384 | if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK) |
| 4385 | { |
| 4386 | *parsed_pattern++ = META_COND_ASSERT; |
| 4387 | ptr--; /* Pull pointer back to the opening parenthesis. */ |
| 4388 | expect_cond_assert = 2; |
| 4389 | break; /* End of conditional */ |
| 4390 | } |
| 4391 | |
| 4392 | /* Handle (?([+-]number)... */ |
| 4393 | |
| 4394 | if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, |
| 4395 | &errorcode)) |
| 4396 | { |
| 4397 | if (i <= 0) |
| 4398 | { |
| 4399 | errorcode = ERR15; |
| 4400 | goto FAILED; |
| 4401 | } |
| 4402 | *parsed_pattern++ = META_COND_NUMBER; |
| 4403 | offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); |
| 4404 | PUTOFFSET(offset, parsed_pattern); |
| 4405 | *parsed_pattern++ = i; |
| 4406 | } |
| 4407 | else if (errorcode != 0) goto FAILED; /* Number too big */ |
| 4408 | |
| 4409 | /* No number found. Handle the special case (?(VERSION[>]=n.m)... */ |
| 4410 | |
| 4411 | else if (ptrend - ptr >= 10 && |
| 4412 | PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && |
| 4413 | ptr[7] != CHAR_RIGHT_PARENTHESIS) |
| 4414 | { |
| 4415 | uint32_t ge = 0; |
| 4416 | int major = 0; |
| 4417 | int minor = 0; |
| 4418 | |
| 4419 | ptr += 7; |
| 4420 | if (*ptr == CHAR_GREATER_THAN_SIGN) |
| 4421 | { |
| 4422 | ge = 1; |
| 4423 | ptr++; |
| 4424 | } |
| 4425 | |
| 4426 | /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT |
| 4427 | references its argument twice. */ |
| 4428 | |
| 4429 | if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) |
| 4430 | goto BAD_VERSION_CONDITION; |
| 4431 | |
| 4432 | if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode)) |
| 4433 | goto FAILED; |
| 4434 | |
| 4435 | if (ptr >= ptrend) goto BAD_VERSION_CONDITION; |
| 4436 | if (*ptr == CHAR_DOT) |
| 4437 | { |
| 4438 | if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION; |
| 4439 | minor = (*ptr++ - CHAR_0) * 10; |
| 4440 | if (ptr >= ptrend) goto BAD_VERSION_CONDITION; |
| 4441 | if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0; |
| 4442 | if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) |
| 4443 | goto BAD_VERSION_CONDITION; |
| 4444 | } |
| 4445 | |
| 4446 | *parsed_pattern++ = META_COND_VERSION; |
| 4447 | *parsed_pattern++ = ge; |
| 4448 | *parsed_pattern++ = major; |
| 4449 | *parsed_pattern++ = minor; |
| 4450 | } |
| 4451 | |
| 4452 | /* All the remaining cases now require us to read a name. We cannot at |
| 4453 | this stage distinguish ambiguous cases such as (?(R12) which might be a |
| 4454 | recursion test by number or a name, because the named groups have not yet |
| 4455 | all been identified. Those cases are treated as names, but given a |
| 4456 | different META code. */ |
| 4457 | |
| 4458 | else |
| 4459 | { |
| 4460 | BOOL was_r_ampersand = FALSE; |
| 4461 | |
| 4462 | if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND) |
| 4463 | { |
| 4464 | terminator = CHAR_RIGHT_PARENTHESIS; |
| 4465 | was_r_ampersand = TRUE; |
| 4466 | ptr++; |
| 4467 | } |
| 4468 | else if (*ptr == CHAR_LESS_THAN_SIGN) |
| 4469 | terminator = CHAR_GREATER_THAN_SIGN; |
| 4470 | else if (*ptr == CHAR_APOSTROPHE) |
| 4471 | terminator = CHAR_APOSTROPHE; |
| 4472 | else |
| 4473 | { |
| 4474 | terminator = CHAR_RIGHT_PARENTHESIS; |
| 4475 | ptr--; /* Point to char before name */ |
| 4476 | } |
| 4477 | if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, |
| 4478 | &errorcode, cb)) goto FAILED; |
| 4479 | |
| 4480 | /* Handle (?(R&name) */ |
| 4481 | |
| 4482 | if (was_r_ampersand) |
| 4483 | { |
| 4484 | *parsed_pattern = META_COND_RNAME; |
| 4485 | ptr--; /* Back to closing parens */ |
| 4486 | } |
| 4487 | |
| 4488 | /* Handle (?(name). If the name is "DEFINE" we identify it with a |
| 4489 | special code. Likewise if the name consists of R followed only by |
| 4490 | digits. Otherwise, handle it like a quoted name. */ |
| 4491 | |
| 4492 | else if (terminator == CHAR_RIGHT_PARENTHESIS) |
| 4493 | { |
| 4494 | if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) |
| 4495 | *parsed_pattern = META_COND_DEFINE; |
| 4496 | else |
| 4497 | { |
| 4498 | for (i = 1; i < (int)namelen; i++) |
| 4499 | if (!IS_DIGIT(name[i])) break; |
| 4500 | *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)? |
| 4501 | META_COND_RNUMBER : META_COND_NAME; |
| 4502 | } |
| 4503 | ptr--; /* Back to closing parens */ |
| 4504 | } |
| 4505 | |
| 4506 | /* Handle (?('name') or (?(<name>) */ |
| 4507 | |
| 4508 | else *parsed_pattern = META_COND_NAME; |
| 4509 | |
| 4510 | /* All these cases except DEFINE end with the name length and offset; |
| 4511 | DEFINE just has an offset (for the "too many branches" error). */ |
| 4512 | |
| 4513 | if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen; |
| 4514 | PUTOFFSET(offset, parsed_pattern); |
| 4515 | } /* End cases that read a name */ |
| 4516 | |
| 4517 | /* Check the closing parenthesis of the condition */ |
| 4518 | |
| 4519 | if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) |
| 4520 | { |
| 4521 | errorcode = ERR24; |
| 4522 | goto FAILED; |
| 4523 | } |
| 4524 | ptr++; |
| 4525 | break; /* End of condition processing */ |
| 4526 | |
| 4527 | |
| 4528 | /* ---- Atomic group ---- */ |
| 4529 | |
| 4530 | case CHAR_GREATER_THAN_SIGN: |
| 4531 | ATOMIC_GROUP: /* Come from (*atomic: */ |
| 4532 | *parsed_pattern++ = META_ATOMIC; |
| 4533 | nest_depth++; |
| 4534 | ptr++; |
| 4535 | break; |
| 4536 | |
| 4537 | |
| 4538 | /* ---- Lookahead assertions ---- */ |
| 4539 | |
| 4540 | case CHAR_EQUALS_SIGN: |
| 4541 | POSITIVE_LOOK_AHEAD: /* Come from (*pla: */ |
| 4542 | *parsed_pattern++ = META_LOOKAHEAD; |
| 4543 | ptr++; |
| 4544 | goto POST_ASSERTION; |
| 4545 | |
| 4546 | case CHAR_ASTERISK: |
| 4547 | POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */ |
| 4548 | *parsed_pattern++ = META_LOOKAHEAD_NA; |
| 4549 | ptr++; |
| 4550 | goto POST_ASSERTION; |
| 4551 | |
| 4552 | case CHAR_EXCLAMATION_MARK: |
| 4553 | NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */ |
| 4554 | *parsed_pattern++ = META_LOOKAHEADNOT; |
| 4555 | ptr++; |
| 4556 | goto POST_ASSERTION; |
| 4557 | |
| 4558 | |
| 4559 | /* ---- Lookbehind assertions ---- */ |
| 4560 | |
| 4561 | /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?< |
| 4562 | is the start of the name of a capturing group. */ |
| 4563 | |
| 4564 | case CHAR_LESS_THAN_SIGN: |
| 4565 | if (ptrend - ptr <= 1 || |
| 4566 | (ptr[1] != CHAR_EQUALS_SIGN && |
| 4567 | ptr[1] != CHAR_EXCLAMATION_MARK && |
| 4568 | ptr[1] != CHAR_ASTERISK)) |
| 4569 | { |
| 4570 | terminator = CHAR_GREATER_THAN_SIGN; |
| 4571 | goto DEFINE_NAME; |
| 4572 | } |
| 4573 | *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? |
| 4574 | META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)? |
| 4575 | META_LOOKBEHINDNOT : META_LOOKBEHIND_NA; |
| 4576 | |
| 4577 | POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ |
| 4578 | *has_lookbehind = TRUE; |
| 4579 | offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); |
| 4580 | PUTOFFSET(offset, parsed_pattern); |
| 4581 | ptr += 2; |
| 4582 | /* Fall through */ |
| 4583 | |
| 4584 | /* If the previous item was a condition starting (?(? an assertion, |
| 4585 | optionally preceded by a callout, is expected. This is checked later on, |
| 4586 | during actual compilation. However we need to identify this kind of |
| 4587 | assertion in this pass because it must not be quantified. The value of |
| 4588 | expect_cond_assert is set to 2 after (?(? is processed. We decrement it |
| 4589 | for a callout - still leaving a positive value that identifies the |
| 4590 | assertion. Multiple callouts or any other items will make it zero or |
| 4591 | less, which doesn't matter because they will cause an error later. */ |
| 4592 | |
| 4593 | POST_ASSERTION: |
| 4594 | nest_depth++; |
| 4595 | if (prev_expect_cond_assert > 0) |
| 4596 | { |
| 4597 | if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); |
| 4598 | else if (++top_nest >= end_nests) |
| 4599 | { |
| 4600 | errorcode = ERR84; |
| 4601 | goto FAILED; |
| 4602 | } |
| 4603 | top_nest->nest_depth = nest_depth; |
| 4604 | top_nest->flags = NSF_CONDASSERT; |
| 4605 | top_nest->options = options & PARSE_TRACKED_OPTIONS; |
| 4606 | } |
| 4607 | break; |
| 4608 | |
| 4609 | |
| 4610 | /* ---- Define a named group ---- */ |
| 4611 | |
| 4612 | /* A named group may be defined as (?'name') or (?<name>). In the latter |
| 4613 | case we jump to DEFINE_NAME from the disambiguation of (?< above with the |
| 4614 | terminator set to '>'. */ |
| 4615 | |
| 4616 | case CHAR_APOSTROPHE: |
| 4617 | terminator = CHAR_APOSTROPHE; /* Terminator */ |
| 4618 | |
| 4619 | DEFINE_NAME: |
| 4620 | if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, |
| 4621 | &errorcode, cb)) goto FAILED; |
| 4622 | |
| 4623 | /* We have a name for this capturing group. It is also assigned a number, |
| 4624 | which is its primary means of identification. */ |
| 4625 | |
| 4626 | if (cb->bracount >= MAX_GROUP_NUMBER) |
| 4627 | { |
| 4628 | errorcode = ERR97; |
| 4629 | goto FAILED; |
| 4630 | } |
| 4631 | cb->bracount++; |
| 4632 | *parsed_pattern++ = META_CAPTURE | cb->bracount; |
| 4633 | nest_depth++; |
| 4634 | |
| 4635 | /* Check not too many names */ |
| 4636 | |
| 4637 | if (cb->names_found >= MAX_NAME_COUNT) |
| 4638 | { |
| 4639 | errorcode = ERR49; |
| 4640 | goto FAILED; |
| 4641 | } |
| 4642 | |
| 4643 | /* Adjust the entry size to accommodate the longest name found. */ |
| 4644 | |
| 4645 | if (namelen + IMM2_SIZE + 1 > cb->name_entry_size) |
| 4646 | cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1); |
| 4647 | |
| 4648 | /* Scan the list to check for duplicates. For duplicate names, if the |
| 4649 | number is the same, break the loop, which causes the name to be |
| 4650 | discarded; otherwise, if DUPNAMES is not set, give an error. |
| 4651 | If it is set, allow the name with a different number, but continue |
| 4652 | scanning in case this is a duplicate with the same number. For |
| 4653 | non-duplicate names, give an error if the number is duplicated. */ |
| 4654 | |
| 4655 | isdupname = FALSE; |
| 4656 | ng = cb->named_groups; |
| 4657 | for (i = 0; i < cb->names_found; i++, ng++) |
| 4658 | { |
| 4659 | if (namelen == ng->length && |
| 4660 | PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0) |
| 4661 | { |
| 4662 | if (ng->number == cb->bracount) break; |
| 4663 | if ((options & PCRE2_DUPNAMES) == 0) |
| 4664 | { |
| 4665 | errorcode = ERR43; |
| 4666 | goto FAILED; |
| 4667 | } |
| 4668 | isdupname = ng->isdup = TRUE; /* Mark as a duplicate */ |
| 4669 | cb->dupnames = TRUE; /* Duplicate names exist */ |
| 4670 | } |
| 4671 | else if (ng->number == cb->bracount) |
| 4672 | { |
| 4673 | errorcode = ERR65; |
| 4674 | goto FAILED; |
| 4675 | } |
| 4676 | } |
| 4677 | |
| 4678 | if (i < cb->names_found) break; /* Ignore duplicate with same number */ |
| 4679 | |
| 4680 | /* Increase the list size if necessary */ |
| 4681 | |
| 4682 | if (cb->names_found >= cb->named_group_list_size) |
| 4683 | { |
| 4684 | uint32_t newsize = cb->named_group_list_size * 2; |
| 4685 | named_group *newspace = |
| 4686 | cb->cx->memctl.malloc(newsize * sizeof(named_group), |
| 4687 | cb->cx->memctl.memory_data); |
| 4688 | if (newspace == NULL) |
| 4689 | { |
| 4690 | errorcode = ERR21; |
| 4691 | goto FAILED; |
| 4692 | } |
| 4693 | |
| 4694 | memcpy(newspace, cb->named_groups, |
| 4695 | cb->named_group_list_size * sizeof(named_group)); |
| 4696 | if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE) |
| 4697 | cb->cx->memctl.free((void *)cb->named_groups, |
| 4698 | cb->cx->memctl.memory_data); |
| 4699 | cb->named_groups = newspace; |
| 4700 | cb->named_group_list_size = newsize; |
| 4701 | } |
| 4702 | |
| 4703 | /* Add this name to the list */ |
| 4704 | |
| 4705 | cb->named_groups[cb->names_found].name = name; |
| 4706 | cb->named_groups[cb->names_found].length = (uint16_t)namelen; |
| 4707 | cb->named_groups[cb->names_found].number = cb->bracount; |
| 4708 | cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname; |
| 4709 | cb->names_found++; |
| 4710 | break; |
| 4711 | } /* End of (? switch */ |
| 4712 | break; /* End of ( handling */ |
| 4713 | |
| 4714 | |
| 4715 | /* ---- Branch terminators ---- */ |
| 4716 | |
| 4717 | /* Alternation: reset the capture count if we are in a (?| group. */ |
| 4718 | |
| 4719 | case CHAR_VERTICAL_LINE: |
| 4720 | if (top_nest != NULL && top_nest->nest_depth == nest_depth && |
| 4721 | (top_nest->flags & NSF_RESET) != 0) |
| 4722 | { |
| 4723 | if (cb->bracount > top_nest->max_group) |
| 4724 | top_nest->max_group = (uint16_t)cb->bracount; |
| 4725 | cb->bracount = top_nest->reset_group; |
| 4726 | } |
| 4727 | *parsed_pattern++ = META_ALT; |
| 4728 | break; |
| 4729 | |
| 4730 | /* End of group; reset the capture count to the maximum if we are in a (?| |
| 4731 | group and/or reset the options that are tracked during parsing. Disallow |
| 4732 | quantifier for a condition that is an assertion. */ |
| 4733 | |
| 4734 | case CHAR_RIGHT_PARENTHESIS: |
| 4735 | okquantifier = TRUE; |
| 4736 | if (top_nest != NULL && top_nest->nest_depth == nest_depth) |
| 4737 | { |
| 4738 | options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options; |
| 4739 | if ((top_nest->flags & NSF_RESET) != 0 && |
| 4740 | top_nest->max_group > cb->bracount) |
| 4741 | cb->bracount = top_nest->max_group; |
| 4742 | if ((top_nest->flags & NSF_CONDASSERT) != 0) |
| 4743 | okquantifier = FALSE; |
| 4744 | |
| 4745 | if ((top_nest->flags & NSF_ATOMICSR) != 0) |
| 4746 | { |
| 4747 | *parsed_pattern++ = META_KET; |
| 4748 | } |
| 4749 | |
| 4750 | if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; |
| 4751 | else top_nest--; |
| 4752 | } |
| 4753 | if (nest_depth == 0) /* Unmatched closing parenthesis */ |
| 4754 | { |
| 4755 | errorcode = ERR22; |
| 4756 | goto FAILED_BACK; |
| 4757 | } |
| 4758 | nest_depth--; |
| 4759 | *parsed_pattern++ = META_KET; |
| 4760 | break; |
| 4761 | } /* End of switch on pattern character */ |
| 4762 | } /* End of main character scan loop */ |
| 4763 | |
| 4764 | /* End of pattern reached. Check for missing ) at the end of a verb name. */ |
| 4765 | |
| 4766 | if (inverbname && ptr >= ptrend) |
| 4767 | { |
| 4768 | errorcode = ERR60; |
| 4769 | goto FAILED; |
| 4770 | } |
| 4771 | |
| 4772 | /* Manage callout for the final item */ |
| 4773 | |
| 4774 | PARSED_END: |
| 4775 | parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout, |
| 4776 | parsed_pattern, cb); |
| 4777 | |
| 4778 | /* Insert trailing items for word and line matching (features provided for the |
| 4779 | benefit of pcre2grep). */ |
| 4780 | |
| 4781 | if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) |
| 4782 | { |
| 4783 | *parsed_pattern++ = META_KET; |
| 4784 | *parsed_pattern++ = META_DOLLAR; |
| 4785 | } |
| 4786 | else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) |
| 4787 | { |
| 4788 | *parsed_pattern++ = META_KET; |
| 4789 | *parsed_pattern++ = META_ESCAPE + ESC_b; |
| 4790 | } |
| 4791 | |
| 4792 | /* Terminate the parsed pattern, then return success if all groups are closed. |
| 4793 | Otherwise we have unclosed parentheses. */ |
| 4794 | |
| 4795 | if (parsed_pattern >= parsed_pattern_end) |
| 4796 | { |
| 4797 | errorcode = ERR63; /* Internal error (parsed pattern overflow) */ |
| 4798 | goto FAILED; |
| 4799 | } |
| 4800 | |
| 4801 | *parsed_pattern = META_END; |
| 4802 | if (nest_depth == 0) return 0; |
| 4803 | |
| 4804 | UNCLOSED_PARENTHESIS: |
| 4805 | errorcode = ERR14; |
| 4806 | |
| 4807 | /* Come here for all failures. */ |
| 4808 | |
| 4809 | FAILED: |
| 4810 | cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern); |
| 4811 | return errorcode; |
| 4812 | |
| 4813 | /* Some errors need to indicate the previous character. */ |
| 4814 | |
| 4815 | FAILED_BACK: |
| 4816 | ptr--; |
| 4817 | goto FAILED; |
| 4818 | |
| 4819 | /* This failure happens several times. */ |
| 4820 | |
| 4821 | BAD_VERSION_CONDITION: |
| 4822 | errorcode = ERR79; |
| 4823 | goto FAILED; |
| 4824 | } |
| 4825 | |
| 4826 | |
| 4827 | |
| 4828 | /************************************************* |
| 4829 | * Find first significant opcode * |
| 4830 | *************************************************/ |
| 4831 | |
| 4832 | /* This is called by several functions that scan a compiled expression looking |
| 4833 | for a fixed first character, or an anchoring opcode etc. It skips over things |
| 4834 | that do not influence this. For some calls, it makes sense to skip negative |
| 4835 | forward and all backward assertions, and also the \b assertion; for others it |
| 4836 | does not. |
| 4837 | |
| 4838 | Arguments: |
| 4839 | code pointer to the start of the group |
| 4840 | skipassert TRUE if certain assertions are to be skipped |
| 4841 | |
| 4842 | Returns: pointer to the first significant opcode |
| 4843 | */ |
| 4844 | |
| 4845 | static const PCRE2_UCHAR* |
| 4846 | first_significant_code(PCRE2_SPTR code, BOOL skipassert) |
| 4847 | { |
| 4848 | for (;;) |
| 4849 | { |
| 4850 | switch ((int)*code) |
| 4851 | { |
| 4852 | case OP_ASSERT_NOT: |
| 4853 | case OP_ASSERTBACK: |
| 4854 | case OP_ASSERTBACK_NOT: |
| 4855 | case OP_ASSERTBACK_NA: |
| 4856 | if (!skipassert) return code; |
| 4857 | do code += GET(code, 1); while (*code == OP_ALT); |
| 4858 | code += PRIV(OP_lengths)[*code]; |
| 4859 | break; |
| 4860 | |
| 4861 | case OP_WORD_BOUNDARY: |
| 4862 | case OP_NOT_WORD_BOUNDARY: |
| 4863 | if (!skipassert) return code; |
| 4864 | /* Fall through */ |
| 4865 | |
| 4866 | case OP_CALLOUT: |
| 4867 | case OP_CREF: |
| 4868 | case OP_DNCREF: |
| 4869 | case OP_RREF: |
| 4870 | case OP_DNRREF: |
| 4871 | case OP_FALSE: |
| 4872 | case OP_TRUE: |
| 4873 | code += PRIV(OP_lengths)[*code]; |
| 4874 | break; |
| 4875 | |
| 4876 | case OP_CALLOUT_STR: |
| 4877 | code += GET(code, 1 + 2*LINK_SIZE); |
| 4878 | break; |
| 4879 | |
| 4880 | case OP_SKIPZERO: |
| 4881 | code += 2 + GET(code, 2) + LINK_SIZE; |
| 4882 | break; |
| 4883 | |
| 4884 | case OP_COND: |
| 4885 | case OP_SCOND: |
| 4886 | if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */ |
| 4887 | code[GET(code, 1)] != OP_KET) /* More than one branch */ |
| 4888 | return code; |
| 4889 | code += GET(code, 1) + 1 + LINK_SIZE; |
| 4890 | break; |
| 4891 | |
| 4892 | case OP_MARK: |
| 4893 | case OP_COMMIT_ARG: |
| 4894 | case OP_PRUNE_ARG: |
| 4895 | case OP_SKIP_ARG: |
| 4896 | case OP_THEN_ARG: |
| 4897 | code += code[1] + PRIV(OP_lengths)[*code]; |
| 4898 | break; |
| 4899 | |
| 4900 | default: |
| 4901 | return code; |
| 4902 | } |
| 4903 | } |
| 4904 | /* Control never reaches here */ |
| 4905 | } |
| 4906 | |
| 4907 | |
| 4908 | |
| 4909 | #ifdef SUPPORT_UNICODE |
/*************************************************
*       Get othercase range                      *
*************************************************/

/* Given the remaining part of a class range, this function looks upwards from
the starting character for the next run of characters whose "other" case
values are themselves consecutive, and reports that run of other case values.
Each call yields one run and moves the start value on, so the caller can loop
until nothing is left. A character that has a caseless set (more than one
other case) is reported on its own, using a distinct return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t other = 0;
uint32_t expected;

/* Look for the first character that either has a caseless set, in which case
it is returned immediately, or has an other case that differs from itself. */

while (ch <= d)
  {
  unsigned int caseset = UCD_CASESET(ch);

  if (caseset != 0)
    {
    *ocptr = ch;        /* Character that has the set */
    *cptr = ch + 1;     /* Rest of input range */
    return (int)caseset;
    }

  other = UCD_OTHERCASE(ch);
  if (other != ch) break;
  ch++;
  }

if (ch > d) return -1;  /* Reached end of range */

/* A character with a single other case has been found. The run continues for
as long as we stay inside the input range and each following character has no
caseless set and has the next consecutive other case value. */

*ocptr = other;
expected = other + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;  /* End of othercase range */
*cptr = ch;             /* Rest of input range */
return 0;
}
| 4971 | #endif /* SUPPORT_UNICODE */ |
| 4972 | |
| 4973 | |
| 4974 | |
| 4975 | /************************************************* |
| 4976 | * Add a character or range to a class (internal) * |
| 4977 | *************************************************/ |
| 4978 | |
| 4979 | /* This function packages up the logic of adding a character or range of |
| 4980 | characters to a class. The character values in the arguments will be within the |
| 4981 | valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is |
| 4982 | called only from within the "add to class" group of functions, some of which |
| 4983 | are recursive and mutually recursive. The external entry point is |
| 4984 | add_to_class(). |
| 4985 | |
| 4986 | Arguments: |
| 4987 | classbits the bit map for characters < 256 |
| 4988 | uchardptr points to the pointer for extra data |
| 4989 | options the options word |
| 4990 | cb compile data |
| 4991 | start start of range character |
| 4992 | end end of range character |
| 4993 | |
| 4994 | Returns: the number of < 256 characters added |
| 4995 | the pointer to extra data is updated |
| 4996 | */ |
| 4997 | |
| 4998 | static unsigned int |
| 4999 | add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, |
| 5000 | uint32_t options, compile_block *cb, uint32_t start, uint32_t end) |
| 5001 | { |
| 5002 | uint32_t c; |
| 5003 | uint32_t classbits_end = (end <= 0xff ? end : 0xff); |
| 5004 | unsigned int n8 = 0; |
| 5005 | |
| 5006 | /* If caseless matching is required, scan the range and process alternate |
| 5007 | cases. In Unicode, there are 8-bit characters that have alternate cases that |
| 5008 | are greater than 255 and vice-versa. Sometimes we can just extend the original |
| 5009 | range. */ |
| 5010 | |
| 5011 | if ((options & PCRE2_CASELESS) != 0) |
| 5012 | { |
| 5013 | #ifdef SUPPORT_UNICODE |
| 5014 | if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) |
| 5015 | { |
| 5016 | int rc; |
| 5017 | uint32_t oc, od; |
| 5018 | |
| 5019 | options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ |
| 5020 | c = start; |
| 5021 | |
| 5022 | while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) |
| 5023 | { |
| 5024 | /* Handle a single character that has more than one other case. */ |
| 5025 | |
| 5026 | if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb, |
| 5027 | PRIV(ucd_caseless_sets) + rc, oc); |
| 5028 | |
| 5029 | /* Do nothing if the other case range is within the original range. */ |
| 5030 | |
| 5031 | else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue; |
| 5032 | |
| 5033 | /* Extend the original range if there is overlap, noting that if oc < c, we |
| 5034 | can't have od > end because a subrange is always shorter than the basic |
| 5035 | range. Otherwise, use a recursive call to add the additional range. */ |
| 5036 | |
| 5037 | else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ |
| 5038 | else if (od > end && oc <= end + 1) |
| 5039 | { |
| 5040 | end = od; /* Extend upwards */ |
| 5041 | if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); |
| 5042 | } |
| 5043 | else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od); |
| 5044 | } |
| 5045 | } |
| 5046 | else |
| 5047 | #endif /* SUPPORT_UNICODE */ |
| 5048 | |
| 5049 | /* Not UTF mode */ |
| 5050 | |
| 5051 | for (c = start; c <= classbits_end; c++) |
| 5052 | { |
| 5053 | SETBIT(classbits, cb->fcc[c]); |
| 5054 | n8++; |
| 5055 | } |
| 5056 | } |
| 5057 | |
| 5058 | /* Now handle the originally supplied range. Adjust the final value according |
| 5059 | to the bit length - this means that the same lists of (e.g.) horizontal spaces |
| 5060 | can be used in all cases. */ |
| 5061 | |
| 5062 | if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) |
| 5063 | end = MAX_NON_UTF_CHAR; |
| 5064 | |
| 5065 | if (start > cb->class_range_start && end < cb->class_range_end) return n8; |
| 5066 | |
| 5067 | /* Use the bitmap for characters < 256. Otherwise use extra data.*/ |
| 5068 | |
| 5069 | for (c = start; c <= classbits_end; c++) |
| 5070 | { |
| 5071 | /* Regardless of start, c will always be <= 255. */ |
| 5072 | SETBIT(classbits, c); |
| 5073 | n8++; |
| 5074 | } |
| 5075 | |
| 5076 | #ifdef SUPPORT_WIDE_CHARS |
| 5077 | if (start <= 0xff) start = 0xff + 1; |
| 5078 | |
| 5079 | if (end >= start) |
| 5080 | { |
| 5081 | PCRE2_UCHAR *uchardata = *uchardptr; |
| 5082 | |
| 5083 | #ifdef SUPPORT_UNICODE |
| 5084 | if ((options & PCRE2_UTF) != 0) |
| 5085 | { |
| 5086 | if (start < end) |
| 5087 | { |
| 5088 | *uchardata++ = XCL_RANGE; |
| 5089 | uchardata += PRIV(ord2utf)(start, uchardata); |
| 5090 | uchardata += PRIV(ord2utf)(end, uchardata); |
| 5091 | } |
| 5092 | else if (start == end) |
| 5093 | { |
| 5094 | *uchardata++ = XCL_SINGLE; |
| 5095 | uchardata += PRIV(ord2utf)(start, uchardata); |
| 5096 | } |
| 5097 | } |
| 5098 | else |
| 5099 | #endif /* SUPPORT_UNICODE */ |
| 5100 | |
| 5101 | /* Without UTF support, character values are constrained by the bit length, |
| 5102 | and can only be > 256 for 16-bit and 32-bit libraries. */ |
| 5103 | |
| 5104 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 5105 | {} |
| 5106 | #else |
| 5107 | if (start < end) |
| 5108 | { |
| 5109 | *uchardata++ = XCL_RANGE; |
| 5110 | *uchardata++ = start; |
| 5111 | *uchardata++ = end; |
| 5112 | } |
| 5113 | else if (start == end) |
| 5114 | { |
| 5115 | *uchardata++ = XCL_SINGLE; |
| 5116 | *uchardata++ = start; |
| 5117 | } |
| 5118 | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
| 5119 | *uchardptr = uchardata; /* Updata extra data pointer */ |
| 5120 | } |
| 5121 | #else /* SUPPORT_WIDE_CHARS */ |
| 5122 | (void)uchardptr; /* Avoid compiler warning */ |
| 5123 | #endif /* SUPPORT_WIDE_CHARS */ |
| 5124 | |
| 5125 | return n8; /* Number of 8-bit characters */ |
| 5126 | } |
| 5127 | |
| 5128 | |
| 5129 | |
| 5130 | #ifdef SUPPORT_UNICODE |
| 5131 | /************************************************* |
| 5132 | * Add a list of characters to a class (internal) * |
| 5133 | *************************************************/ |
| 5134 | |
| 5135 | /* This function is used for adding a list of case-equivalent characters to a |
| 5136 | class when in UTF mode. This function is called only from within |
| 5137 | add_to_class_internal(), with which it is mutually recursive. |
| 5138 | |
| 5139 | Arguments: |
| 5140 | classbits the bit map for characters < 256 |
| 5141 | uchardptr points to the pointer for extra data |
| 5142 | options the options word |
| 5143 | cb contains pointers to tables etc. |
| 5144 | p points to row of 32-bit values, terminated by NOTACHAR |
| 5145 | except character to omit; this is used when adding lists of |
| 5146 | case-equivalent characters to avoid including the one we |
| 5147 | already know about |
| 5148 | |
| 5149 | Returns: the number of < 256 characters added |
| 5150 | the pointer to extra data is updated |
| 5151 | */ |
| 5152 | |
| 5153 | static unsigned int |
| 5154 | add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, |
| 5155 | uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except) |
| 5156 | { |
| 5157 | unsigned int n8 = 0; |
| 5158 | while (p[0] < NOTACHAR) |
| 5159 | { |
| 5160 | unsigned int n = 0; |
| 5161 | if (p[0] != except) |
| 5162 | { |
| 5163 | while(p[n+1] == p[0] + n + 1) n++; |
| 5164 | n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); |
| 5165 | } |
| 5166 | p += n + 1; |
| 5167 | } |
| 5168 | return n8; |
| 5169 | } |
| 5170 | #endif |
| 5171 | |
| 5172 | |
| 5173 | |
| 5174 | /************************************************* |
| 5175 | * External entry point for add range to class * |
| 5176 | *************************************************/ |
| 5177 | |
| 5178 | /* This function sets the overall range so that the internal functions can try |
| 5179 | to avoid duplication when handling case-independence. |
| 5180 | |
| 5181 | Arguments: |
| 5182 | classbits the bit map for characters < 256 |
| 5183 | uchardptr points to the pointer for extra data |
| 5184 | options the options word |
| 5185 | cb compile data |
| 5186 | start start of range character |
| 5187 | end end of range character |
| 5188 | |
| 5189 | Returns: the number of < 256 characters added |
| 5190 | the pointer to extra data is updated |
| 5191 | */ |
| 5192 | |
| 5193 | static unsigned int |
| 5194 | add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, |
| 5195 | compile_block *cb, uint32_t start, uint32_t end) |
| 5196 | { |
| 5197 | cb->class_range_start = start; |
| 5198 | cb->class_range_end = end; |
| 5199 | return add_to_class_internal(classbits, uchardptr, options, cb, start, end); |
| 5200 | } |
| 5201 | |
| 5202 | |
| 5203 | /************************************************* |
| 5204 | * External entry point for add list to class * |
| 5205 | *************************************************/ |
| 5206 | |
| 5207 | /* This function is used for adding a list of horizontal or vertical whitespace |
| 5208 | characters to a class. The list must be in order so that ranges of characters |
| 5209 | can be detected and handled appropriately. This function sets the overall range |
| 5210 | so that the internal functions can try to avoid duplication when handling |
| 5211 | case-independence. |
| 5212 | |
| 5213 | Arguments: |
| 5214 | classbits the bit map for characters < 256 |
| 5215 | uchardptr points to the pointer for extra data |
| 5216 | options the options word |
| 5217 | cb contains pointers to tables etc. |
| 5218 | p points to row of 32-bit values, terminated by NOTACHAR |
| 5219 | except character to omit; this is used when adding lists of |
| 5220 | case-equivalent characters to avoid including the one we |
| 5221 | already know about |
| 5222 | |
| 5223 | Returns: the number of < 256 characters added |
| 5224 | the pointer to extra data is updated |
| 5225 | */ |
| 5226 | |
| 5227 | static unsigned int |
| 5228 | add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, |
| 5229 | compile_block *cb, const uint32_t *p, unsigned int except) |
| 5230 | { |
| 5231 | unsigned int n8 = 0; |
| 5232 | while (p[0] < NOTACHAR) |
| 5233 | { |
| 5234 | unsigned int n = 0; |
| 5235 | if (p[0] != except) |
| 5236 | { |
| 5237 | while(p[n+1] == p[0] + n + 1) n++; |
| 5238 | cb->class_range_start = p[0]; |
| 5239 | cb->class_range_end = p[n]; |
| 5240 | n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); |
| 5241 | } |
| 5242 | p += n + 1; |
| 5243 | } |
| 5244 | return n8; |
| 5245 | } |
| 5246 | |
| 5247 | |
| 5248 | |
| 5249 | /************************************************* |
| 5250 | * Add characters not in a list to a class * |
| 5251 | *************************************************/ |
| 5252 | |
| 5253 | /* This function is used for adding the complement of a list of horizontal or |
| 5254 | vertical whitespace to a class. The list must be in order. |
| 5255 | |
| 5256 | Arguments: |
| 5257 | classbits the bit map for characters < 256 |
| 5258 | uchardptr points to the pointer for extra data |
| 5259 | options the options word |
| 5260 | cb contains pointers to tables etc. |
| 5261 | p points to row of 32-bit values, terminated by NOTACHAR |
| 5262 | |
| 5263 | Returns: the number of < 256 characters added |
| 5264 | the pointer to extra data is updated |
| 5265 | */ |
| 5266 | |
| 5267 | static unsigned int |
| 5268 | add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, |
| 5269 | uint32_t options, compile_block *cb, const uint32_t *p) |
| 5270 | { |
| 5271 | BOOL utf = (options & PCRE2_UTF) != 0; |
| 5272 | unsigned int n8 = 0; |
| 5273 | if (p[0] > 0) |
| 5274 | n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); |
| 5275 | while (p[0] < NOTACHAR) |
| 5276 | { |
| 5277 | while (p[1] == p[0] + 1) p++; |
| 5278 | n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, |
| 5279 | (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); |
| 5280 | p++; |
| 5281 | } |
| 5282 | return n8; |
| 5283 | } |
| 5284 | |
| 5285 | |
| 5286 | |
| 5287 | /************************************************* |
| 5288 | * Find details of duplicate group names * |
| 5289 | *************************************************/ |
| 5290 | |
| 5291 | /* This is called from compile_branch() when it needs to know the index and |
| 5292 | count of duplicates in the names table when processing named backreferences, |
| 5293 | either directly, or as conditions. |
| 5294 | |
| 5295 | Arguments: |
| 5296 | name points to the name |
| 5297 | length the length of the name |
| 5298 | indexptr where to put the index |
| 5299 | countptr where to put the count of duplicates |
| 5300 | errorcodeptr where to put an error code |
| 5301 | cb the compile block |
| 5302 | |
| 5303 | Returns: TRUE if OK, FALSE if not, error code set |
| 5304 | */ |
| 5305 | |
| 5306 | static BOOL |
| 5307 | find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr, |
| 5308 | int *countptr, int *errorcodeptr, compile_block *cb) |
| 5309 | { |
| 5310 | uint32_t i, groupnumber; |
| 5311 | int count; |
| 5312 | PCRE2_UCHAR *slot = cb->name_table; |
| 5313 | |
| 5314 | /* Find the first entry in the table */ |
| 5315 | |
| 5316 | for (i = 0; i < cb->names_found; i++) |
| 5317 | { |
| 5318 | if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 && |
| 5319 | slot[IMM2_SIZE+length] == 0) break; |
| 5320 | slot += cb->name_entry_size; |
| 5321 | } |
| 5322 | |
| 5323 | /* This should not occur, because this function is called only when we know we |
| 5324 | have duplicate names. Give an internal error. */ |
| 5325 | |
| 5326 | if (i >= cb->names_found) |
| 5327 | { |
| 5328 | *errorcodeptr = ERR53; |
| 5329 | cb->erroroffset = name - cb->start_pattern; |
| 5330 | return FALSE; |
| 5331 | } |
| 5332 | |
| 5333 | /* Record the index and then see how many duplicates there are, updating the |
| 5334 | backref map and maximum back reference as we do. */ |
| 5335 | |
| 5336 | *indexptr = i; |
| 5337 | count = 0; |
| 5338 | |
| 5339 | for (;;) |
| 5340 | { |
| 5341 | count++; |
| 5342 | groupnumber = GET2(slot,0); |
| 5343 | cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; |
| 5344 | if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; |
| 5345 | if (++i >= cb->names_found) break; |
| 5346 | slot += cb->name_entry_size; |
| 5347 | if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 || |
| 5348 | (slot+IMM2_SIZE)[length] != 0) break; |
| 5349 | } |
| 5350 | |
| 5351 | *countptr = count; |
| 5352 | return TRUE; |
| 5353 | } |
| 5354 | |
| 5355 | |
| 5356 | |
| 5357 | /************************************************* |
| 5358 | * Compile one branch * |
| 5359 | *************************************************/ |
| 5360 | |
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
the options are changed during the branch, the pointer is used to change the
external options bits. This function is used during the pre-compile phase when
we are trying to find out the amount of memory needed, as well as during the
real compile phase. The value of lengthptr distinguishes the two phases.

Arguments:
  optionsptr        pointer to the option bits
  codeptr           points to the pointer to the current code point
  pptrptr           points to the current parsed pattern pointer
  errorcodeptr      points to error code variable
  firstcuptr        place to put the first required code unit
  firstcuflagsptr   place to put the first code unit flags
  reqcuptr          place to put the last required code unit
  reqcuflagsptr     place to put the last required code unit flags
  bcptr             points to current branch chain
  cb                contains pointers to tables etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            0 There's been an error, *errorcodeptr is non-zero
                    +1 Success, this branch must match at least one character
                    -1 Success, this branch may match an empty string
*/
| 5385 | |
| 5386 | static int |
| 5387 | compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5388 | int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr, |
| 5389 | uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr, |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5390 | compile_block *cb, PCRE2_SIZE *lengthptr) |
| 5391 | { |
| 5392 | int bravalue = 0; |
| 5393 | int okreturn = -1; |
| 5394 | int group_return = 0; |
| 5395 | uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */ |
| 5396 | uint32_t greedy_default, greedy_non_default; |
| 5397 | uint32_t repeat_type, op_type; |
| 5398 | uint32_t options = *optionsptr; /* May change dynamically */ |
| 5399 | uint32_t firstcu, reqcu; |
| 5400 | uint32_t zeroreqcu, zerofirstcu; |
| 5401 | uint32_t escape; |
| 5402 | uint32_t *pptr = *pptrptr; |
| 5403 | uint32_t meta, meta_arg; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5404 | uint32_t firstcuflags, reqcuflags; |
| 5405 | uint32_t zeroreqcuflags, zerofirstcuflags; |
| 5406 | uint32_t req_caseopt, reqvary, tempreqvary; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5407 | PCRE2_SIZE offset = 0; |
| 5408 | PCRE2_SIZE length_prevgroup = 0; |
| 5409 | PCRE2_UCHAR *code = *codeptr; |
| 5410 | PCRE2_UCHAR *last_code = code; |
| 5411 | PCRE2_UCHAR *orig_code = code; |
| 5412 | PCRE2_UCHAR *tempcode; |
| 5413 | PCRE2_UCHAR *previous = NULL; |
| 5414 | PCRE2_UCHAR op_previous; |
| 5415 | BOOL groupsetfirstcu = FALSE; |
| 5416 | BOOL had_accept = FALSE; |
| 5417 | BOOL matched_char = FALSE; |
| 5418 | BOOL previous_matched_char = FALSE; |
| 5419 | BOOL reset_caseful = FALSE; |
| 5420 | const uint8_t *cbits = cb->cbits; |
| 5421 | uint8_t classbits[32]; |
| 5422 | |
| 5423 | /* We can fish out the UTF setting once and for all into a BOOL, but we must |
| 5424 | not do this for other options (e.g. PCRE2_EXTENDED) because they may change |
| 5425 | dynamically as we process the pattern. */ |
| 5426 | |
| 5427 | #ifdef SUPPORT_UNICODE |
| 5428 | BOOL utf = (options & PCRE2_UTF) != 0; |
| 5429 | BOOL ucp = (options & PCRE2_UCP) != 0; |
| 5430 | #else /* No Unicode support */ |
| 5431 | BOOL utf = FALSE; |
| 5432 | #endif |
| 5433 | |
| 5434 | /* Helper variables for OP_XCLASS opcode (for characters > 255). We define |
| 5435 | class_uchardata always so that it can be passed to add_to_class() always, |
| 5436 | though it will not be used in non-UTF 8-bit cases. This avoids having to supply |
| 5437 | alternative calls for the different cases. */ |
| 5438 | |
| 5439 | PCRE2_UCHAR *class_uchardata; |
| 5440 | #ifdef SUPPORT_WIDE_CHARS |
| 5441 | BOOL xclass; |
| 5442 | PCRE2_UCHAR *class_uchardata_base; |
| 5443 | #endif |
| 5444 | |
| 5445 | /* Set up the default and non-default settings for greediness */ |
| 5446 | |
| 5447 | greedy_default = ((options & PCRE2_UNGREEDY) != 0); |
| 5448 | greedy_non_default = greedy_default ^ 1; |
| 5449 | |
| 5450 | /* Initialize no first unit, no required unit. REQ_UNSET means "no char |
| 5451 | matching encountered yet". It gets changed to REQ_NONE if we hit something that |
| 5452 | matches a non-fixed first unit; reqcu just remains unset if we never find one. |
| 5453 | |
| 5454 | When we hit a repeat whose minimum is zero, we may have to adjust these values |
| 5455 | to take the zero repeat into account. This is implemented by setting them to |
| 5456 | zerofirstcu and zeroreqcu when such a repeat is encountered. The individual |
| 5457 | item types that can be repeated set these backoff variables appropriately. */ |
| 5458 | |
| 5459 | firstcu = reqcu = zerofirstcu = zeroreqcu = 0; |
| 5460 | firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET; |
| 5461 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5462 | /* The variable req_caseopt contains either the REQ_CASELESS bit or zero, |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5463 | according to the current setting of the caseless flag. The REQ_CASELESS value |
| 5464 | leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables |
| 5465 | to record the case status of the value. This is used only for ASCII characters. |
| 5466 | */ |
| 5467 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5468 | req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5469 | |
| 5470 | /* Switch on next META item until the end of the branch */ |
| 5471 | |
| 5472 | for (;; pptr++) |
| 5473 | { |
| 5474 | #ifdef SUPPORT_WIDE_CHARS |
| 5475 | BOOL xclass_has_prop; |
| 5476 | #endif |
| 5477 | BOOL negate_class; |
| 5478 | BOOL should_flip_negation; |
| 5479 | BOOL match_all_or_no_wide_chars; |
| 5480 | BOOL possessive_quantifier; |
| 5481 | BOOL note_group_empty; |
| 5482 | int class_has_8bitchar; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5483 | uint32_t mclength; |
| 5484 | uint32_t skipunits; |
| 5485 | uint32_t subreqcu, subfirstcu; |
| 5486 | uint32_t groupnumber; |
| 5487 | uint32_t verbarglen, verbculen; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5488 | uint32_t subreqcuflags, subfirstcuflags; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5489 | open_capitem *oc; |
| 5490 | PCRE2_UCHAR mcbuffer[8]; |
| 5491 | |
| 5492 | /* Get next META item in the pattern and its potential argument. */ |
| 5493 | |
| 5494 | meta = META_CODE(*pptr); |
| 5495 | meta_arg = META_DATA(*pptr); |
| 5496 | |
| 5497 | /* If we are in the pre-compile phase, accumulate the length used for the |
| 5498 | previous cycle of this loop, unless the next item is a quantifier. */ |
| 5499 | |
| 5500 | if (lengthptr != NULL) |
| 5501 | { |
| 5502 | if (code > cb->start_workspace + cb->workspace_size - |
| 5503 | WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ |
| 5504 | { |
| 5505 | *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)? |
| 5506 | ERR52 : ERR86; |
| 5507 | return 0; |
| 5508 | } |
| 5509 | |
| 5510 | /* There is at least one situation where code goes backwards: this is the |
| 5511 | case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier |
| 5512 | is processed, the whole class is eliminated. However, it is created first, |
| 5513 | so we have to allow memory for it. Therefore, don't ever reduce the length |
| 5514 | at this point. */ |
| 5515 | |
| 5516 | if (code < last_code) code = last_code; |
| 5517 | |
| 5518 | /* If the next thing is not a quantifier, we add the length of the previous |
| 5519 | item into the total, and reset the code pointer to the start of the |
| 5520 | workspace. Otherwise leave the previous item available to be quantified. */ |
| 5521 | |
| 5522 | if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) |
| 5523 | { |
| 5524 | if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code)) |
| 5525 | { |
| 5526 | *errorcodeptr = ERR20; /* Integer overflow */ |
| 5527 | return 0; |
| 5528 | } |
| 5529 | *lengthptr += (PCRE2_SIZE)(code - orig_code); |
| 5530 | if (*lengthptr > MAX_PATTERN_SIZE) |
| 5531 | { |
| 5532 | *errorcodeptr = ERR20; /* Pattern is too large */ |
| 5533 | return 0; |
| 5534 | } |
| 5535 | code = orig_code; |
| 5536 | } |
| 5537 | |
| 5538 | /* Remember where this code item starts so we can catch the "backwards" |
| 5539 | case above next time round. */ |
| 5540 | |
| 5541 | last_code = code; |
| 5542 | } |
| 5543 | |
| 5544 | /* Process the next parsed pattern item. If it is not a quantifier, remember |
| 5545 | where it starts so that it can be quantified when a quantifier follows. |
| 5546 | Checking for the legality of quantifiers happens in parse_regex(), except for |
| 5547 | a quantifier after an assertion that is a condition. */ |
| 5548 | |
| 5549 | if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) |
| 5550 | { |
| 5551 | previous = code; |
| 5552 | if (matched_char && !had_accept) okreturn = 1; |
| 5553 | } |
| 5554 | |
| 5555 | previous_matched_char = matched_char; |
| 5556 | matched_char = FALSE; |
| 5557 | note_group_empty = FALSE; |
| 5558 | skipunits = 0; /* Default value for most subgroups */ |
| 5559 | |
| 5560 | switch(meta) |
| 5561 | { |
| 5562 | /* ===================================================================*/ |
| 5563 | /* The branch terminates at pattern end or | or ) */ |
| 5564 | |
| 5565 | case META_END: |
| 5566 | case META_ALT: |
| 5567 | case META_KET: |
| 5568 | *firstcuptr = firstcu; |
| 5569 | *firstcuflagsptr = firstcuflags; |
| 5570 | *reqcuptr = reqcu; |
| 5571 | *reqcuflagsptr = reqcuflags; |
| 5572 | *codeptr = code; |
| 5573 | *pptrptr = pptr; |
| 5574 | return okreturn; |
| 5575 | |
| 5576 | |
| 5577 | /* ===================================================================*/ |
| 5578 | /* Handle single-character metacharacters. In multiline mode, ^ disables |
| 5579 | the setting of any following char as a first character. */ |
| 5580 | |
| 5581 | case META_CIRCUMFLEX: |
| 5582 | if ((options & PCRE2_MULTILINE) != 0) |
| 5583 | { |
| 5584 | if (firstcuflags == REQ_UNSET) |
| 5585 | zerofirstcuflags = firstcuflags = REQ_NONE; |
| 5586 | *code++ = OP_CIRCM; |
| 5587 | } |
| 5588 | else *code++ = OP_CIRC; |
| 5589 | break; |
| 5590 | |
| 5591 | case META_DOLLAR: |
| 5592 | *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL; |
| 5593 | break; |
| 5594 | |
| 5595 | /* There can never be a first char if '.' is first, whatever happens about |
| 5596 | repeats. The value of reqcu doesn't change either. */ |
| 5597 | |
| 5598 | case META_DOT: |
| 5599 | matched_char = TRUE; |
| 5600 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 5601 | zerofirstcu = firstcu; |
| 5602 | zerofirstcuflags = firstcuflags; |
| 5603 | zeroreqcu = reqcu; |
| 5604 | zeroreqcuflags = reqcuflags; |
| 5605 | *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; |
| 5606 | break; |
| 5607 | |
| 5608 | |
| 5609 | /* ===================================================================*/ |
| 5610 | /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. |
| 5611 | Otherwise, an initial ']' is taken as a data character. When empty classes |
| 5612 | are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must |
| 5613 | match any character, so generate OP_ALLANY. */ |
| 5614 | |
| 5615 | case META_CLASS_EMPTY: |
| 5616 | case META_CLASS_EMPTY_NOT: |
| 5617 | matched_char = TRUE; |
| 5618 | *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL; |
| 5619 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 5620 | zerofirstcu = firstcu; |
| 5621 | zerofirstcuflags = firstcuflags; |
| 5622 | break; |
| 5623 | |
| 5624 | |
| 5625 | /* ===================================================================*/ |
| 5626 | /* Non-empty character class. If the included characters are all < 256, we |
| 5627 | build a 32-byte bitmap of the permitted characters, except in the special |
| 5628 | case where there is only one such character. For negated classes, we build |
| 5629 | the map as usual, then invert it at the end. However, we use a different |
| 5630 | opcode so that data characters > 255 can be handled correctly. |
| 5631 | |
| 5632 | If the class contains characters outside the 0-255 range, a different |
| 5633 | opcode is compiled. It may optionally have a bit map for characters < 256, |
| 5634 | but those above are explicitly listed afterwards. A flag code unit |
| 5635 | tells whether the bitmap is present, and whether this is a negated class or |
| 5636 | not. */ |
| 5637 | |
| 5638 | case META_CLASS_NOT: |
| 5639 | case META_CLASS: |
| 5640 | matched_char = TRUE; |
| 5641 | negate_class = meta == META_CLASS_NOT; |
| 5642 | |
| 5643 | /* We can optimize the case of a single character in a class by generating |
| 5644 | OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's |
| 5645 | negative. In the negative case there can be no first char if this item is |
| 5646 | first, whatever repeat count may follow. In the case of reqcu, save the |
| 5647 | previous value for reinstating. */ |
| 5648 | |
| 5649 | /* NOTE: at present this optimization is not effective if the only |
| 5650 | character in a class in 32-bit, non-UCP mode has its top bit set. */ |
| 5651 | |
| 5652 | if (pptr[1] < META_END && pptr[2] == META_CLASS_END) |
| 5653 | { |
| 5654 | #ifdef SUPPORT_UNICODE |
| 5655 | uint32_t d; |
| 5656 | #endif |
| 5657 | uint32_t c = pptr[1]; |
| 5658 | |
| 5659 | pptr += 2; /* Move on to class end */ |
| 5660 | if (meta == META_CLASS) /* A positive one-char class can be */ |
| 5661 | { /* handled as a normal literal character. */ |
| 5662 | meta = c; /* Set up the character */ |
| 5663 | goto NORMAL_CHAR_SET; |
| 5664 | } |
| 5665 | |
| 5666 | /* Handle a negative one-character class */ |
| 5667 | |
| 5668 | zeroreqcu = reqcu; |
| 5669 | zeroreqcuflags = reqcuflags; |
| 5670 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 5671 | zerofirstcu = firstcu; |
| 5672 | zerofirstcuflags = firstcuflags; |
| 5673 | |
| 5674 | /* For caseless UTF or UCP mode, check whether this character has more |
| 5675 | than one other case. If so, generate a special OP_NOTPROP item instead of |
| 5676 | OP_NOTI. */ |
| 5677 | |
| 5678 | #ifdef SUPPORT_UNICODE |
| 5679 | if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 && |
| 5680 | (d = UCD_CASESET(c)) != 0) |
| 5681 | { |
| 5682 | *code++ = OP_NOTPROP; |
| 5683 | *code++ = PT_CLIST; |
| 5684 | *code++ = d; |
| 5685 | break; /* We are finished with this class */ |
| 5686 | } |
| 5687 | #endif |
| 5688 | /* Char has only one other case, or UCP not available */ |
| 5689 | |
| 5690 | *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT; |
| 5691 | code += PUTCHAR(c, code); |
| 5692 | break; /* We are finished with this class */ |
| 5693 | } /* End of 1-char optimization */ |
| 5694 | |
| 5695 | /* Handle character classes that contain more than just one literal |
| 5696 | character. If there are exactly two characters in a positive class, see if |
| 5697 | they are case partners. This can be optimized to generate a caseless single |
| 5698 | character match (which also sets first/required code units if relevant). */ |
| 5699 | |
| 5700 | if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END && |
| 5701 | pptr[3] == META_CLASS_END) |
| 5702 | { |
| 5703 | uint32_t c = pptr[1]; |
| 5704 | |
| 5705 | #ifdef SUPPORT_UNICODE |
| 5706 | if (UCD_CASESET(c) == 0) |
| 5707 | #endif |
| 5708 | { |
| 5709 | uint32_t d; |
| 5710 | |
| 5711 | #ifdef SUPPORT_UNICODE |
| 5712 | if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else |
| 5713 | #endif |
| 5714 | { |
| 5715 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
| 5716 | if (c > 255) d = c; else |
| 5717 | #endif |
| 5718 | d = TABLE_GET(c, cb->fcc, c); |
| 5719 | } |
| 5720 | |
| 5721 | if (c != d && pptr[2] == d) |
| 5722 | { |
| 5723 | pptr += 3; /* Move on to class end */ |
| 5724 | meta = c; |
| 5725 | if ((options & PCRE2_CASELESS) == 0) |
| 5726 | { |
| 5727 | reset_caseful = TRUE; |
| 5728 | options |= PCRE2_CASELESS; |
| 5729 | req_caseopt = REQ_CASELESS; |
| 5730 | } |
| 5731 | goto CLASS_CASELESS_CHAR; |
| 5732 | } |
| 5733 | } |
| 5734 | } |
| 5735 | |
| 5736 | /* If a non-extended class contains a negative special such as \S, we need |
| 5737 | to flip the negation flag at the end, so that support for characters > 255 |
| 5738 | works correctly (they are all included in the class). An extended class may |
| 5739 | need to insert specific matching or non-matching code for wide characters. |
| 5740 | */ |
| 5741 | |
| 5742 | should_flip_negation = match_all_or_no_wide_chars = FALSE; |
| 5743 | |
| 5744 | /* Extended class (xclass) will be used when characters > 255 |
| 5745 | might match. */ |
| 5746 | |
| 5747 | #ifdef SUPPORT_WIDE_CHARS |
| 5748 | xclass = FALSE; |
| 5749 | class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ |
| 5750 | class_uchardata_base = class_uchardata; /* Save the start */ |
| 5751 | #endif |
| 5752 | |
| 5753 | /* For optimization purposes, we track some properties of the class: |
| 5754 | class_has_8bitchar will be non-zero if the class contains at least one |
| 5755 | character with a code point less than 256; xclass_has_prop will be TRUE if |
| 5756 | Unicode property checks are present in the class. */ |
| 5757 | |
| 5758 | class_has_8bitchar = 0; |
| 5759 | #ifdef SUPPORT_WIDE_CHARS |
| 5760 | xclass_has_prop = FALSE; |
| 5761 | #endif |
| 5762 | |
| 5763 | /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map |
| 5764 | in a temporary bit of memory, in case the class contains fewer than two |
| 5765 | 8-bit characters because in that case the compiled code doesn't use the bit |
| 5766 | map. */ |
| 5767 | |
| 5768 | memset(classbits, 0, 32 * sizeof(uint8_t)); |
| 5769 | |
| 5770 | /* Process items until META_CLASS_END is reached. */ |
| 5771 | |
| 5772 | while ((meta = *(++pptr)) != META_CLASS_END) |
| 5773 | { |
| 5774 | /* Handle POSIX classes such as [:alpha:] etc. */ |
| 5775 | |
| 5776 | if (meta == META_POSIX || meta == META_POSIX_NEG) |
| 5777 | { |
| 5778 | BOOL local_negate = (meta == META_POSIX_NEG); |
| 5779 | int posix_class = *(++pptr); |
| 5780 | int taboffset, tabopt; |
| 5781 | uint8_t pbits[32]; |
| 5782 | |
| 5783 | should_flip_negation = local_negate; /* Note negative special */ |
| 5784 | |
| 5785 | /* If matching is caseless, upper and lower are converted to alpha. |
| 5786 | This relies on the fact that the class table starts with alpha, |
| 5787 | lower, upper as the first 3 entries. */ |
| 5788 | |
| 5789 | if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) |
| 5790 | posix_class = 0; |
| 5791 | |
| 5792 | /* When PCRE2_UCP is set, some of the POSIX classes are converted to |
| 5793 | different escape sequences that use Unicode properties \p or \P. |
| 5794 | Others that are not available via \p or \P have to generate |
| 5795 | XCL_PROP/XCL_NOTPROP directly, which is done here. */ |
| 5796 | |
| 5797 | #ifdef SUPPORT_UNICODE |
| 5798 | if ((options & PCRE2_UCP) != 0) switch(posix_class) |
| 5799 | { |
| 5800 | case PC_GRAPH: |
| 5801 | case PC_PRINT: |
| 5802 | case PC_PUNCT: |
| 5803 | *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; |
| 5804 | *class_uchardata++ = (PCRE2_UCHAR) |
| 5805 | ((posix_class == PC_GRAPH)? PT_PXGRAPH : |
| 5806 | (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); |
| 5807 | *class_uchardata++ = 0; |
| 5808 | xclass_has_prop = TRUE; |
| 5809 | goto CONTINUE_CLASS; |
| 5810 | |
| 5811 | /* For the other POSIX classes (ascii, xdigit) we are going to |
| 5812 | fall through to the non-UCP case and build a bit map for |
| 5813 | characters with code points less than 256. However, if we are in |
| 5814 | a negated POSIX class, characters with code points greater than |
| 5815 | 255 must either all match or all not match, depending on whether |
| 5816 | the whole class is not or is negated. For example, for |
| 5817 | [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]... |
| 5818 | they must not. |
| 5819 | |
| 5820 | In the special case where there are no xclass items, this is |
| 5821 | automatically handled by the use of OP_CLASS or OP_NCLASS, but an |
| 5822 | explicit range is needed for OP_XCLASS. Setting a flag here |
| 5823 | causes the range to be generated later when it is known that |
| 5824 | OP_XCLASS is required. In the 8-bit library this is relevant only in |
| 5825 | utf mode, since no wide characters can exist otherwise. */ |
| 5826 | |
| 5827 | default: |
| 5828 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 5829 | if (utf) |
| 5830 | #endif |
| 5831 | match_all_or_no_wide_chars |= local_negate; |
| 5832 | break; |
| 5833 | } |
| 5834 | #endif /* SUPPORT_UNICODE */ |
| 5835 | |
| 5836 | /* In the non-UCP case, or when UCP makes no difference, we build the |
| 5837 | bit map for the POSIX class in a chunk of local store because we may |
| 5838 | be adding and subtracting from it, and we don't want to subtract bits |
| 5839 | that may be in the main map already. At the end we or the result into |
| 5840 | the bit map that is being built. */ |
| 5841 | |
| 5842 | posix_class *= 3; |
| 5843 | |
| 5844 | /* Copy in the first table (always present) */ |
| 5845 | |
| 5846 | memcpy(pbits, cbits + posix_class_maps[posix_class], |
| 5847 | 32 * sizeof(uint8_t)); |
| 5848 | |
| 5849 | /* If there is a second table, add or remove it as required. */ |
| 5850 | |
| 5851 | taboffset = posix_class_maps[posix_class + 1]; |
| 5852 | tabopt = posix_class_maps[posix_class + 2]; |
| 5853 | |
| 5854 | if (taboffset >= 0) |
| 5855 | { |
| 5856 | if (tabopt >= 0) |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5857 | for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5858 | else |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5859 | for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5860 | } |
| 5861 | |
| 5862 | /* Now see if we need to remove any special characters. An option |
| 5863 | value of 1 removes vertical space and 2 removes underscore. */ |
| 5864 | |
| 5865 | if (tabopt < 0) tabopt = -tabopt; |
| 5866 | if (tabopt == 1) pbits[1] &= ~0x3c; |
| 5867 | else if (tabopt == 2) pbits[11] &= 0x7f; |
| 5868 | |
| 5869 | /* Add the POSIX table or its complement into the main table that is |
| 5870 | being built and we are done. */ |
| 5871 | |
| 5872 | if (local_negate) |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5873 | for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]); |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5874 | else |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5875 | for (int i = 0; i < 32; i++) classbits[i] |= pbits[i]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5876 | |
| 5877 | /* Every class contains at least one < 256 character. */ |
| 5878 | |
| 5879 | class_has_8bitchar = 1; |
| 5880 | goto CONTINUE_CLASS; /* End of POSIX handling */ |
| 5881 | } |
| 5882 | |
| 5883 | /* Other than POSIX classes, the only items we should encounter are |
| 5884 | \d-type escapes and literal characters (possibly as ranges). */ |
| 5885 | |
| 5886 | if (meta == META_BIGVALUE) |
| 5887 | { |
| 5888 | meta = *(++pptr); |
| 5889 | goto CLASS_LITERAL; |
| 5890 | } |
| 5891 | |
| 5892 | /* Any other non-literal must be an escape */ |
| 5893 | |
| 5894 | if (meta >= META_END) |
| 5895 | { |
| 5896 | if (META_CODE(meta) != META_ESCAPE) |
| 5897 | { |
| 5898 | #ifdef DEBUG_SHOW_PARSED |
| 5899 | fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x " |
| 5900 | "in character class\n", meta); |
| 5901 | #endif |
| 5902 | *errorcodeptr = ERR89; /* Internal error - unrecognized. */ |
| 5903 | return 0; |
| 5904 | } |
| 5905 | escape = META_DATA(meta); |
| 5906 | |
| 5907 | /* Every class contains at least one < 256 character. */ |
| 5908 | |
| 5909 | class_has_8bitchar++; |
| 5910 | |
| 5911 | switch(escape) |
| 5912 | { |
| 5913 | case ESC_d: |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5914 | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5915 | break; |
| 5916 | |
| 5917 | case ESC_D: |
| 5918 | should_flip_negation = TRUE; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5919 | for (int i = 0; i < 32; i++) |
| 5920 | classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]); |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5921 | break; |
| 5922 | |
| 5923 | case ESC_w: |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5924 | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5925 | break; |
| 5926 | |
| 5927 | case ESC_W: |
| 5928 | should_flip_negation = TRUE; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5929 | for (int i = 0; i < 32; i++) |
| 5930 | classbits[i] |= (uint8_t)(~cbits[i+cbit_word]); |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5931 | break; |
| 5932 | |
| 5933 | /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl |
| 5934 | 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was |
| 5935 | previously set by something earlier in the character class. |
| 5936 | Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so |
| 5937 | we could just adjust the appropriate bit. From PCRE 8.34 we no |
| 5938 | longer treat \s and \S specially. */ |
| 5939 | |
| 5940 | case ESC_s: |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5941 | for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5942 | break; |
| 5943 | |
| 5944 | case ESC_S: |
| 5945 | should_flip_negation = TRUE; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 5946 | for (int i = 0; i < 32; i++) |
| 5947 | classbits[i] |= (uint8_t)(~cbits[i+cbit_space]); |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 5948 | break; |
| 5949 | |
| 5950 | /* When adding the horizontal or vertical space lists to a class, or |
| 5951 | their complements, disable PCRE2_CASELESS, because it just wastes |
| 5952 | time, and in the "not-x" UTF cases can create unwanted duplicates in |
| 5953 | the XCLASS list (provoked by characters that have more than one other |
| 5954 | case and by both cases being in the same "not-x" sublist). */ |
| 5955 | |
| 5956 | case ESC_h: |
| 5957 | (void)add_list_to_class(classbits, &class_uchardata, |
| 5958 | options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR); |
| 5959 | break; |
| 5960 | |
| 5961 | case ESC_H: |
| 5962 | (void)add_not_list_to_class(classbits, &class_uchardata, |
| 5963 | options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); |
| 5964 | break; |
| 5965 | |
| 5966 | case ESC_v: |
| 5967 | (void)add_list_to_class(classbits, &class_uchardata, |
| 5968 | options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR); |
| 5969 | break; |
| 5970 | |
| 5971 | case ESC_V: |
| 5972 | (void)add_not_list_to_class(classbits, &class_uchardata, |
| 5973 | options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); |
| 5974 | break; |
| 5975 | |
| 5976 | /* If Unicode is not supported, \P and \p are not allowed and are |
| 5977 | faulted at parse time, so will never appear here. */ |
| 5978 | |
| 5979 | #ifdef SUPPORT_UNICODE |
| 5980 | case ESC_p: |
| 5981 | case ESC_P: |
| 5982 | { |
| 5983 | uint32_t ptype = *(++pptr) >> 16; |
| 5984 | uint32_t pdata = *pptr & 0xffff; |
| 5985 | *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; |
| 5986 | *class_uchardata++ = ptype; |
| 5987 | *class_uchardata++ = pdata; |
| 5988 | xclass_has_prop = TRUE; |
| 5989 | class_has_8bitchar--; /* Undo! */ |
| 5990 | } |
| 5991 | break; |
| 5992 | #endif |
| 5993 | } |
| 5994 | |
| 5995 | goto CONTINUE_CLASS; |
| 5996 | } /* End handling \d-type escapes */ |
| 5997 | |
| 5998 | /* A literal character may be followed by a range meta. At parse time |
| 5999 | there are checks for out-of-order characters, for ranges where the two |
| 6000 | characters are equal, and for hyphens that cannot indicate a range. At |
| 6001 | this point, therefore, no checking is needed. */ |
| 6002 | |
| 6003 | else |
| 6004 | { |
| 6005 | uint32_t c, d; |
| 6006 | |
| 6007 | CLASS_LITERAL: |
| 6008 | c = d = meta; |
| 6009 | |
| 6010 | /* Remember if \r or \n were explicitly used */ |
| 6011 | |
| 6012 | if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; |
| 6013 | |
| 6014 | /* Process a character range */ |
| 6015 | |
| 6016 | if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED) |
| 6017 | { |
| 6018 | #ifdef EBCDIC |
| 6019 | BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL); |
| 6020 | #endif |
| 6021 | pptr += 2; |
| 6022 | d = *pptr; |
| 6023 | if (d == META_BIGVALUE) d = *(++pptr); |
| 6024 | |
| 6025 | /* Remember an explicit \r or \n, and add the range to the class. */ |
| 6026 | |
| 6027 | if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; |
| 6028 | |
| 6029 | /* In an EBCDIC environment, Perl treats alphabetic ranges specially |
| 6030 | because there are holes in the encoding, and simply using the range |
| 6031 | A-Z (for example) would include the characters in the holes. This |
| 6032 | applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ |
| 6033 | |
| 6034 | #ifdef EBCDIC |
| 6035 | if (range_is_literal && |
| 6036 | (cb->ctypes[c] & ctype_letter) != 0 && |
| 6037 | (cb->ctypes[d] & ctype_letter) != 0 && |
| 6038 | (c <= CHAR_z) == (d <= CHAR_z)) |
| 6039 | { |
| 6040 | uint32_t uc = (d <= CHAR_z)? 0 : 64; |
| 6041 | uint32_t C = c - uc; |
| 6042 | uint32_t D = d - uc; |
| 6043 | |
| 6044 | if (C <= CHAR_i) |
| 6045 | { |
| 6046 | class_has_8bitchar += |
| 6047 | add_to_class(classbits, &class_uchardata, options, cb, C + uc, |
| 6048 | ((D < CHAR_i)? D : CHAR_i) + uc); |
| 6049 | C = CHAR_j; |
| 6050 | } |
| 6051 | |
| 6052 | if (C <= D && C <= CHAR_r) |
| 6053 | { |
| 6054 | class_has_8bitchar += |
| 6055 | add_to_class(classbits, &class_uchardata, options, cb, C + uc, |
| 6056 | ((D < CHAR_r)? D : CHAR_r) + uc); |
| 6057 | C = CHAR_s; |
| 6058 | } |
| 6059 | |
| 6060 | if (C <= D) |
| 6061 | { |
| 6062 | class_has_8bitchar += |
| 6063 | add_to_class(classbits, &class_uchardata, options, cb, C + uc, |
| 6064 | D + uc); |
| 6065 | } |
| 6066 | } |
| 6067 | else |
| 6068 | #endif |
| 6069 | /* Not an EBCDIC special range */ |
| 6070 | |
| 6071 | class_has_8bitchar += |
| 6072 | add_to_class(classbits, &class_uchardata, options, cb, c, d); |
| 6073 | goto CONTINUE_CLASS; /* Go get the next char in the class */ |
| 6074 | } /* End of range handling */ |
| 6075 | |
| 6076 | |
| 6077 | /* Handle a single character. */ |
| 6078 | |
| 6079 | class_has_8bitchar += |
| 6080 | add_to_class(classbits, &class_uchardata, options, cb, meta, meta); |
| 6081 | } |
| 6082 | |
| 6083 | /* Continue to the next item in the class. */ |
| 6084 | |
| 6085 | CONTINUE_CLASS: |
| 6086 | |
| 6087 | #ifdef SUPPORT_WIDE_CHARS |
| 6088 | /* If any wide characters or Unicode properties have been encountered, |
| 6089 | set xclass = TRUE. Then, in the pre-compile phase, accumulate the length |
| 6090 | of the extra data and reset the pointer. This is so that very large |
| 6091 | classes that contain a zillion wide characters or Unicode property tests |
| 6092 | do not overwrite the workspace (which is on the stack). */ |
| 6093 | |
| 6094 | if (class_uchardata > class_uchardata_base) |
| 6095 | { |
| 6096 | xclass = TRUE; |
| 6097 | if (lengthptr != NULL) |
| 6098 | { |
| 6099 | *lengthptr += class_uchardata - class_uchardata_base; |
| 6100 | class_uchardata = class_uchardata_base; |
| 6101 | } |
| 6102 | } |
| 6103 | #endif |
| 6104 | |
| 6105 | continue; /* Needed to avoid error when not supporting wide chars */ |
| 6106 | } /* End of main class-processing loop */ |
| 6107 | |
| 6108 | /* If this class is the first thing in the branch, there can be no first |
| 6109 | char setting, whatever the repeat count. Any reqcu setting must remain |
| 6110 | unchanged after any kind of repeat. */ |
| 6111 | |
| 6112 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 6113 | zerofirstcu = firstcu; |
| 6114 | zerofirstcuflags = firstcuflags; |
| 6115 | zeroreqcu = reqcu; |
| 6116 | zeroreqcuflags = reqcuflags; |
| 6117 | |
| 6118 | /* If there are characters with values > 255, or Unicode property settings |
| 6119 | (\p or \P), we have to compile an extended class, with its own opcode, |
| 6120 | unless there were no property settings and there was a negated special such |
| 6121 | as \S in the class, and PCRE2_UCP is not set, because in that case all |
| 6122 | characters > 255 are in or not in the class, so any that were explicitly |
| 6123 | given as well can be ignored. |
| 6124 | |
| 6125 | In the UCP case, if certain negated POSIX classes ([:^ascii:] or |
| 6126 | [:^xdigit:]) were present in a class, we either have to match or not match |
| 6127 | all wide characters (depending on whether the whole class is or is not |
| 6128 | negated). This requirement is indicated by match_all_or_no_wide_chars being |
| 6129 | true. We do this by including an explicit range, which works in both cases. |
| 6130 | This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there |
| 6131 | cannot be any wide characters in 8-bit non-UTF mode. |
| 6132 | |
| 6133 | When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit |
| 6134 | class where \S etc is present without PCRE2_UCP, causing an extended class |
| 6135 | to be compiled, we make sure that all characters > 255 are included by |
| 6136 | forcing match_all_or_no_wide_chars to be true. |
| 6137 | |
| 6138 | If, when generating an xclass, there are no characters < 256, we can omit |
| 6139 | the bitmap in the actual compiled code. */ |
| 6140 | |
| 6141 | #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ |
| 6142 | if (xclass && ( |
| 6143 | #ifdef SUPPORT_UNICODE |
| 6144 | (options & PCRE2_UCP) != 0 || |
| 6145 | #endif |
| 6146 | xclass_has_prop || !should_flip_negation)) |
| 6147 | { |
| 6148 | if (match_all_or_no_wide_chars || ( |
| 6149 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 6150 | utf && |
| 6151 | #endif |
| 6152 | should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0)) |
| 6153 | { |
| 6154 | *class_uchardata++ = XCL_RANGE; |
| 6155 | if (utf) /* Will always be utf in the 8-bit library */ |
| 6156 | { |
| 6157 | class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); |
| 6158 | class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); |
| 6159 | } |
| 6160 | else /* Can only happen for the 16-bit & 32-bit libraries */ |
| 6161 | { |
| 6162 | #if PCRE2_CODE_UNIT_WIDTH == 16 |
| 6163 | *class_uchardata++ = 0x100; |
| 6164 | *class_uchardata++ = 0xffffu; |
| 6165 | #elif PCRE2_CODE_UNIT_WIDTH == 32 |
| 6166 | *class_uchardata++ = 0x100; |
| 6167 | *class_uchardata++ = 0xffffffffu; |
| 6168 | #endif |
| 6169 | } |
| 6170 | } |
| 6171 | *class_uchardata++ = XCL_END; /* Marks the end of extra data */ |
| 6172 | *code++ = OP_XCLASS; |
| 6173 | code += LINK_SIZE; |
| 6174 | *code = negate_class? XCL_NOT:0; |
| 6175 | if (xclass_has_prop) *code |= XCL_HASPROP; |
| 6176 | |
| 6177 | /* If the map is required, move up the extra data to make room for it; |
| 6178 | otherwise just move the code pointer to the end of the extra data. */ |
| 6179 | |
| 6180 | if (class_has_8bitchar > 0) |
| 6181 | { |
| 6182 | *code++ |= XCL_MAP; |
| 6183 | (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, |
| 6184 | CU2BYTES(class_uchardata - code)); |
| 6185 | if (negate_class && !xclass_has_prop) |
| 6186 | { |
| 6187 | /* Using 255 ^ instead of ~ avoids clang sanitize warning. */ |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6188 | for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6189 | } |
| 6190 | memcpy(code, classbits, 32); |
| 6191 | code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); |
| 6192 | } |
| 6193 | else code = class_uchardata; |
| 6194 | |
| 6195 | /* Now fill in the complete length of the item */ |
| 6196 | |
| 6197 | PUT(previous, 1, (int)(code - previous)); |
| 6198 | break; /* End of class handling */ |
| 6199 | } |
| 6200 | #endif /* SUPPORT_WIDE_CHARS */ |
| 6201 | |
| 6202 | /* If there are no characters > 255, or they are all to be included or |
| 6203 | excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the |
| 6204 | whole class was negated and whether there were negative specials such as \S |
| 6205 | (non-UCP) in the class. Then copy the 32-byte map into the code vector, |
| 6206 | negating it if necessary. */ |
| 6207 | |
| 6208 | *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; |
| 6209 | if (lengthptr == NULL) /* Save time in the pre-compile phase */ |
| 6210 | { |
| 6211 | if (negate_class) |
| 6212 | { |
| 6213 | /* Using 255 ^ instead of ~ avoids clang sanitize warning. */ |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6214 | for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i]; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6215 | } |
| 6216 | memcpy(code, classbits, 32); |
| 6217 | } |
| 6218 | code += 32 / sizeof(PCRE2_UCHAR); |
| 6219 | break; /* End of class processing */ |
| 6220 | |
| 6221 | |
| 6222 | /* ===================================================================*/ |
| 6223 | /* Deal with (*VERB)s. */ |
| 6224 | |
| 6225 | /* Check for open captures before ACCEPT and close those that are within |
| 6226 | the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an |
| 6227 | assertion. In the first pass, just accumulate the length required; |
| 6228 | otherwise hitting (*ACCEPT) inside many nested parentheses can cause |
| 6229 | workspace overflow. Do not set firstcu after *ACCEPT. */ |
| 6230 | |
| 6231 | case META_ACCEPT: |
| 6232 | cb->had_accept = had_accept = TRUE; |
| 6233 | for (oc = cb->open_caps; |
| 6234 | oc != NULL && oc->assert_depth >= cb->assert_depth; |
| 6235 | oc = oc->next) |
| 6236 | { |
| 6237 | if (lengthptr != NULL) |
| 6238 | { |
| 6239 | *lengthptr += CU2BYTES(1) + IMM2_SIZE; |
| 6240 | } |
| 6241 | else |
| 6242 | { |
| 6243 | *code++ = OP_CLOSE; |
| 6244 | PUT2INC(code, 0, oc->number); |
| 6245 | } |
| 6246 | } |
| 6247 | *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; |
| 6248 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 6249 | break; |
| 6250 | |
| 6251 | case META_PRUNE: |
| 6252 | case META_SKIP: |
| 6253 | cb->had_pruneorskip = TRUE; |
| 6254 | /* Fall through */ |
| 6255 | case META_COMMIT: |
| 6256 | case META_FAIL: |
| 6257 | *code++ = verbops[(meta - META_MARK) >> 16]; |
| 6258 | break; |
| 6259 | |
| 6260 | case META_THEN: |
| 6261 | cb->external_flags |= PCRE2_HASTHEN; |
| 6262 | *code++ = OP_THEN; |
| 6263 | break; |
| 6264 | |
| 6265 | /* Handle verbs with arguments. Arguments can be very long, especially in |
| 6266 | 16- and 32-bit modes, and can overflow the workspace in the first pass. |
| 6267 | However, the argument length is constrained to be small enough to fit in |
| 6268 | one code unit. This check happens in parse_regex(). In the first pass, |
| 6269 | instead of putting the argument into memory, we just update the length |
| 6270 | counter and set up an empty argument. */ |
| 6271 | |
| 6272 | case META_THEN_ARG: |
| 6273 | cb->external_flags |= PCRE2_HASTHEN; |
| 6274 | goto VERB_ARG; |
| 6275 | |
| 6276 | case META_PRUNE_ARG: |
| 6277 | case META_SKIP_ARG: |
| 6278 | cb->had_pruneorskip = TRUE; |
| 6279 | /* Fall through */ |
| 6280 | case META_MARK: |
| 6281 | case META_COMMIT_ARG: |
| 6282 | VERB_ARG: |
| 6283 | *code++ = verbops[(meta - META_MARK) >> 16]; |
| 6284 | /* The length is in characters. */ |
| 6285 | verbarglen = *(++pptr); |
| 6286 | verbculen = 0; |
| 6287 | tempcode = code++; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6288 | for (int i = 0; i < (int)verbarglen; i++) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6289 | { |
| 6290 | meta = *(++pptr); |
| 6291 | #ifdef SUPPORT_UNICODE |
| 6292 | if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else |
| 6293 | #endif |
| 6294 | { |
| 6295 | mclength = 1; |
| 6296 | mcbuffer[0] = meta; |
| 6297 | } |
| 6298 | if (lengthptr != NULL) *lengthptr += mclength; else |
| 6299 | { |
| 6300 | memcpy(code, mcbuffer, CU2BYTES(mclength)); |
| 6301 | code += mclength; |
| 6302 | verbculen += mclength; |
| 6303 | } |
| 6304 | } |
| 6305 | |
| 6306 | *tempcode = verbculen; /* Fill in the code unit length */ |
| 6307 | *code++ = 0; /* Terminating zero */ |
| 6308 | break; |
| 6309 | |
| 6310 | |
| 6311 | /* ===================================================================*/ |
| 6312 | /* Handle options change. The new setting must be passed back for use in |
| 6313 | subsequent branches. Reset the greedy defaults and the case value for |
| 6314 | firstcu and reqcu. */ |
| 6315 | |
| 6316 | case META_OPTIONS: |
| 6317 | *optionsptr = options = *(++pptr); |
| 6318 | greedy_default = ((options & PCRE2_UNGREEDY) != 0); |
| 6319 | greedy_non_default = greedy_default ^ 1; |
| 6320 | req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0; |
| 6321 | break; |
| 6322 | |
| 6323 | |
| 6324 | /* ===================================================================*/ |
| 6325 | /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous |
| 6326 | because it could be a numerical check on recursion, or a name check on a |
| 6327 | group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that |
| 6328 | we can handle it either way. We first try for a name; if not found, process |
| 6329 | the number. */ |
| 6330 | |
| 6331 | case META_COND_RNUMBER: /* (?(Rdigits) */ |
| 6332 | case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */ |
| 6333 | case META_COND_RNAME: /* (?(R&name) - test for recursion */ |
| 6334 | bravalue = OP_COND; |
| 6335 | { |
| 6336 | int count, index; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6337 | unsigned int i; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6338 | PCRE2_SPTR name; |
| 6339 | named_group *ng = cb->named_groups; |
| 6340 | uint32_t length = *(++pptr); |
| 6341 | |
| 6342 | GETPLUSOFFSET(offset, pptr); |
| 6343 | name = cb->start_pattern + offset; |
| 6344 | |
| 6345 | /* In the first pass, the names generated in the pre-pass are available, |
| 6346 | but the main name table has not yet been created. Scan the list of names |
| 6347 | generated in the pre-pass in order to get a number and whether or not |
| 6348 | this name is duplicated. If it is not duplicated, we can handle it as a |
| 6349 | numerical group. */ |
| 6350 | |
| 6351 | for (i = 0; i < cb->names_found; i++, ng++) |
| 6352 | { |
| 6353 | if (length == ng->length && |
| 6354 | PRIV(strncmp)(name, ng->name, length) == 0) |
| 6355 | { |
| 6356 | if (!ng->isdup) |
| 6357 | { |
| 6358 | code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; |
| 6359 | PUT2(code, 2+LINK_SIZE, ng->number); |
| 6360 | if (ng->number > cb->top_backref) cb->top_backref = ng->number; |
| 6361 | skipunits = 1+IMM2_SIZE; |
| 6362 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6363 | } |
| 6364 | break; /* Found a duplicated name */ |
| 6365 | } |
| 6366 | } |
| 6367 | |
| 6368 | /* If the name was not found we have a bad reference, unless we are |
| 6369 | dealing with R<digits>, which is treated as a recursion test by number. |
| 6370 | */ |
| 6371 | |
| 6372 | if (i >= cb->names_found) |
| 6373 | { |
| 6374 | groupnumber = 0; |
| 6375 | if (meta == META_COND_RNUMBER) |
| 6376 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6377 | for (i = 1; i < length; i++) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6378 | { |
| 6379 | groupnumber = groupnumber * 10 + name[i] - CHAR_0; |
| 6380 | if (groupnumber > MAX_GROUP_NUMBER) |
| 6381 | { |
| 6382 | *errorcodeptr = ERR61; |
| 6383 | cb->erroroffset = offset + i; |
| 6384 | return 0; |
| 6385 | } |
| 6386 | } |
| 6387 | } |
| 6388 | |
| 6389 | if (meta != META_COND_RNUMBER || groupnumber > cb->bracount) |
| 6390 | { |
| 6391 | *errorcodeptr = ERR15; |
| 6392 | cb->erroroffset = offset; |
| 6393 | return 0; |
| 6394 | } |
| 6395 | |
| 6396 | /* (?(Rdigits) treated as a recursion test by number. A value of |
| 6397 | zero (which is the result of both (?(R) and (?(R0)) means "any", and is |
| 6398 | translated into RREF_ANY (which is 0xffff). */ |
| 6399 | |
| 6400 | if (groupnumber == 0) groupnumber = RREF_ANY; |
| 6401 | code[1+LINK_SIZE] = OP_RREF; |
| 6402 | PUT2(code, 2+LINK_SIZE, groupnumber); |
| 6403 | skipunits = 1+IMM2_SIZE; |
| 6404 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6405 | } |
| 6406 | |
| 6407 | /* A duplicated name was found. Note that if an R<digits> name is found |
| 6408 | (META_COND_RNUMBER), it is a reference test, not a recursion test. */ |
| 6409 | |
| 6410 | code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; |
| 6411 | |
| 6412 | /* We have a duplicated name. In the compile pass we have to search the |
| 6413 | main table in order to get the index and count values. */ |
| 6414 | |
| 6415 | count = 0; /* Values for first pass (avoids compiler warning) */ |
| 6416 | index = 0; |
| 6417 | if (lengthptr == NULL && !find_dupname_details(name, length, &index, |
| 6418 | &count, errorcodeptr, cb)) return 0; |
| 6419 | |
| 6420 | /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and |
| 6421 | insert appropriate data values. */ |
| 6422 | |
| 6423 | code[1+LINK_SIZE]++; |
| 6424 | skipunits = 1+2*IMM2_SIZE; |
| 6425 | PUT2(code, 2+LINK_SIZE, index); |
| 6426 | PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); |
| 6427 | } |
| 6428 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6429 | |
| 6430 | /* The DEFINE condition is always false. Its internal groups may never |
| 6431 | be called, so matched_char must remain false, hence the jump to |
| 6432 | GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */ |
| 6433 | |
| 6434 | case META_COND_DEFINE: |
| 6435 | bravalue = OP_COND; |
| 6436 | GETPLUSOFFSET(offset, pptr); |
| 6437 | code[1+LINK_SIZE] = OP_DEFINE; |
| 6438 | skipunits = 1; |
| 6439 | goto GROUP_PROCESS; |
| 6440 | |
| 6441 | /* Conditional test of a group's being set. */ |
| 6442 | |
| 6443 | case META_COND_NUMBER: |
| 6444 | bravalue = OP_COND; |
| 6445 | GETPLUSOFFSET(offset, pptr); |
| 6446 | groupnumber = *(++pptr); |
| 6447 | if (groupnumber > cb->bracount) |
| 6448 | { |
| 6449 | *errorcodeptr = ERR15; |
| 6450 | cb->erroroffset = offset; |
| 6451 | return 0; |
| 6452 | } |
| 6453 | if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; |
| 6454 | offset -= 2; /* Point at initial ( for too many branches error */ |
| 6455 | code[1+LINK_SIZE] = OP_CREF; |
| 6456 | skipunits = 1+IMM2_SIZE; |
| 6457 | PUT2(code, 2+LINK_SIZE, groupnumber); |
| 6458 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6459 | |
| 6460 | /* Test for the PCRE2 version. */ |
| 6461 | |
| 6462 | case META_COND_VERSION: |
| 6463 | bravalue = OP_COND; |
| 6464 | if (pptr[1] > 0) |
| 6465 | code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) || |
| 6466 | (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))? |
| 6467 | OP_TRUE : OP_FALSE; |
| 6468 | else |
| 6469 | code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])? |
| 6470 | OP_TRUE : OP_FALSE; |
| 6471 | skipunits = 1; |
| 6472 | pptr += 3; |
| 6473 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6474 | |
| 6475 | /* The condition is an assertion, possibly preceded by a callout. */ |
| 6476 | |
| 6477 | case META_COND_ASSERT: |
| 6478 | bravalue = OP_COND; |
| 6479 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6480 | |
| 6481 | |
| 6482 | /* ===================================================================*/ |
| 6483 | /* Handle all kinds of nested bracketed groups. The non-capturing, |
| 6484 | non-conditional cases are here; others come to GROUP_PROCESS via goto. */ |
| 6485 | |
| 6486 | case META_LOOKAHEAD: |
| 6487 | bravalue = OP_ASSERT; |
| 6488 | cb->assert_depth += 1; |
| 6489 | goto GROUP_PROCESS; |
| 6490 | |
| 6491 | case META_LOOKAHEAD_NA: |
| 6492 | bravalue = OP_ASSERT_NA; |
| 6493 | cb->assert_depth += 1; |
| 6494 | goto GROUP_PROCESS; |
| 6495 | |
| 6496 | /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird |
| 6497 | thing to do, but Perl allows all assertions to be quantified, and when |
| 6498 | they contain capturing parentheses there may be a potential use for |
| 6499 | this feature. Not that that applies to a quantified (?!) but we allow |
| 6500 | it for uniformity. */ |
| 6501 | |
| 6502 | case META_LOOKAHEADNOT: |
| 6503 | if (pptr[1] == META_KET && |
| 6504 | (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY)) |
| 6505 | { |
| 6506 | *code++ = OP_FAIL; |
| 6507 | pptr++; |
| 6508 | } |
| 6509 | else |
| 6510 | { |
| 6511 | bravalue = OP_ASSERT_NOT; |
| 6512 | cb->assert_depth += 1; |
| 6513 | goto GROUP_PROCESS; |
| 6514 | } |
| 6515 | break; |
| 6516 | |
| 6517 | case META_LOOKBEHIND: |
| 6518 | bravalue = OP_ASSERTBACK; |
| 6519 | cb->assert_depth += 1; |
| 6520 | goto GROUP_PROCESS; |
| 6521 | |
| 6522 | case META_LOOKBEHINDNOT: |
| 6523 | bravalue = OP_ASSERTBACK_NOT; |
| 6524 | cb->assert_depth += 1; |
| 6525 | goto GROUP_PROCESS; |
| 6526 | |
| 6527 | case META_LOOKBEHIND_NA: |
| 6528 | bravalue = OP_ASSERTBACK_NA; |
| 6529 | cb->assert_depth += 1; |
| 6530 | goto GROUP_PROCESS; |
| 6531 | |
| 6532 | case META_ATOMIC: |
| 6533 | bravalue = OP_ONCE; |
| 6534 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6535 | |
| 6536 | case META_SCRIPT_RUN: |
| 6537 | bravalue = OP_SCRIPT_RUN; |
| 6538 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 6539 | |
| 6540 | case META_NOCAPTURE: |
| 6541 | bravalue = OP_BRA; |
| 6542 | /* Fall through */ |
| 6543 | |
| 6544 | /* Process nested bracketed regex. The nesting depth is maintained for the |
| 6545 | benefit of the stackguard function. The test for too deep nesting is now |
| 6546 | done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS; |
| 6547 | others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take |
| 6548 | note of whether or not they may match an empty string. */ |
| 6549 | |
| 6550 | GROUP_PROCESS_NOTE_EMPTY: |
| 6551 | note_group_empty = TRUE; |
| 6552 | |
| 6553 | GROUP_PROCESS: |
| 6554 | cb->parens_depth += 1; |
| 6555 | *code = bravalue; |
| 6556 | pptr++; |
| 6557 | tempcode = code; |
| 6558 | tempreqvary = cb->req_varyopt; /* Save value before group */ |
| 6559 | length_prevgroup = 0; /* Initialize for pre-compile phase */ |
| 6560 | |
| 6561 | if ((group_return = |
| 6562 | compile_regex( |
| 6563 | options, /* The option state */ |
| 6564 | &tempcode, /* Where to put code (updated) */ |
| 6565 | &pptr, /* Input pointer (updated) */ |
| 6566 | errorcodeptr, /* Where to put an error message */ |
| 6567 | skipunits, /* Skip over bracket number */ |
| 6568 | &subfirstcu, /* For possible first char */ |
| 6569 | &subfirstcuflags, |
| 6570 | &subreqcu, /* For possible last char */ |
| 6571 | &subreqcuflags, |
| 6572 | bcptr, /* Current branch chain */ |
| 6573 | cb, /* Compile data block */ |
| 6574 | (lengthptr == NULL)? NULL : /* Actual compile phase */ |
| 6575 | &length_prevgroup /* Pre-compile phase */ |
| 6576 | )) == 0) |
| 6577 | return 0; /* Error */ |
| 6578 | |
| 6579 | cb->parens_depth -= 1; |
| 6580 | |
| 6581 | /* If that was a non-conditional significant group (not an assertion, not a |
| 6582 | DEFINE) that matches at least one character, then the current item matches |
| 6583 | a character. Conditionals are handled below. */ |
| 6584 | |
| 6585 | if (note_group_empty && bravalue != OP_COND && group_return > 0) |
| 6586 | matched_char = TRUE; |
| 6587 | |
| 6588 | /* If we've just compiled an assertion, pop the assert depth. */ |
| 6589 | |
| 6590 | if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA) |
| 6591 | cb->assert_depth -= 1; |
| 6592 | |
| 6593 | /* At the end of compiling, code is still pointing to the start of the |
| 6594 | group, while tempcode has been updated to point past the end of the group. |
| 6595 | The parsed pattern pointer (pptr) is on the closing META_KET. |
| 6596 | |
| 6597 | If this is a conditional bracket, check that there are no more than |
| 6598 | two branches in the group, or just one if it's a DEFINE group. We do this |
| 6599 | in the real compile phase, not in the pre-pass, where the whole group may |
| 6600 | not be available. */ |
| 6601 | |
| 6602 | if (bravalue == OP_COND && lengthptr == NULL) |
| 6603 | { |
| 6604 | PCRE2_UCHAR *tc = code; |
| 6605 | int condcount = 0; |
| 6606 | |
| 6607 | do { |
| 6608 | condcount++; |
| 6609 | tc += GET(tc,1); |
| 6610 | } |
| 6611 | while (*tc != OP_KET); |
| 6612 | |
| 6613 | /* A DEFINE group is never obeyed inline (the "condition" is always |
| 6614 | false). It must have only one branch. Having checked this, change the |
| 6615 | opcode to OP_FALSE. */ |
| 6616 | |
| 6617 | if (code[LINK_SIZE+1] == OP_DEFINE) |
| 6618 | { |
| 6619 | if (condcount > 1) |
| 6620 | { |
| 6621 | cb->erroroffset = offset; |
| 6622 | *errorcodeptr = ERR54; |
| 6623 | return 0; |
| 6624 | } |
| 6625 | code[LINK_SIZE+1] = OP_FALSE; |
| 6626 | bravalue = OP_DEFINE; /* A flag to suppress char handling below */ |
| 6627 | } |
| 6628 | |
| 6629 | /* A "normal" conditional group. If there is just one branch, we must not |
| 6630 | make use of its firstcu or reqcu, because this is equivalent to an |
| 6631 | empty second branch. Also, it may match an empty string. If there are two |
| 6632 | branches, this item must match a character if the group must. */ |
| 6633 | |
| 6634 | else |
| 6635 | { |
| 6636 | if (condcount > 2) |
| 6637 | { |
| 6638 | cb->erroroffset = offset; |
| 6639 | *errorcodeptr = ERR27; |
| 6640 | return 0; |
| 6641 | } |
| 6642 | if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE; |
| 6643 | else if (group_return > 0) matched_char = TRUE; |
| 6644 | } |
| 6645 | } |
| 6646 | |
| 6647 | /* In the pre-compile phase, update the length by the length of the group, |
| 6648 | less the brackets at either end. Then reduce the compiled code to just a |
| 6649 | set of non-capturing brackets so that it doesn't use much memory if it is |
| 6650 | duplicated by a quantifier.*/ |
| 6651 | |
| 6652 | if (lengthptr != NULL) |
| 6653 | { |
| 6654 | if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) |
| 6655 | { |
| 6656 | *errorcodeptr = ERR20; |
| 6657 | return 0; |
| 6658 | } |
| 6659 | *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; |
| 6660 | code++; /* This already contains bravalue */ |
| 6661 | PUTINC(code, 0, 1 + LINK_SIZE); |
| 6662 | *code++ = OP_KET; |
| 6663 | PUTINC(code, 0, 1 + LINK_SIZE); |
| 6664 | break; /* No need to waste time with special character handling */ |
| 6665 | } |
| 6666 | |
| 6667 | /* Otherwise update the main code pointer to the end of the group. */ |
| 6668 | |
| 6669 | code = tempcode; |
| 6670 | |
| 6671 | /* For a DEFINE group, required and first character settings are not |
| 6672 | relevant. */ |
| 6673 | |
| 6674 | if (bravalue == OP_DEFINE) break; |
| 6675 | |
| 6676 | /* Handle updating of the required and first code units for other types of |
| 6677 | group. Update for normal brackets of all kinds, and conditions with two |
| 6678 | branches (see code above). If the bracket is followed by a quantifier with |
| 6679 | zero repeat, we have to back off. Hence the definition of zeroreqcu and |
| 6680 | zerofirstcu outside the main loop so that they can be accessed for the back |
| 6681 | off. */ |
| 6682 | |
| 6683 | zeroreqcu = reqcu; |
| 6684 | zeroreqcuflags = reqcuflags; |
| 6685 | zerofirstcu = firstcu; |
| 6686 | zerofirstcuflags = firstcuflags; |
| 6687 | groupsetfirstcu = FALSE; |
| 6688 | |
| 6689 | if (bravalue >= OP_ONCE) /* Not an assertion */ |
| 6690 | { |
| 6691 | /* If we have not yet set a firstcu in this branch, take it from the |
| 6692 | subpattern, remembering that it was set here so that a repeat of more |
| 6693 | than one can replicate it as reqcu if necessary. If the subpattern has |
| 6694 | no firstcu, set "none" for the whole branch. In both cases, a zero |
| 6695 | repeat forces firstcu to "none". */ |
| 6696 | |
| 6697 | if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET) |
| 6698 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6699 | if (subfirstcuflags < REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6700 | { |
| 6701 | firstcu = subfirstcu; |
| 6702 | firstcuflags = subfirstcuflags; |
| 6703 | groupsetfirstcu = TRUE; |
| 6704 | } |
| 6705 | else firstcuflags = REQ_NONE; |
| 6706 | zerofirstcuflags = REQ_NONE; |
| 6707 | } |
| 6708 | |
| 6709 | /* If firstcu was previously set, convert the subpattern's firstcu |
| 6710 | into reqcu if there wasn't one, using the vary flag that was in |
| 6711 | existence beforehand. */ |
| 6712 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6713 | else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6714 | { |
| 6715 | subreqcu = subfirstcu; |
| 6716 | subreqcuflags = subfirstcuflags | tempreqvary; |
| 6717 | } |
| 6718 | |
| 6719 | /* If the subpattern set a required code unit (or set a first code unit |
| 6720 | that isn't really the first code unit - see above), set it. */ |
| 6721 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6722 | if (subreqcuflags < REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6723 | { |
| 6724 | reqcu = subreqcu; |
| 6725 | reqcuflags = subreqcuflags; |
| 6726 | } |
| 6727 | } |
| 6728 | |
| 6729 | /* For a forward assertion, we take the reqcu, if set, provided that the |
| 6730 | group has also set a firstcu. This can be helpful if the pattern that |
| 6731 | follows the assertion doesn't set a different char. For example, it's |
| 6732 | useful for /(?=abcde).+/. We can't set firstcu for an assertion, however, |
| 6733 | because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when |
| 6734 | the "real" "a" would then become a reqcu instead of a firstcu. This is |
| 6735 | overcome by a scan at the end if there's no firstcu, looking for an |
| 6736 | asserted first char. A similar effect for patterns like /(?=.*X)X$/ means |
| 6737 | we must only take the reqcu when the group also set a firstcu. Otherwise, |
| 6738 | in that example, 'X' ends up set for both. */ |
| 6739 | |
| 6740 | else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) && |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6741 | subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6742 | { |
| 6743 | reqcu = subreqcu; |
| 6744 | reqcuflags = subreqcuflags; |
| 6745 | } |
| 6746 | |
| 6747 | break; /* End of nested group handling */ |
| 6748 | |
| 6749 | |
| 6750 | /* ===================================================================*/ |
| 6751 | /* Handle named backreferences and recursions. */ |
| 6752 | |
| 6753 | case META_BACKREF_BYNAME: |
| 6754 | case META_RECURSE_BYNAME: |
| 6755 | { |
| 6756 | int count, index; |
| 6757 | PCRE2_SPTR name; |
| 6758 | BOOL is_dupname = FALSE; |
| 6759 | named_group *ng = cb->named_groups; |
| 6760 | uint32_t length = *(++pptr); |
| 6761 | |
| 6762 | GETPLUSOFFSET(offset, pptr); |
| 6763 | name = cb->start_pattern + offset; |
| 6764 | |
| 6765 | /* In the first pass, the names generated in the pre-pass are available, |
| 6766 | but the main name table has not yet been created. Scan the list of names |
| 6767 | generated in the pre-pass in order to get a number and whether or not |
| 6768 | this name is duplicated. */ |
| 6769 | |
| 6770 | groupnumber = 0; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 6771 | for (unsigned int i = 0; i < cb->names_found; i++, ng++) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 6772 | { |
| 6773 | if (length == ng->length && |
| 6774 | PRIV(strncmp)(name, ng->name, length) == 0) |
| 6775 | { |
| 6776 | is_dupname = ng->isdup; |
| 6777 | groupnumber = ng->number; |
| 6778 | |
| 6779 | /* For a recursion, that's all that is needed. We can now go to |
| 6780 | the code that handles numerical recursion, applying it to the first |
| 6781 | group with the given name. */ |
| 6782 | |
| 6783 | if (meta == META_RECURSE_BYNAME) |
| 6784 | { |
| 6785 | meta_arg = groupnumber; |
| 6786 | goto HANDLE_NUMERICAL_RECURSION; |
| 6787 | } |
| 6788 | |
| 6789 | /* For a back reference, update the back reference map and the |
| 6790 | maximum back reference. */ |
| 6791 | |
| 6792 | cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; |
| 6793 | if (groupnumber > cb->top_backref) |
| 6794 | cb->top_backref = groupnumber; |
| 6795 | } |
| 6796 | } |
| 6797 | |
| 6798 | /* If the name was not found we have a bad reference. */ |
| 6799 | |
| 6800 | if (groupnumber == 0) |
| 6801 | { |
| 6802 | *errorcodeptr = ERR15; |
| 6803 | cb->erroroffset = offset; |
| 6804 | return 0; |
| 6805 | } |
| 6806 | |
| 6807 | /* If a back reference name is not duplicated, we can handle it as |
| 6808 | a numerical reference. */ |
| 6809 | |
| 6810 | if (!is_dupname) |
| 6811 | { |
| 6812 | meta_arg = groupnumber; |
| 6813 | goto HANDLE_SINGLE_REFERENCE; |
| 6814 | } |
| 6815 | |
| 6816 | /* If a back reference name is duplicated, we generate a different |
| 6817 | opcode to a numerical back reference. In the second pass we must |
| 6818 | search for the index and count in the final name table. */ |
| 6819 | |
| 6820 | count = 0; /* Values for first pass (avoids compiler warning) */ |
| 6821 | index = 0; |
| 6822 | if (lengthptr == NULL && !find_dupname_details(name, length, &index, |
| 6823 | &count, errorcodeptr, cb)) return 0; |
| 6824 | |
| 6825 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 6826 | *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; |
| 6827 | PUT2INC(code, 0, index); |
| 6828 | PUT2INC(code, 0, count); |
| 6829 | } |
| 6830 | break; |
| 6831 | |
| 6832 | |
| 6833 | /* ===================================================================*/ |
| 6834 | /* Handle a numerical callout. */ |
| 6835 | |
| 6836 | case META_CALLOUT_NUMBER: |
| 6837 | code[0] = OP_CALLOUT; |
| 6838 | PUT(code, 1, pptr[1]); /* Offset to next pattern item */ |
| 6839 | PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ |
| 6840 | code[1 + 2*LINK_SIZE] = pptr[3]; |
| 6841 | pptr += 3; |
| 6842 | code += PRIV(OP_lengths)[OP_CALLOUT]; |
| 6843 | break; |
| 6844 | |
| 6845 | |
| 6846 | /* ===================================================================*/ |
| 6847 | /* Handle a callout with a string argument. In the pre-pass we just compute |
| 6848 | the length without generating anything. The length in pptr[3] includes both |
| 6849 | delimiters; in the actual compile only the first one is copied, but a |
| 6850 | terminating zero is added. Any doubled delimiters within the string make |
| 6851 | this an overestimate, but it is not worth bothering about. */ |
| 6852 | |
| 6853 | case META_CALLOUT_STRING: |
| 6854 | if (lengthptr != NULL) |
| 6855 | { |
| 6856 | *lengthptr += pptr[3] + (1 + 4*LINK_SIZE); |
| 6857 | pptr += 3; |
| 6858 | SKIPOFFSET(pptr); |
| 6859 | } |
| 6860 | |
| 6861 | /* In the real compile we can copy the string. The starting delimiter is |
| 6862 | included so that the client can discover it if they want. We also pass the |
| 6863 | start offset to help a script language give better error messages. */ |
| 6864 | |
| 6865 | else |
| 6866 | { |
| 6867 | PCRE2_SPTR pp; |
| 6868 | uint32_t delimiter; |
| 6869 | uint32_t length = pptr[3]; |
| 6870 | PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); |
| 6871 | |
| 6872 | code[0] = OP_CALLOUT_STR; |
| 6873 | PUT(code, 1, pptr[1]); /* Offset to next pattern item */ |
| 6874 | PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ |
| 6875 | |
| 6876 | pptr += 3; |
| 6877 | GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */ |
| 6878 | pp = cb->start_pattern + offset; |
| 6879 | delimiter = *callout_string++ = *pp++; |
| 6880 | if (delimiter == CHAR_LEFT_CURLY_BRACKET) |
| 6881 | delimiter = CHAR_RIGHT_CURLY_BRACKET; |
| 6882 | PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */ |
| 6883 | |
| 6884 | /* The syntax of the pattern was checked in the parsing scan. The length |
| 6885 | includes both delimiters, but we have passed the opening one just above, |
| 6886 | so we reduce length before testing it. The test is for > 1 because we do |
| 6887 | not want to copy the final delimiter. This also ensures that pp[1] is |
| 6888 | accessible. */ |
| 6889 | |
| 6890 | while (--length > 1) |
| 6891 | { |
| 6892 | if (*pp == delimiter && pp[1] == delimiter) |
| 6893 | { |
| 6894 | *callout_string++ = delimiter; |
| 6895 | pp += 2; |
| 6896 | length--; |
| 6897 | } |
| 6898 | else *callout_string++ = *pp++; |
| 6899 | } |
| 6900 | *callout_string++ = CHAR_NUL; |
| 6901 | |
| 6902 | /* Set the length of the entire item, then advance to its end. */ |
| 6903 | |
| 6904 | PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code)); |
| 6905 | code = callout_string; |
| 6906 | } |
| 6907 | break; |
| 6908 | |
| 6909 | |
| 6910 | /* ===================================================================*/ |
| 6911 | /* Handle repetition. The different types are all sorted out in the parsing |
| 6912 | pass. */ |
| 6913 | |
| 6914 | case META_MINMAX_PLUS: |
| 6915 | case META_MINMAX_QUERY: |
| 6916 | case META_MINMAX: |
| 6917 | repeat_min = *(++pptr); |
| 6918 | repeat_max = *(++pptr); |
| 6919 | goto REPEAT; |
| 6920 | |
| 6921 | case META_ASTERISK: |
| 6922 | case META_ASTERISK_PLUS: |
| 6923 | case META_ASTERISK_QUERY: |
| 6924 | repeat_min = 0; |
| 6925 | repeat_max = REPEAT_UNLIMITED; |
| 6926 | goto REPEAT; |
| 6927 | |
| 6928 | case META_PLUS: |
| 6929 | case META_PLUS_PLUS: |
| 6930 | case META_PLUS_QUERY: |
| 6931 | repeat_min = 1; |
| 6932 | repeat_max = REPEAT_UNLIMITED; |
| 6933 | goto REPEAT; |
| 6934 | |
| 6935 | case META_QUERY: |
| 6936 | case META_QUERY_PLUS: |
| 6937 | case META_QUERY_QUERY: |
| 6938 | repeat_min = 0; |
| 6939 | repeat_max = 1; |
| 6940 | |
| 6941 | REPEAT: |
| 6942 | if (previous_matched_char && repeat_min > 0) matched_char = TRUE; |
| 6943 | |
| 6944 | /* Remember whether this is a variable length repeat, and default to |
| 6945 | single-char opcodes. */ |
| 6946 | |
| 6947 | reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; |
| 6948 | op_type = 0; |
| 6949 | |
| 6950 | /* Adjust first and required code units for a zero repeat. */ |
| 6951 | |
| 6952 | if (repeat_min == 0) |
| 6953 | { |
| 6954 | firstcu = zerofirstcu; |
| 6955 | firstcuflags = zerofirstcuflags; |
| 6956 | reqcu = zeroreqcu; |
| 6957 | reqcuflags = zeroreqcuflags; |
| 6958 | } |
| 6959 | |
| 6960 | /* Note the greediness and possessiveness. */ |
| 6961 | |
| 6962 | switch (meta) |
| 6963 | { |
| 6964 | case META_MINMAX_PLUS: |
| 6965 | case META_ASTERISK_PLUS: |
| 6966 | case META_PLUS_PLUS: |
| 6967 | case META_QUERY_PLUS: |
| 6968 | repeat_type = 0; /* Force greedy */ |
| 6969 | possessive_quantifier = TRUE; |
| 6970 | break; |
| 6971 | |
| 6972 | case META_MINMAX_QUERY: |
| 6973 | case META_ASTERISK_QUERY: |
| 6974 | case META_PLUS_QUERY: |
| 6975 | case META_QUERY_QUERY: |
| 6976 | repeat_type = greedy_non_default; |
| 6977 | possessive_quantifier = FALSE; |
| 6978 | break; |
| 6979 | |
| 6980 | default: |
| 6981 | repeat_type = greedy_default; |
| 6982 | possessive_quantifier = FALSE; |
| 6983 | break; |
| 6984 | } |
| 6985 | |
| 6986 | /* Save start of previous item, in case we have to move it up in order to |
| 6987 | insert something before it, and remember what it was. */ |
| 6988 | |
| 6989 | tempcode = previous; |
| 6990 | op_previous = *previous; |
| 6991 | |
| 6992 | /* Now handle repetition for the different types of item. If the repeat |
| 6993 | minimum and the repeat maximum are both 1, we can ignore the quantifier for |
| 6994 | non-parenthesized items, as they have only one alternative. For anything in |
| 6995 | parentheses, we must not ignore if {1} is possessive. */ |
| 6996 | |
| 6997 | switch (op_previous) |
| 6998 | { |
| 6999 | /* If previous was a character or negated character match, abolish the |
| 7000 | item and generate a repeat item instead. If a char item has a minimum of |
| 7001 | more than one, ensure that it is set in reqcu - it might not be if a |
| 7002 | sequence such as x{3} is the first thing in a branch because the x will |
| 7003 | have gone into firstcu instead. */ |
| 7004 | |
| 7005 | case OP_CHAR: |
| 7006 | case OP_CHARI: |
| 7007 | case OP_NOT: |
| 7008 | case OP_NOTI: |
| 7009 | if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; |
| 7010 | op_type = chartypeoffset[op_previous - OP_CHAR]; |
| 7011 | |
| 7012 | /* Deal with UTF characters that take up more than one code unit. */ |
| 7013 | |
| 7014 | #ifdef MAYBE_UTF_MULTI |
| 7015 | if (utf && NOT_FIRSTCU(code[-1])) |
| 7016 | { |
| 7017 | PCRE2_UCHAR *lastchar = code - 1; |
| 7018 | BACKCHAR(lastchar); |
| 7019 | mclength = (uint32_t)(code - lastchar); /* Length of UTF character */ |
| 7020 | memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */ |
| 7021 | } |
| 7022 | else |
| 7023 | #endif /* MAYBE_UTF_MULTI */ |
| 7024 | |
| 7025 | /* Handle the case of a single code unit - either with no UTF support, or |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 7026 | with UTF disabled, or for a single-code-unit UTF character. In the latter |
| 7027 | case, for a repeated positive match, get the caseless flag for the |
| 7028 | required code unit from the previous character, because a class like [Aa] |
| 7029 | sets a caseless A but by now the req_caseopt flag has been reset. */ |
| 7030 | |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 7031 | { |
| 7032 | mcbuffer[0] = code[-1]; |
| 7033 | mclength = 1; |
| 7034 | if (op_previous <= OP_CHARI && repeat_min > 1) |
| 7035 | { |
| 7036 | reqcu = mcbuffer[0]; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 7037 | reqcuflags = cb->req_varyopt; |
| 7038 | if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 7039 | } |
| 7040 | } |
| 7041 | goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ |
| 7042 | |
| 7043 | /* If previous was a character class or a back reference, we put the |
| 7044 | repeat stuff after it, but just skip the item if the repeat was {0,0}. */ |
| 7045 | |
| 7046 | #ifdef SUPPORT_WIDE_CHARS |
| 7047 | case OP_XCLASS: |
| 7048 | #endif |
| 7049 | case OP_CLASS: |
| 7050 | case OP_NCLASS: |
| 7051 | case OP_REF: |
| 7052 | case OP_REFI: |
| 7053 | case OP_DNREF: |
| 7054 | case OP_DNREFI: |
| 7055 | |
| 7056 | if (repeat_max == 0) |
| 7057 | { |
| 7058 | code = previous; |
| 7059 | goto END_REPEAT; |
| 7060 | } |
| 7061 | if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; |
| 7062 | |
| 7063 | if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED) |
| 7064 | *code++ = OP_CRSTAR + repeat_type; |
| 7065 | else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED) |
| 7066 | *code++ = OP_CRPLUS + repeat_type; |
| 7067 | else if (repeat_min == 0 && repeat_max == 1) |
| 7068 | *code++ = OP_CRQUERY + repeat_type; |
| 7069 | else |
| 7070 | { |
| 7071 | *code++ = OP_CRRANGE + repeat_type; |
| 7072 | PUT2INC(code, 0, repeat_min); |
| 7073 | if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */ |
| 7074 | PUT2INC(code, 0, repeat_max); |
| 7075 | } |
| 7076 | break; |
| 7077 | |
| 7078 | /* If previous is OP_FAIL, it was generated by an empty class [] |
| 7079 | (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be |
| 7080 | generated, that is by (*FAIL) or (?!), disallow a quantifier at parse |
| 7081 | time. We can just ignore this repeat. */ |
| 7082 | |
| 7083 | case OP_FAIL: |
| 7084 | goto END_REPEAT; |
| 7085 | |
| 7086 | /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets |
| 7087 | because pcre2_match() could not handle backtracking into recursively |
| 7088 | called groups. Now that this backtracking is available, we no longer need |
| 7089 | to do this. However, we still need to replicate recursions as we do for |
| 7090 | groups so as to have independent backtracking points. We can replicate |
| 7091 | for the minimum number of repeats directly. For optional repeats we now |
| 7092 | wrap the recursion in OP_BRA brackets and make use of the bracket |
| 7093 | repetition. */ |
| 7094 | |
| 7095 | case OP_RECURSE: |
| 7096 | if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier) |
| 7097 | goto END_REPEAT; |
| 7098 | |
| 7099 | /* Generate unwrapped repeats for a non-zero minimum, except when the |
| 7100 | minimum is 1 and the maximum unlimited, because that can be handled with |
| 7101 | OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the |
| 7102 | minimum, we just need to generate the appropriate additional copies. |
| 7103 | Otherwise we need to generate one more, to simulate the situation when |
| 7104 | the minimum is zero. */ |
| 7105 | |
| 7106 | if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED)) |
| 7107 | { |
| 7108 | int replicate = repeat_min; |
| 7109 | if (repeat_min == repeat_max) replicate--; |
| 7110 | |
| 7111 | /* In the pre-compile phase, we don't actually do the replication. We |
| 7112 | just adjust the length as if we had. Do some paranoid checks for |
| 7113 | potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit |
| 7114 | integer type when available, otherwise double. */ |
| 7115 | |
| 7116 | if (lengthptr != NULL) |
| 7117 | { |
| 7118 | PCRE2_SIZE delta = replicate*(1 + LINK_SIZE); |
| 7119 | if ((INT64_OR_DOUBLE)replicate* |
| 7120 | (INT64_OR_DOUBLE)(1 + LINK_SIZE) > |
| 7121 | (INT64_OR_DOUBLE)INT_MAX || |
| 7122 | OFLOW_MAX - *lengthptr < delta) |
| 7123 | { |
| 7124 | *errorcodeptr = ERR20; |
| 7125 | return 0; |
| 7126 | } |
| 7127 | *lengthptr += delta; |
| 7128 | } |
| 7129 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 7130 | else for (int i = 0; i < replicate; i++) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 7131 | { |
| 7132 | memcpy(code, previous, CU2BYTES(1 + LINK_SIZE)); |
| 7133 | previous = code; |
| 7134 | code += 1 + LINK_SIZE; |
| 7135 | } |
| 7136 | |
| 7137 | /* If the number of repeats is fixed, we are done. Otherwise, adjust |
| 7138 | the counts and fall through. */ |
| 7139 | |
| 7140 | if (repeat_min == repeat_max) break; |
| 7141 | if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; |
| 7142 | repeat_min = 0; |
| 7143 | } |
| 7144 | |
| 7145 | /* Wrap the recursion call in OP_BRA brackets. */ |
| 7146 | |
| 7147 | (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE)); |
| 7148 | op_previous = *previous = OP_BRA; |
| 7149 | PUT(previous, 1, 2 + 2*LINK_SIZE); |
| 7150 | previous[2 + 2*LINK_SIZE] = OP_KET; |
| 7151 | PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); |
| 7152 | code += 2 + 2 * LINK_SIZE; |
| 7153 | length_prevgroup = 3 + 3*LINK_SIZE; |
| 7154 | group_return = -1; /* Set "may match empty string" */ |
| 7155 | |
| 7156 | /* Now treat as a repeated OP_BRA. */ |
| 7157 | /* Fall through */ |
| 7158 | |
| 7159 | /* If previous was a bracket group, we may have to replicate it in |
| 7160 | certain cases. Note that at this point we can encounter only the "basic" |
| 7161 | bracket opcodes such as BRA and CBRA, as this is the place where they get |
| 7162 | converted into the more special varieties such as BRAPOS and SBRA. |
| 7163 | Originally, PCRE did not allow repetition of assertions, but now it does, |
| 7164 | for Perl compatibility. */ |
| 7165 | |
| 7166 | case OP_ASSERT: |
| 7167 | case OP_ASSERT_NOT: |
| 7168 | case OP_ASSERT_NA: |
| 7169 | case OP_ASSERTBACK: |
| 7170 | case OP_ASSERTBACK_NOT: |
| 7171 | case OP_ASSERTBACK_NA: |
| 7172 | case OP_ONCE: |
| 7173 | case OP_SCRIPT_RUN: |
| 7174 | case OP_BRA: |
| 7175 | case OP_CBRA: |
| 7176 | case OP_COND: |
| 7177 | { |
| 7178 | int len = (int)(code - previous); |
| 7179 | PCRE2_UCHAR *bralink = NULL; |
| 7180 | PCRE2_UCHAR *brazeroptr = NULL; |
| 7181 | |
| 7182 | if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier) |
| 7183 | goto END_REPEAT; |
| 7184 | |
| 7185 | /* Repeating a DEFINE group (or any group where the condition is always |
| 7186 | FALSE and there is only one branch) is pointless, but Perl allows the |
| 7187 | syntax, so we just ignore the repeat. */ |
| 7188 | |
| 7189 | if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && |
| 7190 | previous[GET(previous, 1)] != OP_ALT) |
| 7191 | goto END_REPEAT; |
| 7192 | |
| 7193 | /* Perl allows all assertions to be quantified, and when they contain |
| 7194 | capturing parentheses and/or are optional there are potential uses for |
| 7195 | this feature. PCRE2 used to force the maximum quantifier to 1 on the |
| 7196 | invalid grounds that further repetition was never useful. This was |
| 7197 | always a bit pointless, since an assertion could be wrapped with a |
| 7198 | repeated group to achieve the effect. General repetition is now |
| 7199 | permitted, but if the maximum is unlimited it is set to one more than |
| 7200 | the minimum. */ |
| 7201 | |
| 7202 | if (op_previous < OP_ONCE) /* Assertion */ |
| 7203 | { |
| 7204 | if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1; |
| 7205 | } |
| 7206 | |
| 7207 | /* The case of a zero minimum is special because of the need to stick |
| 7208 | OP_BRAZERO in front of it, and because the group appears once in the |
| 7209 | data, whereas in other cases it appears the minimum number of times. For |
| 7210 | this reason, it is simplest to treat this case separately, as otherwise |
| 7211 | the code gets far too messy. There are several special subcases when the |
| 7212 | minimum is zero. */ |
| 7213 | |
| 7214 | if (repeat_min == 0) |
| 7215 | { |
| 7216 | /* If the maximum is also zero, we used to just omit the group from |
| 7217 | the output altogether, like this: |
| 7218 | |
| 7219 | ** if (repeat_max == 0) |
| 7220 | ** { |
| 7221 | ** code = previous; |
| 7222 | ** goto END_REPEAT; |
| 7223 | ** } |
| 7224 | |
| 7225 | However, that fails when a group or a subgroup within it is |
| 7226 | referenced as a subroutine from elsewhere in the pattern, so now we |
| 7227 | stick in OP_SKIPZERO in front of it so that it is skipped on |
| 7228 | execution. As we don't have a list of which groups are referenced, we |
| 7229 | cannot do this selectively. |
| 7230 | |
| 7231 | If the maximum is 1 or unlimited, we just have to stick in the |
| 7232 | BRAZERO and do no more at this point. */ |
| 7233 | |
| 7234 | if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED) |
| 7235 | { |
| 7236 | (void)memmove(previous + 1, previous, CU2BYTES(len)); |
| 7237 | code++; |
| 7238 | if (repeat_max == 0) |
| 7239 | { |
| 7240 | *previous++ = OP_SKIPZERO; |
| 7241 | goto END_REPEAT; |
| 7242 | } |
| 7243 | brazeroptr = previous; /* Save for possessive optimizing */ |
| 7244 | *previous++ = OP_BRAZERO + repeat_type; |
| 7245 | } |
| 7246 | |
| 7247 | /* If the maximum is greater than 1 and limited, we have to replicate |
| 7248 | in a nested fashion, sticking OP_BRAZERO before each set of brackets. |
| 7249 | The first one has to be handled carefully because it's the original |
| 7250 | copy, which has to be moved up. The remainder can be handled by code |
| 7251 | that is common with the non-zero minimum case below. We have to |
| 7252 | adjust the value of repeat_max, since one less copy is required. */ |
| 7253 | |
| 7254 | else |
| 7255 | { |
| 7256 | int linkoffset; |
| 7257 | (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); |
| 7258 | code += 2 + LINK_SIZE; |
| 7259 | *previous++ = OP_BRAZERO + repeat_type; |
| 7260 | *previous++ = OP_BRA; |
| 7261 | |
| 7262 | /* We chain together the bracket link offset fields that have to be |
| 7263 | filled in later when the ends of the brackets are reached. */ |
| 7264 | |
| 7265 | linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink); |
| 7266 | bralink = previous; |
| 7267 | PUTINC(previous, 0, linkoffset); |
| 7268 | } |
| 7269 | |
| 7270 | if (repeat_max != REPEAT_UNLIMITED) repeat_max--; |
| 7271 | } |
| 7272 | |
| 7273 | /* If the minimum is greater than zero, replicate the group as many |
| 7274 | times as necessary, and adjust the maximum to the number of subsequent |
| 7275 | copies that we need. */ |
| 7276 | |
| 7277 | else |
| 7278 | { |
| 7279 | if (repeat_min > 1) |
| 7280 | { |
| 7281 | /* In the pre-compile phase, we don't actually do the replication. |
| 7282 | We just adjust the length as if we had. Do some paranoid checks for |
| 7283 | potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit |
| 7284 | integer type when available, otherwise double. */ |
| 7285 | |
| 7286 | if (lengthptr != NULL) |
| 7287 | { |
| 7288 | PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup; |
| 7289 | if ((INT64_OR_DOUBLE)(repeat_min - 1)* |
| 7290 | (INT64_OR_DOUBLE)length_prevgroup > |
| 7291 | (INT64_OR_DOUBLE)INT_MAX || |
| 7292 | OFLOW_MAX - *lengthptr < delta) |
| 7293 | { |
| 7294 | *errorcodeptr = ERR20; |
| 7295 | return 0; |
| 7296 | } |
| 7297 | *lengthptr += delta; |
| 7298 | } |
| 7299 | |
| 7300 | /* This is compiling for real. If there is a set first code unit |
| 7301 | for the group, and we have not yet set a "required code unit", set |
| 7302 | it. */ |
| 7303 | |
| 7304 | else |
| 7305 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 7306 | if (groupsetfirstcu && reqcuflags >= REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 7307 | { |
| 7308 | reqcu = firstcu; |
| 7309 | reqcuflags = firstcuflags; |
| 7310 | } |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 7311 | for (uint32_t i = 1; i < repeat_min; i++) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 7312 | { |
| 7313 | memcpy(code, previous, CU2BYTES(len)); |
| 7314 | code += len; |
| 7315 | } |
| 7316 | } |
| 7317 | } |
| 7318 | |
| 7319 | if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; |
| 7320 | } |
| 7321 | |
| 7322 | /* This code is common to both the zero and non-zero minimum cases. If |
| 7323 | the maximum is limited, it replicates the group in a nested fashion, |
| 7324 | remembering the bracket starts on a stack. In the case of a zero |
| 7325 | minimum, the first one was set up above. In all cases the repeat_max |
| 7326 | now specifies the number of additional copies needed. Again, we must |
| 7327 | remember to replicate entries on the forward reference list. */ |
| 7328 | |
| 7329 | if (repeat_max != REPEAT_UNLIMITED) |
| 7330 | { |
| 7331 | /* In the pre-compile phase, we don't actually do the replication. We |
| 7332 | just adjust the length as if we had. For each repetition we must add |
| 7333 | 1 to the length for BRAZERO and for all but the last repetition we |
| 7334 | must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some |
| 7335 | paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type |
| 7336 | is a 64-bit integer type when available, otherwise double. */ |
| 7337 | |
| 7338 | if (lengthptr != NULL && repeat_max > 0) |
| 7339 | { |
| 7340 | PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - |
| 7341 | 2 - 2*LINK_SIZE; /* Last one doesn't nest */ |
| 7342 | if ((INT64_OR_DOUBLE)repeat_max * |
| 7343 | (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) |
| 7344 | > (INT64_OR_DOUBLE)INT_MAX || |
| 7345 | OFLOW_MAX - *lengthptr < delta) |
| 7346 | { |
| 7347 | *errorcodeptr = ERR20; |
| 7348 | return 0; |
| 7349 | } |
| 7350 | *lengthptr += delta; |
| 7351 | } |
| 7352 | |
| 7353 | /* This is compiling for real */ |
| 7354 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 7355 | else for (uint32_t i = repeat_max; i >= 1; i--) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 7356 | { |
| 7357 | *code++ = OP_BRAZERO + repeat_type; |
| 7358 | |
| 7359 | /* All but the final copy start a new nesting, maintaining the |
| 7360 | chain of brackets outstanding. */ |
| 7361 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 7362 | if (i != 1) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 7363 | { |
| 7364 | int linkoffset; |
| 7365 | *code++ = OP_BRA; |
| 7366 | linkoffset = (bralink == NULL)? 0 : (int)(code - bralink); |
| 7367 | bralink = code; |
| 7368 | PUTINC(code, 0, linkoffset); |
| 7369 | } |
| 7370 | |
| 7371 | memcpy(code, previous, CU2BYTES(len)); |
| 7372 | code += len; |
| 7373 | } |
| 7374 | |
| 7375 | /* Now chain through the pending brackets, and fill in their length |
| 7376 | fields (which are holding the chain links pro tem). */ |
| 7377 | |
| 7378 | while (bralink != NULL) |
| 7379 | { |
| 7380 | int oldlinkoffset; |
| 7381 | int linkoffset = (int)(code - bralink + 1); |
| 7382 | PCRE2_UCHAR *bra = code - linkoffset; |
| 7383 | oldlinkoffset = GET(bra, 1); |
| 7384 | bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; |
| 7385 | *code++ = OP_KET; |
| 7386 | PUTINC(code, 0, linkoffset); |
| 7387 | PUT(bra, 1, linkoffset); |
| 7388 | } |
| 7389 | } |
| 7390 | |
| 7391 | /* If the maximum is unlimited, set a repeater in the final copy. For |
| 7392 | SCRIPT_RUN and ONCE brackets, that's all we need to do. However, |
| 7393 | possessively repeated ONCE brackets can be converted into non-capturing |
| 7394 | brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this |
| 7395 | saves having to deal with possessive ONCEs specially. |
| 7396 | |
| 7397 | Otherwise, when we are doing the actual compile phase, check to see |
| 7398 | whether this group is one that could match an empty string. If so, |
| 7399 | convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so |
| 7400 | that runtime checking can be done. [This check is also applied to ONCE |
| 7401 | and SCRIPT_RUN groups at runtime, but in a different way.] |
| 7402 | |
| 7403 | Then, if the quantifier was possessive and the bracket is not a |
| 7404 | conditional, we convert the BRA code to the POS form, and the KET code |
| 7405 | to KETRPOS. (It turns out to be convenient at runtime to detect this |
| 7406 | kind of subpattern at both the start and at the end.) The use of |
| 7407 | special opcodes makes it possible to reduce greatly the stack usage in |
| 7408 | pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to |
| 7409 | OP_BRAPOSZERO. |
| 7410 | |
| 7411 | Then, if the minimum number of matches is 1 or 0, cancel the possessive |
| 7412 | flag so that the default action below, of wrapping everything inside |
| 7413 | atomic brackets, does not happen. When the minimum is greater than 1, |
| 7414 | there will be earlier copies of the group, and so we still have to wrap |
| 7415 | the whole thing. */ |
| 7416 | |
| 7417 | else |
| 7418 | { |
| 7419 | PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE; |
| 7420 | PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1); |
| 7421 | |
| 7422 | /* Convert possessive ONCE brackets to non-capturing */ |
| 7423 | |
| 7424 | if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA; |
| 7425 | |
| 7426 | /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need |
| 7427 | to do is to set the KET. */ |
| 7428 | |
| 7429 | if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN) |
| 7430 | *ketcode = OP_KETRMAX + repeat_type; |
| 7431 | |
| 7432 | /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs |
| 7433 | (which have been converted to non-capturing above). */ |
| 7434 | |
| 7435 | else |
| 7436 | { |
| 7437 | /* In the compile phase, adjust the opcode if the group can match |
| 7438 | an empty string. For a conditional group with only one branch, the |
| 7439 | value of group_return will not show "could be empty", so we must |
| 7440 | check that separately. */ |
| 7441 | |
| 7442 | if (lengthptr == NULL) |
| 7443 | { |
| 7444 | if (group_return < 0) *bracode += OP_SBRA - OP_BRA; |
| 7445 | if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) |
| 7446 | *bracode = OP_SCOND; |
| 7447 | } |
| 7448 | |
| 7449 | /* Handle possessive quantifiers. */ |
| 7450 | |
| 7451 | if (possessive_quantifier) |
| 7452 | { |
| 7453 | /* For COND brackets, we wrap the whole thing in a possessively |
| 7454 | repeated non-capturing bracket, because we have not invented POS |
| 7455 | versions of the COND opcodes. */ |
| 7456 | |
| 7457 | if (*bracode == OP_COND || *bracode == OP_SCOND) |
| 7458 | { |
| 7459 | int nlen = (int)(code - bracode); |
| 7460 | (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); |
| 7461 | code += 1 + LINK_SIZE; |
| 7462 | nlen += 1 + LINK_SIZE; |
| 7463 | *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS; |
| 7464 | *code++ = OP_KETRPOS; |
| 7465 | PUTINC(code, 0, nlen); |
| 7466 | PUT(bracode, 1, nlen); |
| 7467 | } |
| 7468 | |
| 7469 | /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ |
| 7470 | |
| 7471 | else |
| 7472 | { |
| 7473 | *bracode += 1; /* Switch to xxxPOS opcodes */ |
| 7474 | *ketcode = OP_KETRPOS; |
| 7475 | } |
| 7476 | |
| 7477 | /* If the minimum is zero, mark it as possessive, then unset the |
| 7478 | possessive flag when the minimum is 0 or 1. */ |
| 7479 | |
| 7480 | if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; |
| 7481 | if (repeat_min < 2) possessive_quantifier = FALSE; |
| 7482 | } |
| 7483 | |
| 7484 | /* Non-possessive quantifier */ |
| 7485 | |
| 7486 | else *ketcode = OP_KETRMAX + repeat_type; |
| 7487 | } |
| 7488 | } |
| 7489 | } |
| 7490 | break; |
| 7491 | |
| 7492 | /* If previous was a character type match (\d or similar), abolish it and |
| 7493 | create a suitable repeat item. The code is shared with single-character |
| 7494 | repeats by setting op_type to add a suitable offset into repeat_type. |
| 7495 | Note that the Unicode property types will be present only when |
| 7496 | SUPPORT_UNICODE is defined, but we don't wrap the little bits of code |
| 7497 | here because it just makes it horribly messy. */ |
| 7498 | |
| 7499 | default: |
| 7500 | if (op_previous >= OP_EODN) /* Not a character type - internal error */ |
| 7501 | { |
| 7502 | *errorcodeptr = ERR10; |
| 7503 | return 0; |
| 7504 | } |
| 7505 | else |
| 7506 | { |
| 7507 | int prop_type, prop_value; |
| 7508 | PCRE2_UCHAR *oldcode; |
| 7509 | |
| 7510 | if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; |
| 7511 | |
| 7512 | op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ |
| 7513 | mclength = 0; /* Not a character */ |
| 7514 | |
| 7515 | if (op_previous == OP_PROP || op_previous == OP_NOTPROP) |
| 7516 | { |
| 7517 | prop_type = previous[1]; |
| 7518 | prop_value = previous[2]; |
| 7519 | } |
| 7520 | else |
| 7521 | { |
| 7522 | /* Come here from just above with a character in mcbuffer/mclength. */ |
| 7523 | OUTPUT_SINGLE_REPEAT: |
| 7524 | prop_type = prop_value = -1; |
| 7525 | } |
| 7526 | |
| 7527 | /* At this point, if prop_type == prop_value == -1 we either have a |
| 7528 | character in mcbuffer when mclength is greater than zero, or we have |
| 7529 | mclength zero, in which case there is a non-property character type in |
| 7530 | op_previous. If prop_type/value are not negative, we have a property |
| 7531 | character type in op_previous. */ |
| 7532 | |
| 7533 | oldcode = code; /* Save where we were */ |
| 7534 | code = previous; /* Usually overwrite previous item */ |
| 7535 | |
| 7536 | /* If the maximum is zero then the minimum must also be zero; Perl allows |
| 7537 | this case, so we do too - by simply omitting the item altogether. */ |
| 7538 | |
| 7539 | if (repeat_max == 0) goto END_REPEAT; |
| 7540 | |
| 7541 | /* Combine the op_type with the repeat_type */ |
| 7542 | |
| 7543 | repeat_type += op_type; |
| 7544 | |
| 7545 | /* A minimum of zero is handled either as the special case * or ?, or as |
| 7546 | an UPTO, with the maximum given. */ |
| 7547 | |
| 7548 | if (repeat_min == 0) |
| 7549 | { |
| 7550 | if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type; |
| 7551 | else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; |
| 7552 | else |
| 7553 | { |
| 7554 | *code++ = OP_UPTO + repeat_type; |
| 7555 | PUT2INC(code, 0, repeat_max); |
| 7556 | } |
| 7557 | } |
| 7558 | |
| 7559 | /* A repeat minimum of 1 is optimized into some special cases. If the |
| 7560 | maximum is unlimited, we use OP_PLUS. Otherwise, the original item is |
| 7561 | left in place and, if the maximum is greater than 1, we use OP_UPTO with |
| 7562 | one less than the maximum. */ |
| 7563 | |
| 7564 | else if (repeat_min == 1) |
| 7565 | { |
| 7566 | if (repeat_max == REPEAT_UNLIMITED) |
| 7567 | *code++ = OP_PLUS + repeat_type; |
| 7568 | else |
| 7569 | { |
| 7570 | code = oldcode; /* Leave previous item in place */ |
| 7571 | if (repeat_max == 1) goto END_REPEAT; |
| 7572 | *code++ = OP_UPTO + repeat_type; |
| 7573 | PUT2INC(code, 0, repeat_max - 1); |
| 7574 | } |
| 7575 | } |
| 7576 | |
| 7577 | /* The case {n,n} is just an EXACT, while the general case {n,m} is |
| 7578 | handled as an EXACT followed by an UPTO or STAR or QUERY. */ |
| 7579 | |
| 7580 | else |
| 7581 | { |
| 7582 | *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ |
| 7583 | PUT2INC(code, 0, repeat_min); |
| 7584 | |
| 7585 | /* Unless repeat_max equals repeat_min, fill in the data for EXACT, |
| 7586 | and then generate the second opcode. For a repeated Unicode property |
| 7587 | match, there are two extra values that define the required property, |
| 7588 | and mclength is set zero to indicate this. */ |
| 7589 | |
| 7590 | if (repeat_max != repeat_min) |
| 7591 | { |
| 7592 | if (mclength > 0) |
| 7593 | { |
| 7594 | memcpy(code, mcbuffer, CU2BYTES(mclength)); |
| 7595 | code += mclength; |
| 7596 | } |
| 7597 | else |
| 7598 | { |
| 7599 | *code++ = op_previous; |
| 7600 | if (prop_type >= 0) |
| 7601 | { |
| 7602 | *code++ = prop_type; |
| 7603 | *code++ = prop_value; |
| 7604 | } |
| 7605 | } |
| 7606 | |
| 7607 | /* Now set up the following opcode */ |
| 7608 | |
| 7609 | if (repeat_max == REPEAT_UNLIMITED) |
| 7610 | *code++ = OP_STAR + repeat_type; |
| 7611 | else |
| 7612 | { |
| 7613 | repeat_max -= repeat_min; |
| 7614 | if (repeat_max == 1) |
| 7615 | { |
| 7616 | *code++ = OP_QUERY + repeat_type; |
| 7617 | } |
| 7618 | else |
| 7619 | { |
| 7620 | *code++ = OP_UPTO + repeat_type; |
| 7621 | PUT2INC(code, 0, repeat_max); |
| 7622 | } |
| 7623 | } |
| 7624 | } |
| 7625 | } |
| 7626 | |
| 7627 | /* Fill in the character or character type for the final opcode. */ |
| 7628 | |
| 7629 | if (mclength > 0) |
| 7630 | { |
| 7631 | memcpy(code, mcbuffer, CU2BYTES(mclength)); |
| 7632 | code += mclength; |
| 7633 | } |
| 7634 | else |
| 7635 | { |
| 7636 | *code++ = op_previous; |
| 7637 | if (prop_type >= 0) |
| 7638 | { |
| 7639 | *code++ = prop_type; |
| 7640 | *code++ = prop_value; |
| 7641 | } |
| 7642 | } |
| 7643 | } |
| 7644 | break; |
| 7645 | } /* End of switch on different op_previous values */ |
| 7646 | |
| 7647 | |
| 7648 | /* If the character following a repeat is '+', possessive_quantifier is |
| 7649 | TRUE. For some opcodes, there are special alternative opcodes for this |
| 7650 | case. For anything else, we wrap the entire repeated item inside OP_ONCE |
| 7651 | brackets. Logically, the '+' notation is just syntactic sugar, taken from |
| 7652 | Sun's Java package, but the special opcodes can optimize it. |
| 7653 | |
| 7654 | Some (but not all) possessively repeated subpatterns have already been |
| 7655 | completely handled in the code just above. For them, possessive_quantifier |
| 7656 | is always FALSE at this stage. Note that the repeated item starts at |
| 7657 | tempcode, not at previous, which might be the first part of a string whose |
| 7658 | (former) last char we repeated. */ |
| 7659 | |
| 7660 | if (possessive_quantifier) |
| 7661 | { |
| 7662 | int len; |
| 7663 | |
| 7664 | /* Possessifying an EXACT quantifier has no effect, so we can ignore it. |
| 7665 | However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, |
| 7666 | {5,}, or {5,10}). We skip over an EXACT item; if the length of what |
| 7667 | remains is greater than zero, there's a further opcode that can be |
| 7668 | handled. If not, do nothing, leaving the EXACT alone. */ |
| 7669 | |
| 7670 | switch(*tempcode) |
| 7671 | { |
| 7672 | case OP_TYPEEXACT: |
| 7673 | tempcode += PRIV(OP_lengths)[*tempcode] + |
| 7674 | ((tempcode[1 + IMM2_SIZE] == OP_PROP |
| 7675 | || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); |
| 7676 | break; |
| 7677 | |
| 7678 | /* CHAR opcodes are used for exacts whose count is 1. */ |
| 7679 | |
| 7680 | case OP_CHAR: |
| 7681 | case OP_CHARI: |
| 7682 | case OP_NOT: |
| 7683 | case OP_NOTI: |
| 7684 | case OP_EXACT: |
| 7685 | case OP_EXACTI: |
| 7686 | case OP_NOTEXACT: |
| 7687 | case OP_NOTEXACTI: |
| 7688 | tempcode += PRIV(OP_lengths)[*tempcode]; |
| 7689 | #ifdef SUPPORT_UNICODE |
| 7690 | if (utf && HAS_EXTRALEN(tempcode[-1])) |
| 7691 | tempcode += GET_EXTRALEN(tempcode[-1]); |
| 7692 | #endif |
| 7693 | break; |
| 7694 | |
| 7695 | /* For the class opcodes, the repeat operator appears at the end; |
| 7696 | adjust tempcode to point to it. */ |
| 7697 | |
| 7698 | case OP_CLASS: |
| 7699 | case OP_NCLASS: |
| 7700 | tempcode += 1 + 32/sizeof(PCRE2_UCHAR); |
| 7701 | break; |
| 7702 | |
| 7703 | #ifdef SUPPORT_WIDE_CHARS |
| 7704 | case OP_XCLASS: |
| 7705 | tempcode += GET(tempcode, 1); |
| 7706 | break; |
| 7707 | #endif |
| 7708 | } |
| 7709 | |
| 7710 | /* If tempcode is equal to code (which points to the end of the repeated |
| 7711 | item), it means we have skipped an EXACT item but there is no following |
| 7712 | QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In |
| 7713 | all other cases, tempcode will be pointing to the repeat opcode, and will |
| 7714 | be less than code, so the value of len will be greater than 0. */ |
| 7715 | |
| 7716 | len = (int)(code - tempcode); |
| 7717 | if (len > 0) |
| 7718 | { |
| 7719 | unsigned int repcode = *tempcode; |
| 7720 | |
| 7721 | /* There is a table for possessifying opcodes, all of which are less |
| 7722 | than OP_CALLOUT. A zero entry means there is no possessified version. |
| 7723 | */ |
| 7724 | |
| 7725 | if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) |
| 7726 | *tempcode = opcode_possessify[repcode]; |
| 7727 | |
| 7728 | /* For an opcode without a special possessified version, wrap the item in |
| 7729 | ONCE brackets. */ |
| 7730 | |
| 7731 | else |
| 7732 | { |
| 7733 | (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); |
| 7734 | code += 1 + LINK_SIZE; |
| 7735 | len += 1 + LINK_SIZE; |
| 7736 | tempcode[0] = OP_ONCE; |
| 7737 | *code++ = OP_KET; |
| 7738 | PUTINC(code, 0, len); |
| 7739 | PUT(tempcode, 1, len); |
| 7740 | } |
| 7741 | } |
| 7742 | } |
| 7743 | |
| 7744 | /* We set the "follows varying string" flag for subsequently encountered |
| 7745 | reqcus if it isn't already set and we have just passed a varying length |
| 7746 | item. */ |
| 7747 | |
| 7748 | END_REPEAT: |
| 7749 | cb->req_varyopt |= reqvary; |
| 7750 | break; |
| 7751 | |
| 7752 | |
| 7753 | /* ===================================================================*/ |
| 7754 | /* Handle a 32-bit data character with a value greater than META_END. */ |
| 7755 | |
| 7756 | case META_BIGVALUE: |
| 7757 | pptr++; |
| 7758 | goto NORMAL_CHAR; |
| 7759 | |
| 7760 | |
| 7761 | /* ===============================================================*/ |
| 7762 | /* Handle a back reference by number, which is the meta argument. The |
| 7763 | pattern offsets for back references to group numbers less than 10 are held |
| 7764 | in a special vector, to avoid using more than two parsed pattern elements |
| 7765 | in 64-bit environments. We only need the offset to the first occurrence, |
| 7766 | because if that doesn't fail, subsequent ones will also be OK. */ |
| 7767 | |
| 7768 | case META_BACKREF: |
| 7769 | if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg]; |
| 7770 | else GETPLUSOFFSET(offset, pptr); |
| 7771 | |
| 7772 | if (meta_arg > cb->bracount) |
| 7773 | { |
| 7774 | cb->erroroffset = offset; |
| 7775 | *errorcodeptr = ERR15; /* Non-existent subpattern */ |
| 7776 | return 0; |
| 7777 | } |
| 7778 | |
| 7779 | /* Come here from named backref handling when the reference is to a |
| 7780 | single group (that is, not to a duplicated name). The back reference |
| 7781 | data will have already been updated. We must disable firstcu if not |
| 7782 | set, to cope with cases like (?=(\w+))\1: which would otherwise set ':' |
| 7783 | later. */ |
| 7784 | |
| 7785 | HANDLE_SINGLE_REFERENCE: |
| 7786 | if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE; |
| 7787 | *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; |
| 7788 | PUT2INC(code, 0, meta_arg); |
| 7789 | |
| 7790 | /* Update the map of back references, and keep the highest one. We |
| 7791 | could do this in parse_regex() for numerical back references, but not |
| 7792 | for named back references, because we don't know the numbers to which |
| 7793 | named back references refer. So we do it all in this function. */ |
| 7794 | |
| 7795 | cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1; |
| 7796 | if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; |
| 7797 | break; |
| 7798 | |
| 7799 | |
| 7800 | /* ===============================================================*/ |
| 7801 | /* Handle recursion by inserting the number of the called group (which is |
| 7802 | the meta argument) after OP_RECURSE. At the end of compiling the pattern is |
| 7803 | scanned and these numbers are replaced by offsets within the pattern. It is |
| 7804 | done like this to avoid problems with forward references and adjusting |
| 7805 | offsets when groups are duplicated and moved (as discovered in previous |
| 7806 | implementations). Note that a recursion does not have a set first |
| 7807 | character. */ |
| 7808 | |
| 7809 | case META_RECURSE: |
| 7810 | GETPLUSOFFSET(offset, pptr); |
| 7811 | if (meta_arg > cb->bracount) |
| 7812 | { |
| 7813 | cb->erroroffset = offset; |
| 7814 | *errorcodeptr = ERR15; /* Non-existent subpattern */ |
| 7815 | return 0; |
| 7816 | } |
| 7817 | HANDLE_NUMERICAL_RECURSION: |
| 7818 | *code = OP_RECURSE; |
| 7819 | PUT(code, 1, meta_arg); |
| 7820 | code += 1 + LINK_SIZE; |
| 7821 | groupsetfirstcu = FALSE; |
| 7822 | cb->had_recurse = TRUE; |
| 7823 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 7824 | zerofirstcu = firstcu; |
| 7825 | zerofirstcuflags = firstcuflags; |
| 7826 | break; |
| 7827 | |
| 7828 | |
| 7829 | /* ===============================================================*/ |
| 7830 | /* Handle capturing parentheses; the number is the meta argument. */ |
| 7831 | |
| 7832 | case META_CAPTURE: |
| 7833 | bravalue = OP_CBRA; |
| 7834 | skipunits = IMM2_SIZE; |
| 7835 | PUT2(code, 1+LINK_SIZE, meta_arg); |
| 7836 | cb->lastcapture = meta_arg; |
| 7837 | goto GROUP_PROCESS_NOTE_EMPTY; |
| 7838 | |
| 7839 | |
| 7840 | /* ===============================================================*/ |
| 7841 | /* Handle escape sequence items. For ones like \d, the ESC_values are |
| 7842 | arranged to be the same as the corresponding OP_values in the default case |
| 7843 | when PCRE2_UCP is not set (which is the only case in which they will appear |
| 7844 | here). |
| 7845 | |
| 7846 | Note: \Q and \E are never seen here, as they were dealt with in |
| 7847 | parse_regex(). Neither are numerical back references or recursions, which |
| 7848 | were turned into META_BACKREF or META_RECURSE items, respectively. \k and |
| 7849 | \g, when followed by names, are turned into META_BACKREF_BYNAME or |
| 7850 | META_RECURSE_BYNAME. */ |
| 7851 | |
| 7852 | case META_ESCAPE: |
| 7853 | |
| 7854 | /* We can test for escape sequences that consume a character because their |
| 7855 | values lie between ESC_b and ESC_Z; this may have to change if any new ones |
| 7856 | are ever created. For these sequences, we disable the setting of a first |
| 7857 | character if it hasn't already been set. */ |
| 7858 | |
| 7859 | if (meta_arg > ESC_b && meta_arg < ESC_Z) |
| 7860 | { |
| 7861 | matched_char = TRUE; |
| 7862 | if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; |
| 7863 | } |
| 7864 | |
| 7865 | /* Set values to reset to if this is followed by a zero repeat. */ |
| 7866 | |
| 7867 | zerofirstcu = firstcu; |
| 7868 | zerofirstcuflags = firstcuflags; |
| 7869 | zeroreqcu = reqcu; |
| 7870 | zeroreqcuflags = reqcuflags; |
| 7871 | |
| 7872 | /* If Unicode is not supported, \P and \p are not allowed and are |
| 7873 | faulted at parse time, so will never appear here. */ |
| 7874 | |
| 7875 | #ifdef SUPPORT_UNICODE |
| 7876 | if (meta_arg == ESC_P || meta_arg == ESC_p) |
| 7877 | { |
| 7878 | uint32_t ptype = *(++pptr) >> 16; |
| 7879 | uint32_t pdata = *pptr & 0xffff; |
| 7880 | |
| 7881 | /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit |
| 7882 | from the auto-anchoring code. */ |
| 7883 | |
| 7884 | if (meta_arg == ESC_p && ptype == PT_ANY) |
| 7885 | { |
| 7886 | *code++ = OP_ALLANY; |
| 7887 | } |
| 7888 | else |
| 7889 | { |
| 7890 | *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP; |
| 7891 | *code++ = ptype; |
| 7892 | *code++ = pdata; |
| 7893 | } |
| 7894 | break; /* End META_ESCAPE */ |
| 7895 | } |
| 7896 | #endif |
| 7897 | |
| 7898 | /* \K is forbidden in lookarounds since 10.38 because that's what Perl has |
| 7899 | done. However, there's an option, in case anyone was relying on it. */ |
| 7900 | |
| 7901 | if (cb->assert_depth > 0 && meta_arg == ESC_K && |
| 7902 | (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0) |
| 7903 | { |
| 7904 | *errorcodeptr = ERR99; |
| 7905 | return 0; |
| 7906 | } |
| 7907 | |
| 7908 | /* For the rest (including \X when Unicode is supported - if not it's |
| 7909 | faulted at parse time), the OP value is the escape value when PCRE2_UCP is |
| 7910 | not set; if it is set, these escapes do not show up here because they are |
| 7911 | converted into Unicode property tests in parse_regex(). Note that \b and \B |
| 7912 | do a one-character lookbehind, and \A also behaves as if it does. */ |
| 7913 | |
| 7914 | if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */ |
| 7915 | if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) && |
| 7916 | cb->max_lookbehind == 0) |
| 7917 | cb->max_lookbehind = 1; |
| 7918 | |
| 7919 | /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY |
| 7920 | instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */ |
| 7921 | |
| 7922 | #if PCRE2_CODE_UNIT_WIDTH == 32 |
| 7923 | *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg; |
| 7924 | #else |
| 7925 | *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg; |
| 7926 | #endif |
| 7927 | break; /* End META_ESCAPE */ |
| 7928 | |
| 7929 | |
| 7930 | /* ===================================================================*/ |
| 7931 | /* Handle an unrecognized meta value. A parsed pattern value less than |
| 7932 | META_END is a literal. Otherwise we have a problem. */ |
| 7933 | |
| 7934 | default: |
| 7935 | if (meta >= META_END) |
| 7936 | { |
| 7937 | #ifdef DEBUG_SHOW_PARSED |
| 7938 | fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr); |
| 7939 | #endif |
| 7940 | *errorcodeptr = ERR89; /* Internal error - unrecognized. */ |
| 7941 | return 0; |
| 7942 | } |
| 7943 | |
| 7944 | /* Handle a literal character. We come here by goto in the case of a |
| 7945 | 32-bit, non-UTF character whose value is greater than META_END. */ |
| 7946 | |
| 7947 | NORMAL_CHAR: |
| 7948 | meta = *pptr; /* Get the full 32 bits */ |
| 7949 | NORMAL_CHAR_SET: /* Character is already in meta */ |
| 7950 | matched_char = TRUE; |
| 7951 | |
| 7952 | /* For caseless UTF or UCP mode, check whether this character has more than |
| 7953 | one other case. If so, generate a special OP_PROP item instead of OP_CHARI. |
| 7954 | */ |
| 7955 | |
| 7956 | #ifdef SUPPORT_UNICODE |
| 7957 | if ((utf||ucp) && (options & PCRE2_CASELESS) != 0) |
| 7958 | { |
| 7959 | uint32_t caseset = UCD_CASESET(meta); |
| 7960 | if (caseset != 0) |
| 7961 | { |
| 7962 | *code++ = OP_PROP; |
| 7963 | *code++ = PT_CLIST; |
| 7964 | *code++ = caseset; |
| 7965 | if (firstcuflags == REQ_UNSET) |
| 7966 | firstcuflags = zerofirstcuflags = REQ_NONE; |
| 7967 | break; /* End handling this meta item */ |
| 7968 | } |
| 7969 | } |
| 7970 | #endif |
| 7971 | |
| 7972 | /* Caseful matches, or caseless and not one of the multicase characters. We |
| 7973 | come here by goto in the case of a positive class that contains only |
| 7974 | case-partners of a character with just two cases; matched_char has already |
| 7975 | been set TRUE and options fudged if necessary. */ |
| 7976 | |
| 7977 | CLASS_CASELESS_CHAR: |
| 7978 | |
| 7979 | /* Get the character's code units into mcbuffer, with the length in |
| 7980 | mclength. When not in UTF mode, the length is always 1. */ |
| 7981 | |
| 7982 | #ifdef SUPPORT_UNICODE |
| 7983 | if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else |
| 7984 | #endif |
| 7985 | { |
| 7986 | mclength = 1; |
| 7987 | mcbuffer[0] = meta; |
| 7988 | } |
| 7989 | |
| 7990 | /* Generate the appropriate code */ |
| 7991 | |
| 7992 | *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR; |
| 7993 | memcpy(code, mcbuffer, CU2BYTES(mclength)); |
| 7994 | code += mclength; |
| 7995 | |
| 7996 | /* Remember if \r or \n were seen */ |
| 7997 | |
| 7998 | if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) |
| 7999 | cb->external_flags |= PCRE2_HASCRORLF; |
| 8000 | |
| 8001 | /* Set the first and required code units appropriately. If no previous |
| 8002 | first code unit, set it from this character, but revert to none on a zero |
| 8003 | repeat. Otherwise, leave the firstcu value alone, and don't change it on |
| 8004 | a zero repeat. */ |
| 8005 | |
| 8006 | if (firstcuflags == REQ_UNSET) |
| 8007 | { |
| 8008 | zerofirstcuflags = REQ_NONE; |
| 8009 | zeroreqcu = reqcu; |
| 8010 | zeroreqcuflags = reqcuflags; |
| 8011 | |
| 8012 | /* If the character is more than one code unit long, we can set a single |
| 8013 | firstcu only if it is not to be matched caselessly. Multiple possible |
| 8014 | starting code units may be picked up later in the studying code. */ |
| 8015 | |
| 8016 | if (mclength == 1 || req_caseopt == 0) |
| 8017 | { |
| 8018 | firstcu = mcbuffer[0]; |
| 8019 | firstcuflags = req_caseopt; |
| 8020 | if (mclength != 1) |
| 8021 | { |
| 8022 | reqcu = code[-1]; |
| 8023 | reqcuflags = cb->req_varyopt; |
| 8024 | } |
| 8025 | } |
| 8026 | else firstcuflags = reqcuflags = REQ_NONE; |
| 8027 | } |
| 8028 | |
| 8029 | /* firstcu was previously set; we can set reqcu only if the length is |
| 8030 | 1 or the matching is caseful. */ |
| 8031 | |
| 8032 | else |
| 8033 | { |
| 8034 | zerofirstcu = firstcu; |
| 8035 | zerofirstcuflags = firstcuflags; |
| 8036 | zeroreqcu = reqcu; |
| 8037 | zeroreqcuflags = reqcuflags; |
| 8038 | if (mclength == 1 || req_caseopt == 0) |
| 8039 | { |
| 8040 | reqcu = code[-1]; |
| 8041 | reqcuflags = req_caseopt | cb->req_varyopt; |
| 8042 | } |
| 8043 | } |
| 8044 | |
| 8045 | /* If caselessness was temporarily instated, reset it. */ |
| 8046 | |
| 8047 | if (reset_caseful) |
| 8048 | { |
| 8049 | options &= ~PCRE2_CASELESS; |
| 8050 | req_caseopt = 0; |
| 8051 | reset_caseful = FALSE; |
| 8052 | } |
| 8053 | |
| 8054 | break; /* End literal character handling */ |
| 8055 | } /* End of big switch */ |
| 8056 | } /* End of big loop */ |
| 8057 | |
| 8058 | /* Control never reaches here. */ |
| 8059 | } |
| 8060 | |
| 8061 | |
| 8062 | |
| 8063 | /************************************************* |
| 8064 | * Compile regex: a sequence of alternatives * |
| 8065 | *************************************************/ |
| 8066 | |
| 8067 | /* On entry, pptr is pointing past the bracket meta, but on return it points to |
| 8068 | the closing bracket or META_END. The code variable is pointing at the code unit |
| 8069 | into which the BRA operator has been stored. This function is used during the |
| 8070 | pre-compile phase when we are trying to find out the amount of memory needed, |
| 8071 | as well as during the real compile phase. The value of lengthptr distinguishes |
| 8072 | the two phases. |
| 8073 | |
| 8074 | Arguments: |
| 8075 | options option bits, including any changes for this subpattern |
| 8076 | codeptr -> the address of the current code pointer |
| 8077 | pptrptr -> the address of the current parsed pattern pointer |
| 8078 | errorcodeptr -> pointer to error code variable |
| 8079 | skipunits skip this many code units at start (for brackets and OP_COND) |
| 8080 | firstcuptr place to put the first required code unit |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8081 | firstcuflagsptr place to put the first code unit flags |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8082 | reqcuptr place to put the last required code unit |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8083 | reqcuflagsptr place to put the last required code unit flags |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8084 | bcptr pointer to the chain of currently open branches |
| 8085 | cb points to the data block with tables pointers etc. |
| 8086 | lengthptr NULL during the real compile phase |
| 8087 | points to length accumulator during pre-compile phase |
| 8088 | |
| 8089 | Returns: 0 There has been an error |
| 8090 | +1 Success, this group must match at least one character |
| 8091 | -1 Success, this group may match an empty string |
| 8092 | */ |
| 8093 | |
| 8094 | static int |
| 8095 | compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, |
| 8096 | int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr, |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8097 | uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr, |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8098 | branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr) |
| 8099 | { |
| 8100 | PCRE2_UCHAR *code = *codeptr; |
| 8101 | PCRE2_UCHAR *last_branch = code; |
| 8102 | PCRE2_UCHAR *start_bracket = code; |
| 8103 | BOOL lookbehind; |
| 8104 | open_capitem capitem; |
| 8105 | int capnumber = 0; |
| 8106 | int okreturn = 1; |
| 8107 | uint32_t *pptr = *pptrptr; |
| 8108 | uint32_t firstcu, reqcu; |
| 8109 | uint32_t lookbehindlength; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8110 | uint32_t firstcuflags, reqcuflags; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8111 | uint32_t branchfirstcu, branchreqcu; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8112 | uint32_t branchfirstcuflags, branchreqcuflags; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8113 | PCRE2_SIZE length; |
| 8114 | branch_chain bc; |
| 8115 | |
| 8116 | /* If set, call the external function that checks for stack availability. */ |
| 8117 | |
| 8118 | if (cb->cx->stack_guard != NULL && |
| 8119 | cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data)) |
| 8120 | { |
| 8121 | *errorcodeptr= ERR33; |
| 8122 | return 0; |
| 8123 | } |
| 8124 | |
| 8125 | /* Miscellaneous initialization */ |
| 8126 | |
| 8127 | bc.outer = bcptr; |
| 8128 | bc.current_branch = code; |
| 8129 | |
| 8130 | firstcu = reqcu = 0; |
| 8131 | firstcuflags = reqcuflags = REQ_UNSET; |
| 8132 | |
| 8133 | /* Accumulate the length for use in the pre-compile phase. Start with the |
| 8134 | length of the BRA and KET and any extra code units that are required at the |
| 8135 | beginning. We accumulate in a local variable to save frequent testing of |
| 8136 | lengthptr for NULL. We cannot do this by looking at the value of 'code' at the |
| 8137 | start and end of each alternative, because compiled items are discarded during |
| 8138 | the pre-compile phase so that the workspace is not exceeded. */ |
| 8139 | |
| 8140 | length = 2 + 2*LINK_SIZE + skipunits; |
| 8141 | |
| 8142 | /* Remember if this is a lookbehind assertion, and if it is, save its length |
| 8143 | and skip over the pattern offset. */ |
| 8144 | |
| 8145 | lookbehind = *code == OP_ASSERTBACK || |
| 8146 | *code == OP_ASSERTBACK_NOT || |
| 8147 | *code == OP_ASSERTBACK_NA; |
| 8148 | |
| 8149 | if (lookbehind) |
| 8150 | { |
| 8151 | lookbehindlength = META_DATA(pptr[-1]); |
| 8152 | pptr += SIZEOFFSET; |
| 8153 | } |
| 8154 | else lookbehindlength = 0; |
| 8155 | |
| 8156 | /* If this is a capturing subpattern, add to the chain of open capturing items |
| 8157 | so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA |
| 8158 | need be tested here; changing this opcode to one of its variants, e.g. |
| 8159 | OP_SCBRAPOS, happens later, after the group has been compiled. */ |
| 8160 | |
| 8161 | if (*code == OP_CBRA) |
| 8162 | { |
| 8163 | capnumber = GET2(code, 1 + LINK_SIZE); |
| 8164 | capitem.number = capnumber; |
| 8165 | capitem.next = cb->open_caps; |
| 8166 | capitem.assert_depth = cb->assert_depth; |
| 8167 | cb->open_caps = &capitem; |
| 8168 | } |
| 8169 | |
| 8170 | /* Offset is set zero to mark that this bracket is still open */ |
| 8171 | |
| 8172 | PUT(code, 1, 0); |
| 8173 | code += 1 + LINK_SIZE + skipunits; |
| 8174 | |
| 8175 | /* Loop for each alternative branch */ |
| 8176 | |
| 8177 | for (;;) |
| 8178 | { |
| 8179 | int branch_return; |
| 8180 | |
| 8181 | /* Insert OP_REVERSE if this is as lookbehind assertion. */ |
| 8182 | |
| 8183 | if (lookbehind && lookbehindlength > 0) |
| 8184 | { |
| 8185 | *code++ = OP_REVERSE; |
| 8186 | PUTINC(code, 0, lookbehindlength); |
| 8187 | length += 1 + LINK_SIZE; |
| 8188 | } |
| 8189 | |
| 8190 | /* Now compile the branch; in the pre-compile phase its length gets added |
| 8191 | into the length. */ |
| 8192 | |
| 8193 | if ((branch_return = |
| 8194 | compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu, |
| 8195 | &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc, |
| 8196 | cb, (lengthptr == NULL)? NULL : &length)) == 0) |
| 8197 | return 0; |
| 8198 | |
| 8199 | /* If a branch can match an empty string, so can the whole group. */ |
| 8200 | |
| 8201 | if (branch_return < 0) okreturn = -1; |
| 8202 | |
| 8203 | /* In the real compile phase, there is some post-processing to be done. */ |
| 8204 | |
| 8205 | if (lengthptr == NULL) |
| 8206 | { |
| 8207 | /* If this is the first branch, the firstcu and reqcu values for the |
| 8208 | branch become the values for the regex. */ |
| 8209 | |
| 8210 | if (*last_branch != OP_ALT) |
| 8211 | { |
| 8212 | firstcu = branchfirstcu; |
| 8213 | firstcuflags = branchfirstcuflags; |
| 8214 | reqcu = branchreqcu; |
| 8215 | reqcuflags = branchreqcuflags; |
| 8216 | } |
| 8217 | |
| 8218 | /* If this is not the first branch, the first char and reqcu have to |
| 8219 | match the values from all the previous branches, except that if the |
| 8220 | previous value for reqcu didn't have REQ_VARY set, it can still match, |
| 8221 | and we set REQ_VARY for the group from this branch's value. */ |
| 8222 | |
| 8223 | else |
| 8224 | { |
| 8225 | /* If we previously had a firstcu, but it doesn't match the new branch, |
| 8226 | we have to abandon the firstcu for the regex, but if there was |
| 8227 | previously no reqcu, it takes on the value of the old firstcu. */ |
| 8228 | |
| 8229 | if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu) |
| 8230 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8231 | if (firstcuflags < REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8232 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8233 | if (reqcuflags >= REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8234 | { |
| 8235 | reqcu = firstcu; |
| 8236 | reqcuflags = firstcuflags; |
| 8237 | } |
| 8238 | } |
| 8239 | firstcuflags = REQ_NONE; |
| 8240 | } |
| 8241 | |
| 8242 | /* If we (now or from before) have no firstcu, a firstcu from the |
| 8243 | branch becomes a reqcu if there isn't a branch reqcu. */ |
| 8244 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8245 | if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE && |
| 8246 | branchreqcuflags >= REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8247 | { |
| 8248 | branchreqcu = branchfirstcu; |
| 8249 | branchreqcuflags = branchfirstcuflags; |
| 8250 | } |
| 8251 | |
| 8252 | /* Now ensure that the reqcus match */ |
| 8253 | |
| 8254 | if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) || |
| 8255 | reqcu != branchreqcu) |
| 8256 | reqcuflags = REQ_NONE; |
| 8257 | else |
| 8258 | { |
| 8259 | reqcu = branchreqcu; |
| 8260 | reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */ |
| 8261 | } |
| 8262 | } |
| 8263 | } |
| 8264 | |
| 8265 | /* Handle reaching the end of the expression, either ')' or end of pattern. |
| 8266 | In the real compile phase, go back through the alternative branches and |
| 8267 | reverse the chain of offsets, with the field in the BRA item now becoming an |
| 8268 | offset to the first alternative. If there are no alternatives, it points to |
| 8269 | the end of the group. The length in the terminating ket is always the length |
| 8270 | of the whole bracketed item. Return leaving the pointer at the terminating |
| 8271 | char. */ |
| 8272 | |
| 8273 | if (META_CODE(*pptr) != META_ALT) |
| 8274 | { |
| 8275 | if (lengthptr == NULL) |
| 8276 | { |
| 8277 | PCRE2_SIZE branch_length = code - last_branch; |
| 8278 | do |
| 8279 | { |
| 8280 | PCRE2_SIZE prev_length = GET(last_branch, 1); |
| 8281 | PUT(last_branch, 1, branch_length); |
| 8282 | branch_length = prev_length; |
| 8283 | last_branch -= branch_length; |
| 8284 | } |
| 8285 | while (branch_length > 0); |
| 8286 | } |
| 8287 | |
| 8288 | /* Fill in the ket */ |
| 8289 | |
| 8290 | *code = OP_KET; |
| 8291 | PUT(code, 1, (int)(code - start_bracket)); |
| 8292 | code += 1 + LINK_SIZE; |
| 8293 | |
| 8294 | /* If it was a capturing subpattern, remove the block from the chain. */ |
| 8295 | |
| 8296 | if (capnumber > 0) cb->open_caps = cb->open_caps->next; |
| 8297 | |
| 8298 | /* Set values to pass back */ |
| 8299 | |
| 8300 | *codeptr = code; |
| 8301 | *pptrptr = pptr; |
| 8302 | *firstcuptr = firstcu; |
| 8303 | *firstcuflagsptr = firstcuflags; |
| 8304 | *reqcuptr = reqcu; |
| 8305 | *reqcuflagsptr = reqcuflags; |
| 8306 | if (lengthptr != NULL) |
| 8307 | { |
| 8308 | if (OFLOW_MAX - *lengthptr < length) |
| 8309 | { |
| 8310 | *errorcodeptr = ERR20; |
| 8311 | return 0; |
| 8312 | } |
| 8313 | *lengthptr += length; |
| 8314 | } |
| 8315 | return okreturn; |
| 8316 | } |
| 8317 | |
| 8318 | /* Another branch follows. In the pre-compile phase, we can move the code |
| 8319 | pointer back to where it was for the start of the first branch. (That is, |
| 8320 | pretend that each branch is the only one.) |
| 8321 | |
| 8322 | In the real compile phase, insert an ALT node. Its length field points back |
| 8323 | to the previous branch while the bracket remains open. At the end the chain |
| 8324 | is reversed. It's done like this so that the start of the bracket has a |
| 8325 | zero offset until it is closed, making it possible to detect recursion. */ |
| 8326 | |
| 8327 | if (lengthptr != NULL) |
| 8328 | { |
| 8329 | code = *codeptr + 1 + LINK_SIZE + skipunits; |
| 8330 | length += 1 + LINK_SIZE; |
| 8331 | } |
| 8332 | else |
| 8333 | { |
| 8334 | *code = OP_ALT; |
| 8335 | PUT(code, 1, (int)(code - last_branch)); |
| 8336 | bc.current_branch = last_branch = code; |
| 8337 | code += 1 + LINK_SIZE; |
| 8338 | } |
| 8339 | |
| 8340 | /* Set the lookbehind length (if not in a lookbehind the value will be zero) |
| 8341 | and then advance past the vertical bar. */ |
| 8342 | |
| 8343 | lookbehindlength = META_DATA(*pptr); |
| 8344 | pptr++; |
| 8345 | } |
| 8346 | /* Control never reaches here */ |
| 8347 | } |
| 8348 | |
| 8349 | |
| 8350 | |
| 8351 | /************************************************* |
| 8352 | * Check for anchored pattern * |
| 8353 | *************************************************/ |
| 8354 | |
| 8355 | /* Try to find out if this is an anchored regular expression. Consider each |
| 8356 | alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket |
| 8357 | all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then |
| 8358 | it's anchored. However, if this is a multiline pattern, then only OP_SOD will |
| 8359 | be found, because ^ generates OP_CIRCM in that mode. |
| 8360 | |
| 8361 | We can also consider a regex to be anchored if OP_SOM starts all its branches. |
| 8362 | This is the code for \G, which means "match at start of match position, taking |
| 8363 | into account the match offset". |
| 8364 | |
| 8365 | A branch is also implicitly anchored if it starts with .* and DOTALL is set, |
| 8366 | because that will try the rest of the pattern at all possible matching points, |
| 8367 | so there is no point trying again.... er .... |
| 8368 | |
| 8369 | .... except when the .* appears inside capturing parentheses, and there is a |
| 8370 | subsequent back reference to those parentheses. We haven't enough information |
| 8371 | to catch that case precisely. |
| 8372 | |
| 8373 | At first, the best we could do was to detect when .* was in capturing brackets |
| 8374 | and the highest back reference was greater than or equal to that level. |
| 8375 | However, by keeping a bitmap of the first 31 back references, we can catch some |
| 8376 | of the more common cases more precisely. |
| 8377 | |
| 8378 | ... A second exception is when the .* appears inside an atomic group, because |
| 8379 | this prevents the number of characters it matches from being adjusted. |
| 8380 | |
| 8381 | Arguments: |
| 8382 | code points to start of the compiled pattern |
| 8383 | bracket_map a bitmap of which brackets we are inside while testing; this |
| 8384 | handles up to substring 31; after that we just have to take |
| 8385 | the less precise approach |
| 8386 | cb points to the compile data block |
| 8387 | atomcount atomic group level |
| 8388 | inassert TRUE if in an assertion |
| 8389 | |
| 8390 | Returns: TRUE or FALSE |
| 8391 | */ |
| 8392 | |
| 8393 | static BOOL |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8394 | is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb, |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8395 | int atomcount, BOOL inassert) |
| 8396 | { |
| 8397 | do { |
| 8398 | PCRE2_SPTR scode = first_significant_code( |
| 8399 | code + PRIV(OP_lengths)[*code], FALSE); |
| 8400 | int op = *scode; |
| 8401 | |
| 8402 | /* Non-capturing brackets */ |
| 8403 | |
| 8404 | if (op == OP_BRA || op == OP_BRAPOS || |
| 8405 | op == OP_SBRA || op == OP_SBRAPOS) |
| 8406 | { |
| 8407 | if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) |
| 8408 | return FALSE; |
| 8409 | } |
| 8410 | |
| 8411 | /* Capturing brackets */ |
| 8412 | |
| 8413 | else if (op == OP_CBRA || op == OP_CBRAPOS || |
| 8414 | op == OP_SCBRA || op == OP_SCBRAPOS) |
| 8415 | { |
| 8416 | int n = GET2(scode, 1+LINK_SIZE); |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8417 | uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1); |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8418 | if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE; |
| 8419 | } |
| 8420 | |
| 8421 | /* Positive forward assertion */ |
| 8422 | |
| 8423 | else if (op == OP_ASSERT || op == OP_ASSERT_NA) |
| 8424 | { |
| 8425 | if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; |
| 8426 | } |
| 8427 | |
| 8428 | /* Condition. If there is no second branch, it can't be anchored. */ |
| 8429 | |
| 8430 | else if (op == OP_COND || op == OP_SCOND) |
| 8431 | { |
| 8432 | if (scode[GET(scode,1)] != OP_ALT) return FALSE; |
| 8433 | if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) |
| 8434 | return FALSE; |
| 8435 | } |
| 8436 | |
| 8437 | /* Atomic groups */ |
| 8438 | |
| 8439 | else if (op == OP_ONCE) |
| 8440 | { |
| 8441 | if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert)) |
| 8442 | return FALSE; |
| 8443 | } |
| 8444 | |
| 8445 | /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and |
| 8446 | it isn't in brackets that are or may be referenced or inside an atomic |
| 8447 | group or an assertion. Also the pattern must not contain *PRUNE or *SKIP, |
| 8448 | because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/ |
| 8449 | with the subject "aab", which matches "b", i.e. not at the start of a line. |
| 8450 | There is also an option that disables auto-anchoring. */ |
| 8451 | |
| 8452 | else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || |
| 8453 | op == OP_TYPEPOSSTAR)) |
| 8454 | { |
| 8455 | if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || |
| 8456 | atomcount > 0 || cb->had_pruneorskip || inassert || |
| 8457 | (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) |
| 8458 | return FALSE; |
| 8459 | } |
| 8460 | |
| 8461 | /* Check for explicit anchoring */ |
| 8462 | |
| 8463 | else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; |
| 8464 | |
| 8465 | code += GET(code, 1); |
| 8466 | } |
| 8467 | while (*code == OP_ALT); /* Loop for each alternative */ |
| 8468 | return TRUE; |
| 8469 | } |
| 8470 | |
| 8471 | |
| 8472 | |
| 8473 | /************************************************* |
| 8474 | * Check for starting with ^ or .* * |
| 8475 | *************************************************/ |
| 8476 | |
| 8477 | /* This is called to find out if every branch starts with ^ or .* so that |
| 8478 | "first char" processing can be done to speed things up in multiline |
| 8479 | matching and for non-DOTALL patterns that start with .* (which must start at |
| 8480 | the beginning or after \n). As in the case of is_anchored() (see above), we |
| 8481 | have to take account of back references to capturing brackets that contain .* |
| 8482 | because in that case we can't make the assumption. Also, the appearance of .* |
| 8483 | inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE |
| 8484 | or *SKIP does not count, because once again the assumption no longer holds. |
| 8485 | |
| 8486 | Arguments: |
| 8487 | code points to start of the compiled pattern or a group |
| 8488 | bracket_map a bitmap of which brackets we are inside while testing; this |
| 8489 | handles up to substring 31; after that we just have to take |
| 8490 | the less precise approach |
| 8491 | cb points to the compile data |
| 8492 | atomcount atomic group level |
| 8493 | inassert TRUE if in an assertion |
| 8494 | |
| 8495 | Returns: TRUE or FALSE |
| 8496 | */ |
| 8497 | |
| 8498 | static BOOL |
| 8499 | is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, |
| 8500 | int atomcount, BOOL inassert) |
| 8501 | { |
| 8502 | do { |
| 8503 | PCRE2_SPTR scode = first_significant_code( |
| 8504 | code + PRIV(OP_lengths)[*code], FALSE); |
| 8505 | int op = *scode; |
| 8506 | |
| 8507 | /* If we are at the start of a conditional assertion group, *both* the |
| 8508 | conditional assertion *and* what follows the condition must satisfy the test |
| 8509 | for start of line. Other kinds of condition fail. Note that there may be an |
| 8510 | auto-callout at the start of a condition. */ |
| 8511 | |
| 8512 | if (op == OP_COND) |
| 8513 | { |
| 8514 | scode += 1 + LINK_SIZE; |
| 8515 | |
| 8516 | if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; |
| 8517 | else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); |
| 8518 | |
| 8519 | switch (*scode) |
| 8520 | { |
| 8521 | case OP_CREF: |
| 8522 | case OP_DNCREF: |
| 8523 | case OP_RREF: |
| 8524 | case OP_DNRREF: |
| 8525 | case OP_FAIL: |
| 8526 | case OP_FALSE: |
| 8527 | case OP_TRUE: |
| 8528 | return FALSE; |
| 8529 | |
| 8530 | default: /* Assertion */ |
| 8531 | if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; |
| 8532 | do scode += GET(scode, 1); while (*scode == OP_ALT); |
| 8533 | scode += 1 + LINK_SIZE; |
| 8534 | break; |
| 8535 | } |
| 8536 | scode = first_significant_code(scode, FALSE); |
| 8537 | op = *scode; |
| 8538 | } |
| 8539 | |
| 8540 | /* Non-capturing brackets */ |
| 8541 | |
| 8542 | if (op == OP_BRA || op == OP_BRAPOS || |
| 8543 | op == OP_SBRA || op == OP_SBRAPOS) |
| 8544 | { |
| 8545 | if (!is_startline(scode, bracket_map, cb, atomcount, inassert)) |
| 8546 | return FALSE; |
| 8547 | } |
| 8548 | |
| 8549 | /* Capturing brackets */ |
| 8550 | |
| 8551 | else if (op == OP_CBRA || op == OP_CBRAPOS || |
| 8552 | op == OP_SCBRA || op == OP_SCBRAPOS) |
| 8553 | { |
| 8554 | int n = GET2(scode, 1+LINK_SIZE); |
| 8555 | int new_map = bracket_map | ((n < 32)? (1u << n) : 1); |
| 8556 | if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE; |
| 8557 | } |
| 8558 | |
| 8559 | /* Positive forward assertions */ |
| 8560 | |
| 8561 | else if (op == OP_ASSERT || op == OP_ASSERT_NA) |
| 8562 | { |
| 8563 | if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) |
| 8564 | return FALSE; |
| 8565 | } |
| 8566 | |
| 8567 | /* Atomic brackets */ |
| 8568 | |
| 8569 | else if (op == OP_ONCE) |
| 8570 | { |
| 8571 | if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert)) |
| 8572 | return FALSE; |
| 8573 | } |
| 8574 | |
| 8575 | /* .* means "start at start or after \n" if it isn't in atomic brackets or |
| 8576 | brackets that may be referenced or an assertion, and as long as the pattern |
| 8577 | does not contain *PRUNE or *SKIP, because these break the feature. Consider, |
| 8578 | for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", |
| 8579 | i.e. not at the start of a line. There is also an option that disables this |
| 8580 | optimization. */ |
| 8581 | |
| 8582 | else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) |
| 8583 | { |
| 8584 | if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || |
| 8585 | atomcount > 0 || cb->had_pruneorskip || inassert || |
| 8586 | (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) |
| 8587 | return FALSE; |
| 8588 | } |
| 8589 | |
| 8590 | /* Check for explicit circumflex; anything else gives a FALSE result. Note |
| 8591 | in particular that this includes atomic brackets OP_ONCE because the number |
| 8592 | of characters matched by .* cannot be adjusted inside them. */ |
| 8593 | |
| 8594 | else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; |
| 8595 | |
| 8596 | /* Move on to the next alternative */ |
| 8597 | |
| 8598 | code += GET(code, 1); |
| 8599 | } |
| 8600 | while (*code == OP_ALT); /* Loop for each alternative */ |
| 8601 | return TRUE; |
| 8602 | } |
| 8603 | |
| 8604 | |
| 8605 | |
| 8606 | /************************************************* |
| 8607 | * Scan compiled regex for recursion reference * |
| 8608 | *************************************************/ |
| 8609 | |
| 8610 | /* This function scans through a compiled pattern until it finds an instance of |
| 8611 | OP_RECURSE. |
| 8612 | |
| 8613 | Arguments: |
| 8614 | code points to start of expression |
| 8615 | utf TRUE in UTF mode |
| 8616 | |
| 8617 | Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
| 8618 | */ |
| 8619 | |
| 8620 | static PCRE2_SPTR |
| 8621 | find_recurse(PCRE2_SPTR code, BOOL utf) |
| 8622 | { |
| 8623 | for (;;) |
| 8624 | { |
| 8625 | PCRE2_UCHAR c = *code; |
| 8626 | if (c == OP_END) return NULL; |
| 8627 | if (c == OP_RECURSE) return code; |
| 8628 | |
| 8629 | /* XCLASS is used for classes that cannot be represented just by a bit map. |
| 8630 | This includes negated single high-valued characters. CALLOUT_STR is used for |
| 8631 | callouts with string arguments. In both cases the length in the table is |
| 8632 | zero; the actual length is stored in the compiled code. */ |
| 8633 | |
| 8634 | if (c == OP_XCLASS) code += GET(code, 1); |
| 8635 | else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); |
| 8636 | |
| 8637 | /* Otherwise, we can get the item's length from the table, except that for |
| 8638 | repeated character types, we have to test for \p and \P, which have an extra |
| 8639 | two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, |
| 8640 | we must add in its length. */ |
| 8641 | |
| 8642 | else |
| 8643 | { |
| 8644 | switch(c) |
| 8645 | { |
| 8646 | case OP_TYPESTAR: |
| 8647 | case OP_TYPEMINSTAR: |
| 8648 | case OP_TYPEPLUS: |
| 8649 | case OP_TYPEMINPLUS: |
| 8650 | case OP_TYPEQUERY: |
| 8651 | case OP_TYPEMINQUERY: |
| 8652 | case OP_TYPEPOSSTAR: |
| 8653 | case OP_TYPEPOSPLUS: |
| 8654 | case OP_TYPEPOSQUERY: |
| 8655 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| 8656 | break; |
| 8657 | |
| 8658 | case OP_TYPEPOSUPTO: |
| 8659 | case OP_TYPEUPTO: |
| 8660 | case OP_TYPEMINUPTO: |
| 8661 | case OP_TYPEEXACT: |
| 8662 | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
| 8663 | code += 2; |
| 8664 | break; |
| 8665 | |
| 8666 | case OP_MARK: |
| 8667 | case OP_COMMIT_ARG: |
| 8668 | case OP_PRUNE_ARG: |
| 8669 | case OP_SKIP_ARG: |
| 8670 | case OP_THEN_ARG: |
| 8671 | code += code[1]; |
| 8672 | break; |
| 8673 | } |
| 8674 | |
| 8675 | /* Add in the fixed length from the table */ |
| 8676 | |
| 8677 | code += PRIV(OP_lengths)[c]; |
| 8678 | |
| 8679 | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may |
| 8680 | be followed by a multi-unit character. The length in the table is a |
| 8681 | minimum, so we have to arrange to skip the extra units. */ |
| 8682 | |
| 8683 | #ifdef MAYBE_UTF_MULTI |
| 8684 | if (utf) switch(c) |
| 8685 | { |
| 8686 | case OP_CHAR: |
| 8687 | case OP_CHARI: |
| 8688 | case OP_NOT: |
| 8689 | case OP_NOTI: |
| 8690 | case OP_EXACT: |
| 8691 | case OP_EXACTI: |
| 8692 | case OP_NOTEXACT: |
| 8693 | case OP_NOTEXACTI: |
| 8694 | case OP_UPTO: |
| 8695 | case OP_UPTOI: |
| 8696 | case OP_NOTUPTO: |
| 8697 | case OP_NOTUPTOI: |
| 8698 | case OP_MINUPTO: |
| 8699 | case OP_MINUPTOI: |
| 8700 | case OP_NOTMINUPTO: |
| 8701 | case OP_NOTMINUPTOI: |
| 8702 | case OP_POSUPTO: |
| 8703 | case OP_POSUPTOI: |
| 8704 | case OP_NOTPOSUPTO: |
| 8705 | case OP_NOTPOSUPTOI: |
| 8706 | case OP_STAR: |
| 8707 | case OP_STARI: |
| 8708 | case OP_NOTSTAR: |
| 8709 | case OP_NOTSTARI: |
| 8710 | case OP_MINSTAR: |
| 8711 | case OP_MINSTARI: |
| 8712 | case OP_NOTMINSTAR: |
| 8713 | case OP_NOTMINSTARI: |
| 8714 | case OP_POSSTAR: |
| 8715 | case OP_POSSTARI: |
| 8716 | case OP_NOTPOSSTAR: |
| 8717 | case OP_NOTPOSSTARI: |
| 8718 | case OP_PLUS: |
| 8719 | case OP_PLUSI: |
| 8720 | case OP_NOTPLUS: |
| 8721 | case OP_NOTPLUSI: |
| 8722 | case OP_MINPLUS: |
| 8723 | case OP_MINPLUSI: |
| 8724 | case OP_NOTMINPLUS: |
| 8725 | case OP_NOTMINPLUSI: |
| 8726 | case OP_POSPLUS: |
| 8727 | case OP_POSPLUSI: |
| 8728 | case OP_NOTPOSPLUS: |
| 8729 | case OP_NOTPOSPLUSI: |
| 8730 | case OP_QUERY: |
| 8731 | case OP_QUERYI: |
| 8732 | case OP_NOTQUERY: |
| 8733 | case OP_NOTQUERYI: |
| 8734 | case OP_MINQUERY: |
| 8735 | case OP_MINQUERYI: |
| 8736 | case OP_NOTMINQUERY: |
| 8737 | case OP_NOTMINQUERYI: |
| 8738 | case OP_POSQUERY: |
| 8739 | case OP_POSQUERYI: |
| 8740 | case OP_NOTPOSQUERY: |
| 8741 | case OP_NOTPOSQUERYI: |
| 8742 | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
| 8743 | break; |
| 8744 | } |
| 8745 | #else |
| 8746 | (void)(utf); /* Keep compiler happy by referencing function argument */ |
| 8747 | #endif /* MAYBE_UTF_MULTI */ |
| 8748 | } |
| 8749 | } |
| 8750 | } |
| 8751 | |
| 8752 | |
| 8753 | |
| 8754 | /************************************************* |
| 8755 | * Check for asserted fixed first code unit * |
| 8756 | *************************************************/ |
| 8757 | |
| 8758 | /* During compilation, the "first code unit" settings from forward assertions |
| 8759 | are discarded, because they can cause conflicts with actual literals that |
| 8760 | follow. However, if we end up without a first code unit setting for an |
| 8761 | unanchored pattern, it is worth scanning the regex to see if there is an |
| 8762 | initial asserted first code unit. If all branches start with the same asserted |
| 8763 | code unit, or with a non-conditional bracket all of whose alternatives start |
| 8764 | with the same asserted code unit (recurse ad lib), then we return that code |
| 8765 | unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with |
| 8766 | REQ_NONE in the flags. |
| 8767 | |
| 8768 | Arguments: |
| 8769 | code points to start of compiled pattern |
| 8770 | flags points to the first code unit flags |
| 8771 | inassert non-zero if in an assertion |
| 8772 | |
| 8773 | Returns: the fixed first code unit, or 0 with REQ_NONE in flags |
| 8774 | */ |
| 8775 | |
| 8776 | static uint32_t |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8777 | find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8778 | { |
| 8779 | uint32_t c = 0; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8780 | uint32_t cflags = REQ_NONE; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8781 | |
| 8782 | *flags = REQ_NONE; |
| 8783 | do { |
| 8784 | uint32_t d; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8785 | uint32_t dflags; |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8786 | int xl = (*code == OP_CBRA || *code == OP_SCBRA || |
| 8787 | *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0; |
| 8788 | PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE); |
| 8789 | PCRE2_UCHAR op = *scode; |
| 8790 | |
| 8791 | switch(op) |
| 8792 | { |
| 8793 | default: |
| 8794 | return 0; |
| 8795 | |
| 8796 | case OP_BRA: |
| 8797 | case OP_BRAPOS: |
| 8798 | case OP_CBRA: |
| 8799 | case OP_SCBRA: |
| 8800 | case OP_CBRAPOS: |
| 8801 | case OP_SCBRAPOS: |
| 8802 | case OP_ASSERT: |
| 8803 | case OP_ASSERT_NA: |
| 8804 | case OP_ONCE: |
| 8805 | case OP_SCRIPT_RUN: |
| 8806 | d = find_firstassertedcu(scode, &dflags, inassert + |
| 8807 | ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0)); |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8808 | if (dflags >= REQ_NONE) return 0; |
| 8809 | if (cflags >= REQ_NONE) { c = d; cflags = dflags; } |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8810 | else if (c != d || cflags != dflags) return 0; |
| 8811 | break; |
| 8812 | |
| 8813 | case OP_EXACT: |
| 8814 | scode += IMM2_SIZE; |
| 8815 | /* Fall through */ |
| 8816 | |
| 8817 | case OP_CHAR: |
| 8818 | case OP_PLUS: |
| 8819 | case OP_MINPLUS: |
| 8820 | case OP_POSPLUS: |
| 8821 | if (inassert == 0) return 0; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8822 | if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; } |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8823 | else if (c != scode[1]) return 0; |
| 8824 | break; |
| 8825 | |
| 8826 | case OP_EXACTI: |
| 8827 | scode += IMM2_SIZE; |
| 8828 | /* Fall through */ |
| 8829 | |
| 8830 | case OP_CHARI: |
| 8831 | case OP_PLUSI: |
| 8832 | case OP_MINPLUSI: |
| 8833 | case OP_POSPLUSI: |
| 8834 | if (inassert == 0) return 0; |
| 8835 | |
| 8836 | /* If the character is more than one code unit long, we cannot set its |
| 8837 | first code unit when matching caselessly. Later scanning may pick up |
| 8838 | multiple code units. */ |
| 8839 | |
| 8840 | #ifdef SUPPORT_UNICODE |
| 8841 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 8842 | if (scode[1] >= 0x80) return 0; |
| 8843 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| 8844 | if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0; |
| 8845 | #endif |
| 8846 | #endif |
| 8847 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 8848 | if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; } |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 8849 | else if (c != scode[1]) return 0; |
| 8850 | break; |
| 8851 | } |
| 8852 | |
| 8853 | code += GET(code, 1); |
| 8854 | } |
| 8855 | while (*code == OP_ALT); |
| 8856 | |
| 8857 | *flags = cflags; |
| 8858 | return c; |
| 8859 | } |
| 8860 | |
| 8861 | |
| 8862 | |
| 8863 | /************************************************* |
| 8864 | * Add an entry to the name/number table * |
| 8865 | *************************************************/ |
| 8866 | |
| 8867 | /* This function is called between compiling passes to add an entry to the |
| 8868 | name/number table, maintaining alphabetical order. Checking for permitted |
| 8869 | and forbidden duplicates has already been done. |
| 8870 | |
| 8871 | Arguments: |
| 8872 | cb the compile data block |
| 8873 | name the name to add |
| 8874 | length the length of the name |
| 8875 | groupno the group number |
| 8876 | tablecount the count of names in the table so far |
| 8877 | |
| 8878 | Returns: nothing |
| 8879 | */ |
| 8880 | |
| 8881 | static void |
| 8882 | add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length, |
| 8883 | unsigned int groupno, uint32_t tablecount) |
| 8884 | { |
| 8885 | uint32_t i; |
| 8886 | PCRE2_UCHAR *slot = cb->name_table; |
| 8887 | |
| 8888 | for (i = 0; i < tablecount; i++) |
| 8889 | { |
| 8890 | int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length)); |
| 8891 | if (crc == 0 && slot[IMM2_SIZE+length] != 0) |
| 8892 | crc = -1; /* Current name is a substring */ |
| 8893 | |
| 8894 | /* Make space in the table and break the loop for an earlier name. For a |
| 8895 | duplicate or later name, carry on. We do this for duplicates so that in the |
| 8896 | simple case (when ?(| is not used) they are in order of their numbers. In all |
| 8897 | cases they are in the order in which they appear in the pattern. */ |
| 8898 | |
| 8899 | if (crc < 0) |
| 8900 | { |
| 8901 | (void)memmove(slot + cb->name_entry_size, slot, |
| 8902 | CU2BYTES((tablecount - i) * cb->name_entry_size)); |
| 8903 | break; |
| 8904 | } |
| 8905 | |
| 8906 | /* Continue the loop for a later or duplicate name */ |
| 8907 | |
| 8908 | slot += cb->name_entry_size; |
| 8909 | } |
| 8910 | |
| 8911 | PUT2(slot, 0, groupno); |
| 8912 | memcpy(slot + IMM2_SIZE, name, CU2BYTES(length)); |
| 8913 | |
| 8914 | /* Add a terminating zero and fill the rest of the slot with zeroes so that |
| 8915 | the memory is all initialized. Otherwise valgrind moans about uninitialized |
| 8916 | memory when saving serialized compiled patterns. */ |
| 8917 | |
| 8918 | memset(slot + IMM2_SIZE + length, 0, |
| 8919 | CU2BYTES(cb->name_entry_size - length - IMM2_SIZE)); |
| 8920 | } |
| 8921 | |
| 8922 | |
| 8923 | |
| 8924 | /************************************************* |
| 8925 | * Skip in parsed pattern * |
| 8926 | *************************************************/ |
| 8927 | |
| 8928 | /* This function is called to skip parts of the parsed pattern when finding the |
| 8929 | length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find |
| 8930 | the end of the branch, it is called to skip over an internal lookaround or |
| 8931 | (DEFINE) group, and it is also called to skip to the end of a class, during |
| 8932 | which it will never encounter nested groups (but there's no need to have |
| 8933 | special code for that). |
| 8934 | |
| 8935 | When called to find the end of a branch or group, pptr must point to the first |
| 8936 | meta code inside the branch, not the branch-starting code. In other cases it |
| 8937 | can point to the item that causes the function to be called. |
| 8938 | |
| 8939 | Arguments: |
| 8940 | pptr current pointer to skip from |
| 8941 | skiptype PSKIP_CLASS when skipping to end of class |
| 8942 | PSKIP_ALT when META_ALT ends the skip |
| 8943 | PSKIP_KET when only META_KET ends the skip |
| 8944 | |
| 8945 | Returns: new value of pptr |
| 8946 | NULL if META_END is reached - should never occur |
| 8947 | or for an unknown meta value - likewise |
| 8948 | */ |
| 8949 | |
| 8950 | static uint32_t * |
| 8951 | parsed_skip(uint32_t *pptr, uint32_t skiptype) |
| 8952 | { |
| 8953 | uint32_t nestlevel = 0; |
| 8954 | |
| 8955 | for (;; pptr++) |
| 8956 | { |
| 8957 | uint32_t meta = META_CODE(*pptr); |
| 8958 | |
| 8959 | switch(meta) |
| 8960 | { |
| 8961 | default: /* Just skip over most items */ |
| 8962 | if (meta < META_END) continue; /* Literal */ |
| 8963 | break; |
| 8964 | |
| 8965 | /* This should never occur. */ |
| 8966 | |
| 8967 | case META_END: |
| 8968 | return NULL; |
| 8969 | |
| 8970 | /* The data for these items is variable in length. */ |
| 8971 | |
| 8972 | case META_BACKREF: /* Offset is present only if group >= 10 */ |
| 8973 | if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET; |
| 8974 | break; |
| 8975 | |
| 8976 | case META_ESCAPE: /* A few escapes are followed by data items. */ |
| 8977 | switch (META_DATA(*pptr)) |
| 8978 | { |
| 8979 | case ESC_P: |
| 8980 | case ESC_p: |
| 8981 | pptr += 1; |
| 8982 | break; |
| 8983 | |
| 8984 | case ESC_g: |
| 8985 | case ESC_k: |
| 8986 | pptr += 1 + SIZEOFFSET; |
| 8987 | break; |
| 8988 | } |
| 8989 | break; |
| 8990 | |
| 8991 | case META_MARK: /* Add the length of the name. */ |
| 8992 | case META_COMMIT_ARG: |
| 8993 | case META_PRUNE_ARG: |
| 8994 | case META_SKIP_ARG: |
| 8995 | case META_THEN_ARG: |
| 8996 | pptr += pptr[1]; |
| 8997 | break; |
| 8998 | |
| 8999 | /* These are the "active" items in this loop. */ |
| 9000 | |
| 9001 | case META_CLASS_END: |
| 9002 | if (skiptype == PSKIP_CLASS) return pptr; |
| 9003 | break; |
| 9004 | |
| 9005 | case META_ATOMIC: |
| 9006 | case META_CAPTURE: |
| 9007 | case META_COND_ASSERT: |
| 9008 | case META_COND_DEFINE: |
| 9009 | case META_COND_NAME: |
| 9010 | case META_COND_NUMBER: |
| 9011 | case META_COND_RNAME: |
| 9012 | case META_COND_RNUMBER: |
| 9013 | case META_COND_VERSION: |
| 9014 | case META_LOOKAHEAD: |
| 9015 | case META_LOOKAHEADNOT: |
| 9016 | case META_LOOKAHEAD_NA: |
| 9017 | case META_LOOKBEHIND: |
| 9018 | case META_LOOKBEHINDNOT: |
| 9019 | case META_LOOKBEHIND_NA: |
| 9020 | case META_NOCAPTURE: |
| 9021 | case META_SCRIPT_RUN: |
| 9022 | nestlevel++; |
| 9023 | break; |
| 9024 | |
| 9025 | case META_ALT: |
| 9026 | if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr; |
| 9027 | break; |
| 9028 | |
| 9029 | case META_KET: |
| 9030 | if (nestlevel == 0) return pptr; |
| 9031 | nestlevel--; |
| 9032 | break; |
| 9033 | } |
| 9034 | |
| 9035 | /* The extra data item length for each meta is in a table. */ |
| 9036 | |
| 9037 | meta = (meta >> 16) & 0x7fff; |
| 9038 | if (meta >= sizeof(meta_extra_lengths)) return NULL; |
| 9039 | pptr += meta_extra_lengths[meta]; |
| 9040 | } |
| 9041 | /* Control never reaches here */ |
| 9042 | return pptr; |
| 9043 | } |
| 9044 | |
| 9045 | |
| 9046 | |
| 9047 | /************************************************* |
| 9048 | * Find length of a parsed group * |
| 9049 | *************************************************/ |
| 9050 | |
| 9051 | /* This is called for nested groups within a branch of a lookbehind whose |
| 9052 | length is being computed. If all the branches in the nested group have the same |
| 9053 | length, that is OK. On entry, the pointer must be at the first element after |
| 9054 | the group initializing code. On exit it points to OP_KET. Caching is used to |
| 9055 | improve processing speed when the same capturing group occurs many times. |
| 9056 | |
| 9057 | Arguments: |
| 9058 | pptrptr pointer to pointer in the parsed pattern |
| 9059 | isinline FALSE if a reference or recursion; TRUE for inline group |
| 9060 | errcodeptr pointer to the errorcode |
| 9061 | lcptr pointer to the loop counter |
| 9062 | group number of captured group or -1 for a non-capturing group |
| 9063 | recurses chain of recurse_check to catch mutual recursion |
| 9064 | cb pointer to the compile data |
| 9065 | |
| 9066 | Returns: the group length or a negative number |
| 9067 | */ |
| 9068 | |
| 9069 | static int |
| 9070 | get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr, |
| 9071 | int group, parsed_recurse_check *recurses, compile_block *cb) |
| 9072 | { |
| 9073 | int branchlength; |
| 9074 | int grouplength = -1; |
| 9075 | |
| 9076 | /* The cache can be used only if there is no possibility of there being two |
| 9077 | groups with the same number. We do not need to set the end pointer for a group |
| 9078 | that is being processed as a back reference or recursion, but we must do so for |
| 9079 | an inline group. */ |
| 9080 | |
| 9081 | if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0) |
| 9082 | { |
| 9083 | uint32_t groupinfo = cb->groupinfo[group]; |
| 9084 | if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1; |
| 9085 | if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) |
| 9086 | { |
| 9087 | if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET); |
| 9088 | return groupinfo & GI_FIXED_LENGTH_MASK; |
| 9089 | } |
| 9090 | } |
| 9091 | |
| 9092 | /* Scan the group. In this case we find the end pointer of necessity. */ |
| 9093 | |
| 9094 | for(;;) |
| 9095 | { |
| 9096 | branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); |
| 9097 | if (branchlength < 0) goto ISNOTFIXED; |
| 9098 | if (grouplength == -1) grouplength = branchlength; |
| 9099 | else if (grouplength != branchlength) goto ISNOTFIXED; |
| 9100 | if (**pptrptr == META_KET) break; |
| 9101 | *pptrptr += 1; /* Skip META_ALT */ |
| 9102 | } |
| 9103 | |
| 9104 | if (group > 0) |
| 9105 | cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength); |
| 9106 | return grouplength; |
| 9107 | |
| 9108 | ISNOTFIXED: |
| 9109 | if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH; |
| 9110 | return -1; |
| 9111 | } |
| 9112 | |
| 9113 | |
| 9114 | |
| 9115 | /************************************************* |
| 9116 | * Find length of a parsed branch * |
| 9117 | *************************************************/ |
| 9118 | |
| 9119 | /* Return a fixed length for a branch in a lookbehind, giving an error if the |
| 9120 | length is not fixed. On entry, *pptrptr points to the first element inside the |
| 9121 | branch. On exit it is set to point to the ALT or KET. |
| 9122 | |
| 9123 | Arguments: |
| 9124 | pptrptr pointer to pointer in the parsed pattern |
| 9125 | errcodeptr pointer to error code |
| 9126 | lcptr pointer to loop counter |
| 9127 | recurses chain of recurse_check to catch mutual recursion |
| 9128 | cb pointer to compile block |
| 9129 | |
| 9130 | Returns: the length, or a negative value on error |
| 9131 | */ |
| 9132 | |
| 9133 | static int |
| 9134 | get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr, |
| 9135 | parsed_recurse_check *recurses, compile_block *cb) |
| 9136 | { |
| 9137 | int branchlength = 0; |
| 9138 | int grouplength; |
| 9139 | uint32_t lastitemlength = 0; |
| 9140 | uint32_t *pptr = *pptrptr; |
| 9141 | PCRE2_SIZE offset; |
| 9142 | parsed_recurse_check this_recurse; |
| 9143 | |
| 9144 | /* A large and/or complex regex can take too long to process. This can happen |
| 9145 | more often when (?| groups are present in the pattern because their length |
| 9146 | cannot be cached. */ |
| 9147 | |
| 9148 | if ((*lcptr)++ > 2000) |
| 9149 | { |
| 9150 | *errcodeptr = ERR35; /* Lookbehind is too complicated */ |
| 9151 | return -1; |
| 9152 | } |
| 9153 | |
| 9154 | /* Scan the branch, accumulating the length. */ |
| 9155 | |
| 9156 | for (;; pptr++) |
| 9157 | { |
| 9158 | parsed_recurse_check *r; |
| 9159 | uint32_t *gptr, *gptrend; |
| 9160 | uint32_t escape; |
| 9161 | uint32_t group = 0; |
| 9162 | uint32_t itemlength = 0; |
| 9163 | |
| 9164 | if (*pptr < META_END) |
| 9165 | { |
| 9166 | itemlength = 1; |
| 9167 | } |
| 9168 | |
| 9169 | else switch (META_CODE(*pptr)) |
| 9170 | { |
| 9171 | case META_KET: |
| 9172 | case META_ALT: |
| 9173 | goto EXIT; |
| 9174 | |
| 9175 | /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the |
| 9176 | actual termination. */ |
| 9177 | |
| 9178 | case META_ACCEPT: |
| 9179 | case META_FAIL: |
| 9180 | pptr = parsed_skip(pptr, PSKIP_ALT); |
| 9181 | if (pptr == NULL) goto PARSED_SKIP_FAILED; |
| 9182 | goto EXIT; |
| 9183 | |
| 9184 | case META_MARK: |
| 9185 | case META_COMMIT_ARG: |
| 9186 | case META_PRUNE_ARG: |
| 9187 | case META_SKIP_ARG: |
| 9188 | case META_THEN_ARG: |
| 9189 | pptr += pptr[1] + 1; |
| 9190 | break; |
| 9191 | |
| 9192 | case META_CIRCUMFLEX: |
| 9193 | case META_COMMIT: |
| 9194 | case META_DOLLAR: |
| 9195 | case META_PRUNE: |
| 9196 | case META_SKIP: |
| 9197 | case META_THEN: |
| 9198 | break; |
| 9199 | |
| 9200 | case META_OPTIONS: |
| 9201 | pptr += 1; |
| 9202 | break; |
| 9203 | |
| 9204 | case META_BIGVALUE: |
| 9205 | itemlength = 1; |
| 9206 | pptr += 1; |
| 9207 | break; |
| 9208 | |
| 9209 | case META_CLASS: |
| 9210 | case META_CLASS_NOT: |
| 9211 | itemlength = 1; |
| 9212 | pptr = parsed_skip(pptr, PSKIP_CLASS); |
| 9213 | if (pptr == NULL) goto PARSED_SKIP_FAILED; |
| 9214 | break; |
| 9215 | |
| 9216 | case META_CLASS_EMPTY_NOT: |
| 9217 | case META_DOT: |
| 9218 | itemlength = 1; |
| 9219 | break; |
| 9220 | |
| 9221 | case META_CALLOUT_NUMBER: |
| 9222 | pptr += 3; |
| 9223 | break; |
| 9224 | |
| 9225 | case META_CALLOUT_STRING: |
| 9226 | pptr += 3 + SIZEOFFSET; |
| 9227 | break; |
| 9228 | |
| 9229 | /* Only some escapes consume a character. Of those, \R and \X are never |
| 9230 | allowed because they might match more than character. \C is allowed only in |
| 9231 | 32-bit and non-UTF 8/16-bit modes. */ |
| 9232 | |
| 9233 | case META_ESCAPE: |
| 9234 | escape = META_DATA(*pptr); |
| 9235 | if (escape == ESC_R || escape == ESC_X) return -1; |
| 9236 | if (escape > ESC_b && escape < ESC_Z) |
| 9237 | { |
| 9238 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
| 9239 | if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C) |
| 9240 | { |
| 9241 | *errcodeptr = ERR36; |
| 9242 | return -1; |
| 9243 | } |
| 9244 | #endif |
| 9245 | itemlength = 1; |
| 9246 | if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */ |
| 9247 | } |
| 9248 | break; |
| 9249 | |
| 9250 | /* Lookaheads do not contribute to the length of this branch, but they may |
| 9251 | contain lookbehinds within them whose lengths need to be set. */ |
| 9252 | |
| 9253 | case META_LOOKAHEAD: |
| 9254 | case META_LOOKAHEADNOT: |
| 9255 | case META_LOOKAHEAD_NA: |
| 9256 | *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr); |
| 9257 | if (*errcodeptr != 0) return -1; |
| 9258 | |
| 9259 | /* Ignore any qualifiers that follow a lookahead assertion. */ |
| 9260 | |
| 9261 | switch (pptr[1]) |
| 9262 | { |
| 9263 | case META_ASTERISK: |
| 9264 | case META_ASTERISK_PLUS: |
| 9265 | case META_ASTERISK_QUERY: |
| 9266 | case META_PLUS: |
| 9267 | case META_PLUS_PLUS: |
| 9268 | case META_PLUS_QUERY: |
| 9269 | case META_QUERY: |
| 9270 | case META_QUERY_PLUS: |
| 9271 | case META_QUERY_QUERY: |
| 9272 | pptr++; |
| 9273 | break; |
| 9274 | |
| 9275 | case META_MINMAX: |
| 9276 | case META_MINMAX_PLUS: |
| 9277 | case META_MINMAX_QUERY: |
| 9278 | pptr += 3; |
| 9279 | break; |
| 9280 | |
| 9281 | default: |
| 9282 | break; |
| 9283 | } |
| 9284 | break; |
| 9285 | |
| 9286 | /* A nested lookbehind does not contribute any length to this lookbehind, |
| 9287 | but must itself be checked and have its lengths set. */ |
| 9288 | |
| 9289 | case META_LOOKBEHIND: |
| 9290 | case META_LOOKBEHINDNOT: |
| 9291 | case META_LOOKBEHIND_NA: |
| 9292 | if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb)) |
| 9293 | return -1; |
| 9294 | break; |
| 9295 | |
| 9296 | /* Back references and recursions are handled by very similar code. At this |
| 9297 | stage, the names generated in the parsing pass are available, but the main |
| 9298 | name table has not yet been created. So for the named varieties, scan the |
| 9299 | list of names in order to get the number of the first one in the pattern, |
| 9300 | and whether or not this name is duplicated. */ |
| 9301 | |
| 9302 | case META_BACKREF_BYNAME: |
| 9303 | if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0) |
| 9304 | goto ISNOTFIXED; |
| 9305 | /* Fall through */ |
| 9306 | |
| 9307 | case META_RECURSE_BYNAME: |
| 9308 | { |
| 9309 | int i; |
| 9310 | PCRE2_SPTR name; |
| 9311 | BOOL is_dupname = FALSE; |
| 9312 | named_group *ng = cb->named_groups; |
| 9313 | uint32_t meta_code = META_CODE(*pptr); |
| 9314 | uint32_t length = *(++pptr); |
| 9315 | |
| 9316 | GETPLUSOFFSET(offset, pptr); |
| 9317 | name = cb->start_pattern + offset; |
| 9318 | for (i = 0; i < cb->names_found; i++, ng++) |
| 9319 | { |
| 9320 | if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0) |
| 9321 | { |
| 9322 | group = ng->number; |
| 9323 | is_dupname = ng->isdup; |
| 9324 | break; |
| 9325 | } |
| 9326 | } |
| 9327 | |
| 9328 | if (group == 0) |
| 9329 | { |
| 9330 | *errcodeptr = ERR15; /* Non-existent subpattern */ |
| 9331 | cb->erroroffset = offset; |
| 9332 | return -1; |
| 9333 | } |
| 9334 | |
| 9335 | /* A numerical back reference can be fixed length if duplicate capturing |
| 9336 | groups are not being used. A non-duplicate named back reference can also |
| 9337 | be handled. */ |
| 9338 | |
| 9339 | if (meta_code == META_RECURSE_BYNAME || |
| 9340 | (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)) |
| 9341 | goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */ |
| 9342 | } |
| 9343 | goto ISNOTFIXED; /* Duplicate name or number */ |
| 9344 | |
| 9345 | /* The offset values for back references < 10 are in a separate vector |
| 9346 | because otherwise they would use more than two parsed pattern elements on |
| 9347 | 64-bit systems. */ |
| 9348 | |
| 9349 | case META_BACKREF: |
| 9350 | if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 || |
| 9351 | (cb->external_flags & PCRE2_DUPCAPUSED) != 0) |
| 9352 | goto ISNOTFIXED; |
| 9353 | group = META_DATA(*pptr); |
| 9354 | if (group < 10) |
| 9355 | { |
| 9356 | offset = cb->small_ref_offset[group]; |
| 9357 | goto RECURSE_OR_BACKREF_LENGTH; |
| 9358 | } |
| 9359 | |
| 9360 | /* Fall through */ |
| 9361 | /* For groups >= 10 - picking up group twice does no harm. */ |
| 9362 | |
| 9363 | /* A true recursion implies not fixed length, but a subroutine call may |
| 9364 | be OK. Back reference "recursions" are also failed. */ |
| 9365 | |
| 9366 | case META_RECURSE: |
| 9367 | group = META_DATA(*pptr); |
| 9368 | GETPLUSOFFSET(offset, pptr); |
| 9369 | |
| 9370 | RECURSE_OR_BACKREF_LENGTH: |
| 9371 | if (group > cb->bracount) |
| 9372 | { |
| 9373 | cb->erroroffset = offset; |
| 9374 | *errcodeptr = ERR15; /* Non-existent subpattern */ |
| 9375 | return -1; |
| 9376 | } |
| 9377 | if (group == 0) goto ISNOTFIXED; /* Local recursion */ |
| 9378 | for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++) |
| 9379 | { |
| 9380 | if (META_CODE(*gptr) == META_BIGVALUE) gptr++; |
| 9381 | else if (*gptr == (META_CAPTURE | group)) break; |
| 9382 | } |
| 9383 | |
| 9384 | /* We must start the search for the end of the group at the first meta code |
| 9385 | inside the group. Otherwise it will be treated as an enclosed group. */ |
| 9386 | |
| 9387 | gptrend = parsed_skip(gptr + 1, PSKIP_KET); |
| 9388 | if (gptrend == NULL) goto PARSED_SKIP_FAILED; |
| 9389 | if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */ |
| 9390 | for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break; |
| 9391 | if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */ |
| 9392 | this_recurse.prev = recurses; |
| 9393 | this_recurse.groupptr = gptr; |
| 9394 | |
| 9395 | /* We do not need to know the position of the end of the group, that is, |
| 9396 | gptr is not used after the call to get_grouplength(). Setting the second |
| 9397 | argument FALSE stops it scanning for the end when the length can be found |
| 9398 | in the cache. */ |
| 9399 | |
| 9400 | gptr++; |
| 9401 | grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group, |
| 9402 | &this_recurse, cb); |
| 9403 | if (grouplength < 0) |
| 9404 | { |
| 9405 | if (*errcodeptr == 0) goto ISNOTFIXED; |
| 9406 | return -1; /* Error already set */ |
| 9407 | } |
| 9408 | itemlength = grouplength; |
| 9409 | break; |
| 9410 | |
| 9411 | /* A (DEFINE) group is never obeyed inline and so it does not contribute to |
| 9412 | the length of this branch. Skip from the following item to the next |
| 9413 | unpaired ket. */ |
| 9414 | |
| 9415 | case META_COND_DEFINE: |
| 9416 | pptr = parsed_skip(pptr + 1, PSKIP_KET); |
| 9417 | break; |
| 9418 | |
| 9419 | /* Check other nested groups - advance past the initial data for each type |
| 9420 | and then seek a fixed length with get_grouplength(). */ |
| 9421 | |
| 9422 | case META_COND_NAME: |
| 9423 | case META_COND_NUMBER: |
| 9424 | case META_COND_RNAME: |
| 9425 | case META_COND_RNUMBER: |
| 9426 | pptr += 2 + SIZEOFFSET; |
| 9427 | goto CHECK_GROUP; |
| 9428 | |
| 9429 | case META_COND_ASSERT: |
| 9430 | pptr += 1; |
| 9431 | goto CHECK_GROUP; |
| 9432 | |
| 9433 | case META_COND_VERSION: |
| 9434 | pptr += 4; |
| 9435 | goto CHECK_GROUP; |
| 9436 | |
| 9437 | case META_CAPTURE: |
| 9438 | group = META_DATA(*pptr); |
| 9439 | /* Fall through */ |
| 9440 | |
| 9441 | case META_ATOMIC: |
| 9442 | case META_NOCAPTURE: |
| 9443 | case META_SCRIPT_RUN: |
| 9444 | pptr++; |
| 9445 | CHECK_GROUP: |
| 9446 | grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group, |
| 9447 | recurses, cb); |
| 9448 | if (grouplength < 0) return -1; |
| 9449 | itemlength = grouplength; |
| 9450 | break; |
| 9451 | |
| 9452 | /* Exact repetition is OK; variable repetition is not. A repetition of zero |
| 9453 | must subtract the length that has already been added. */ |
| 9454 | |
| 9455 | case META_MINMAX: |
| 9456 | case META_MINMAX_PLUS: |
| 9457 | case META_MINMAX_QUERY: |
| 9458 | if (pptr[1] == pptr[2]) |
| 9459 | { |
| 9460 | switch(pptr[1]) |
| 9461 | { |
| 9462 | case 0: |
| 9463 | branchlength -= lastitemlength; |
| 9464 | break; |
| 9465 | |
| 9466 | case 1: |
| 9467 | itemlength = 0; |
| 9468 | break; |
| 9469 | |
| 9470 | default: /* Check for integer overflow */ |
| 9471 | if (lastitemlength != 0 && /* Should not occur, but just in case */ |
| 9472 | INT_MAX/lastitemlength < pptr[1] - 1) |
| 9473 | { |
| 9474 | *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */ |
| 9475 | return -1; |
| 9476 | } |
| 9477 | itemlength = (pptr[1] - 1) * lastitemlength; |
| 9478 | break; |
| 9479 | } |
| 9480 | pptr += 2; |
| 9481 | break; |
| 9482 | } |
| 9483 | /* Fall through */ |
| 9484 | |
| 9485 | /* Any other item means this branch does not have a fixed length. */ |
| 9486 | |
| 9487 | default: |
| 9488 | ISNOTFIXED: |
| 9489 | *errcodeptr = ERR25; /* Not fixed length */ |
| 9490 | return -1; |
| 9491 | } |
| 9492 | |
| 9493 | /* Add the item length to the branchlength, checking for integer overflow and |
| 9494 | for the branch length exceeding the limit. */ |
| 9495 | |
| 9496 | if (INT_MAX - branchlength < (int)itemlength || |
| 9497 | (branchlength += itemlength) > LOOKBEHIND_MAX) |
| 9498 | { |
| 9499 | *errcodeptr = ERR87; |
| 9500 | return -1; |
| 9501 | } |
| 9502 | |
| 9503 | /* Save this item length for use if the next item is a quantifier. */ |
| 9504 | |
| 9505 | lastitemlength = itemlength; |
| 9506 | } |
| 9507 | |
| 9508 | EXIT: |
| 9509 | *pptrptr = pptr; |
| 9510 | return branchlength; |
| 9511 | |
| 9512 | PARSED_SKIP_FAILED: |
| 9513 | *errcodeptr = ERR90; |
| 9514 | return -1; |
| 9515 | } |
| 9516 | |
| 9517 | |
| 9518 | |
| 9519 | /************************************************* |
| 9520 | * Set lengths in a lookbehind * |
| 9521 | *************************************************/ |
| 9522 | |
| 9523 | /* This function is called for each lookbehind, to set the lengths in its |
| 9524 | branches. An error occurs if any branch does not have a fixed length that is |
| 9525 | less than the maximum (65535). On exit, the pointer must be left on the final |
| 9526 | ket. |
| 9527 | |
| 9528 | The function also maintains the max_lookbehind value. Any lookbehind branch |
| 9529 | that contains a nested lookbehind may actually look further back than the |
| 9530 | length of the branch. The additional amount is passed back from |
| 9531 | get_branchlength() as an "extra" value. |
| 9532 | |
| 9533 | Arguments: |
| 9534 | pptrptr pointer to pointer in the parsed pattern |
| 9535 | errcodeptr pointer to error code |
| 9536 | lcptr pointer to loop counter |
| 9537 | recurses chain of recurse_check to catch mutual recursion |
| 9538 | cb pointer to compile block |
| 9539 | |
| 9540 | Returns: TRUE if all is well |
| 9541 | FALSE otherwise, with error code and offset set |
| 9542 | */ |
| 9543 | |
| 9544 | static BOOL |
| 9545 | set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr, |
| 9546 | parsed_recurse_check *recurses, compile_block *cb) |
| 9547 | { |
| 9548 | PCRE2_SIZE offset; |
| 9549 | int branchlength; |
| 9550 | uint32_t *bptr = *pptrptr; |
| 9551 | |
| 9552 | READPLUSOFFSET(offset, bptr); /* Offset for error messages */ |
| 9553 | *pptrptr += SIZEOFFSET; |
| 9554 | |
| 9555 | do |
| 9556 | { |
| 9557 | *pptrptr += 1; |
| 9558 | branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); |
| 9559 | if (branchlength < 0) |
| 9560 | { |
| 9561 | /* The errorcode and offset may already be set from a nested lookbehind. */ |
| 9562 | if (*errcodeptr == 0) *errcodeptr = ERR25; |
| 9563 | if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset; |
| 9564 | return FALSE; |
| 9565 | } |
| 9566 | if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength; |
| 9567 | *bptr |= branchlength; /* branchlength never more than 65535 */ |
| 9568 | bptr = *pptrptr; |
| 9569 | } |
| 9570 | while (*bptr == META_ALT); |
| 9571 | |
| 9572 | return TRUE; |
| 9573 | } |
| 9574 | |
| 9575 | |
| 9576 | |
| 9577 | /************************************************* |
| 9578 | * Check parsed pattern lookbehinds * |
| 9579 | *************************************************/ |
| 9580 | |
| 9581 | /* This function is called at the end of parsing a pattern if any lookbehinds |
| 9582 | were encountered. It scans the parsed pattern for them, calling |
| 9583 | set_lookbehind_lengths() for each one. At the start, the errorcode is zero and |
| 9584 | the error offset is marked unset. The enables the functions above not to |
| 9585 | override settings from deeper nestings. |
| 9586 | |
| 9587 | This function is called recursively from get_branchlength() for lookaheads in |
| 9588 | order to process any lookbehinds that they may contain. It stops when it hits a |
| 9589 | non-nested closing parenthesis in this case, returning a pointer to it. |
| 9590 | |
| 9591 | Arguments |
| 9592 | pptr points to where to start (start of pattern or start of lookahead) |
| 9593 | retptr if not NULL, return the ket pointer here |
| 9594 | recurses chain of recurse_check to catch mutual recursion |
| 9595 | cb points to the compile block |
| 9596 | lcptr points to loop counter |
| 9597 | |
| 9598 | Returns: 0 on success, or an errorcode (cb->erroroffset will be set) |
| 9599 | */ |
| 9600 | |
| 9601 | static int |
| 9602 | check_lookbehinds(uint32_t *pptr, uint32_t **retptr, |
| 9603 | parsed_recurse_check *recurses, compile_block *cb, int *lcptr) |
| 9604 | { |
| 9605 | int errorcode = 0; |
| 9606 | int nestlevel = 0; |
| 9607 | |
| 9608 | cb->erroroffset = PCRE2_UNSET; |
| 9609 | |
| 9610 | for (; *pptr != META_END; pptr++) |
| 9611 | { |
| 9612 | if (*pptr < META_END) continue; /* Literal */ |
| 9613 | |
| 9614 | switch (META_CODE(*pptr)) |
| 9615 | { |
| 9616 | default: |
| 9617 | return ERR70; /* Unrecognized meta code */ |
| 9618 | |
| 9619 | case META_ESCAPE: |
| 9620 | if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p) |
| 9621 | pptr += 1; |
| 9622 | break; |
| 9623 | |
| 9624 | case META_KET: |
| 9625 | if (--nestlevel < 0) |
| 9626 | { |
| 9627 | if (retptr != NULL) *retptr = pptr; |
| 9628 | return 0; |
| 9629 | } |
| 9630 | break; |
| 9631 | |
| 9632 | case META_ATOMIC: |
| 9633 | case META_CAPTURE: |
| 9634 | case META_COND_ASSERT: |
| 9635 | case META_LOOKAHEAD: |
| 9636 | case META_LOOKAHEADNOT: |
| 9637 | case META_LOOKAHEAD_NA: |
| 9638 | case META_NOCAPTURE: |
| 9639 | case META_SCRIPT_RUN: |
| 9640 | nestlevel++; |
| 9641 | break; |
| 9642 | |
| 9643 | case META_ACCEPT: |
| 9644 | case META_ALT: |
| 9645 | case META_ASTERISK: |
| 9646 | case META_ASTERISK_PLUS: |
| 9647 | case META_ASTERISK_QUERY: |
| 9648 | case META_BACKREF: |
| 9649 | case META_CIRCUMFLEX: |
| 9650 | case META_CLASS: |
| 9651 | case META_CLASS_EMPTY: |
| 9652 | case META_CLASS_EMPTY_NOT: |
| 9653 | case META_CLASS_END: |
| 9654 | case META_CLASS_NOT: |
| 9655 | case META_COMMIT: |
| 9656 | case META_DOLLAR: |
| 9657 | case META_DOT: |
| 9658 | case META_FAIL: |
| 9659 | case META_PLUS: |
| 9660 | case META_PLUS_PLUS: |
| 9661 | case META_PLUS_QUERY: |
| 9662 | case META_PRUNE: |
| 9663 | case META_QUERY: |
| 9664 | case META_QUERY_PLUS: |
| 9665 | case META_QUERY_QUERY: |
| 9666 | case META_RANGE_ESCAPED: |
| 9667 | case META_RANGE_LITERAL: |
| 9668 | case META_SKIP: |
| 9669 | case META_THEN: |
| 9670 | break; |
| 9671 | |
| 9672 | case META_RECURSE: |
| 9673 | pptr += SIZEOFFSET; |
| 9674 | break; |
| 9675 | |
| 9676 | case META_BACKREF_BYNAME: |
| 9677 | case META_RECURSE_BYNAME: |
| 9678 | pptr += 1 + SIZEOFFSET; |
| 9679 | break; |
| 9680 | |
| 9681 | case META_COND_DEFINE: |
| 9682 | pptr += SIZEOFFSET; |
| 9683 | nestlevel++; |
| 9684 | break; |
| 9685 | |
| 9686 | case META_COND_NAME: |
| 9687 | case META_COND_NUMBER: |
| 9688 | case META_COND_RNAME: |
| 9689 | case META_COND_RNUMBER: |
| 9690 | pptr += 1 + SIZEOFFSET; |
| 9691 | nestlevel++; |
| 9692 | break; |
| 9693 | |
| 9694 | case META_COND_VERSION: |
| 9695 | pptr += 3; |
| 9696 | nestlevel++; |
| 9697 | break; |
| 9698 | |
| 9699 | case META_CALLOUT_STRING: |
| 9700 | pptr += 3 + SIZEOFFSET; |
| 9701 | break; |
| 9702 | |
| 9703 | case META_BIGVALUE: |
| 9704 | case META_OPTIONS: |
| 9705 | case META_POSIX: |
| 9706 | case META_POSIX_NEG: |
| 9707 | pptr += 1; |
| 9708 | break; |
| 9709 | |
| 9710 | case META_MINMAX: |
| 9711 | case META_MINMAX_QUERY: |
| 9712 | case META_MINMAX_PLUS: |
| 9713 | pptr += 2; |
| 9714 | break; |
| 9715 | |
| 9716 | case META_CALLOUT_NUMBER: |
| 9717 | pptr += 3; |
| 9718 | break; |
| 9719 | |
| 9720 | case META_MARK: |
| 9721 | case META_COMMIT_ARG: |
| 9722 | case META_PRUNE_ARG: |
| 9723 | case META_SKIP_ARG: |
| 9724 | case META_THEN_ARG: |
| 9725 | pptr += 1 + pptr[1]; |
| 9726 | break; |
| 9727 | |
| 9728 | case META_LOOKBEHIND: |
| 9729 | case META_LOOKBEHINDNOT: |
| 9730 | case META_LOOKBEHIND_NA: |
| 9731 | if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb)) |
| 9732 | return errorcode; |
| 9733 | break; |
| 9734 | } |
| 9735 | } |
| 9736 | |
| 9737 | return 0; |
| 9738 | } |
| 9739 | |
| 9740 | |
| 9741 | |
| 9742 | /************************************************* |
| 9743 | * External function to compile a pattern * |
| 9744 | *************************************************/ |
| 9745 | |
| 9746 | /* This function reads a regular expression in the form of a string and returns |
| 9747 | a pointer to a block of store holding a compiled version of the expression. |
| 9748 | |
| 9749 | Arguments: |
| 9750 | pattern the regular expression |
| 9751 | patlen the length of the pattern, or PCRE2_ZERO_TERMINATED |
| 9752 | options option bits |
| 9753 | errorptr pointer to errorcode |
| 9754 | erroroffset pointer to error offset |
| 9755 | ccontext points to a compile context or is NULL |
| 9756 | |
| 9757 | Returns: pointer to compiled data block, or NULL on error, |
| 9758 | with errorcode and erroroffset set |
| 9759 | */ |
| 9760 | |
| 9761 | PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION |
| 9762 | pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, |
| 9763 | int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) |
| 9764 | { |
| 9765 | BOOL utf; /* Set TRUE for UTF mode */ |
| 9766 | BOOL ucp; /* Set TRUE for UCP mode */ |
| 9767 | BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ |
| 9768 | BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ |
| 9769 | pcre2_real_code *re = NULL; /* What we will return */ |
| 9770 | compile_block cb; /* "Static" compile-time data */ |
| 9771 | const uint8_t *tables; /* Char tables base pointer */ |
| 9772 | |
| 9773 | PCRE2_UCHAR *code; /* Current pointer in compiled code */ |
| 9774 | PCRE2_SPTR codestart; /* Start of compiled code */ |
| 9775 | PCRE2_SPTR ptr; /* Current pointer in pattern */ |
| 9776 | uint32_t *pptr; /* Current pointer in parsed pattern */ |
| 9777 | |
| 9778 | PCRE2_SIZE length = 1; /* Allow for final END opcode */ |
| 9779 | PCRE2_SIZE usedlength; /* Actual length used */ |
| 9780 | PCRE2_SIZE re_blocksize; /* Size of memory block */ |
| 9781 | PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */ |
| 9782 | PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */ |
| 9783 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 9784 | uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 9785 | uint32_t firstcu, reqcu; /* Value of first/req code unit */ |
| 9786 | uint32_t setflags = 0; /* NL and BSR set flags */ |
| 9787 | |
| 9788 | uint32_t skipatstart; /* When checking (*UTF) etc */ |
| 9789 | uint32_t limit_heap = UINT32_MAX; |
| 9790 | uint32_t limit_match = UINT32_MAX; /* Unset match limits */ |
| 9791 | uint32_t limit_depth = UINT32_MAX; |
| 9792 | |
| 9793 | int newline = 0; /* Unset; can be set by the pattern */ |
| 9794 | int bsr = 0; /* Unset; can be set by the pattern */ |
| 9795 | int errorcode = 0; /* Initialize to avoid compiler warn */ |
| 9796 | int regexrc; /* Return from compile */ |
| 9797 | |
| 9798 | uint32_t i; /* Local loop counter */ |
| 9799 | |
| 9800 | /* Comments at the head of this file explain about these variables. */ |
| 9801 | |
| 9802 | uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE]; |
| 9803 | uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE]; |
| 9804 | named_group named_groups[NAMED_GROUP_LIST_SIZE]; |
| 9805 | |
| 9806 | /* The workspace is used in different ways in the different compiling phases. |
| 9807 | It needs to be 16-bit aligned for the preliminary parsing scan. */ |
| 9808 | |
| 9809 | uint32_t c16workspace[C16_WORK_SIZE]; |
| 9810 | PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace; |
| 9811 | |
| 9812 | |
| 9813 | /* -------------- Check arguments and set up the pattern ----------------- */ |
| 9814 | |
| 9815 | /* There must be error code and offset pointers. */ |
| 9816 | |
| 9817 | if (errorptr == NULL || erroroffset == NULL) return NULL; |
| 9818 | *errorptr = ERR0; |
| 9819 | *erroroffset = 0; |
| 9820 | |
| 9821 | /* There must be a pattern! */ |
| 9822 | |
| 9823 | if (pattern == NULL) |
| 9824 | { |
| 9825 | *errorptr = ERR16; |
| 9826 | return NULL; |
| 9827 | } |
| 9828 | |
| 9829 | /* A NULL compile context means "use a default context" */ |
| 9830 | |
| 9831 | if (ccontext == NULL) |
| 9832 | ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); |
| 9833 | |
| 9834 | /* PCRE2_MATCH_INVALID_UTF implies UTF */ |
| 9835 | |
| 9836 | if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF; |
| 9837 | |
| 9838 | /* Check that all undefined public option bits are zero. */ |
| 9839 | |
| 9840 | if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 || |
| 9841 | (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0) |
| 9842 | { |
| 9843 | *errorptr = ERR17; |
| 9844 | return NULL; |
| 9845 | } |
| 9846 | |
| 9847 | if ((options & PCRE2_LITERAL) != 0 && |
| 9848 | ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 || |
| 9849 | (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0)) |
| 9850 | { |
| 9851 | *errorptr = ERR92; |
| 9852 | return NULL; |
| 9853 | } |
| 9854 | |
| 9855 | /* A zero-terminated pattern is indicated by the special length value |
| 9856 | PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */ |
| 9857 | |
| 9858 | if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED))) |
| 9859 | patlen = PRIV(strlen)(pattern); |
| 9860 | |
| 9861 | if (patlen > ccontext->max_pattern_length) |
| 9862 | { |
| 9863 | *errorptr = ERR88; |
| 9864 | return NULL; |
| 9865 | } |
| 9866 | |
| 9867 | /* From here on, all returns from this function should end up going via the |
| 9868 | EXIT label. */ |
| 9869 | |
| 9870 | |
| 9871 | /* ------------ Initialize the "static" compile data -------------- */ |
| 9872 | |
| 9873 | tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables); |
| 9874 | |
| 9875 | cb.lcc = tables + lcc_offset; /* Individual */ |
| 9876 | cb.fcc = tables + fcc_offset; /* character */ |
| 9877 | cb.cbits = tables + cbits_offset; /* tables */ |
| 9878 | cb.ctypes = tables + ctypes_offset; |
| 9879 | |
| 9880 | cb.assert_depth = 0; |
| 9881 | cb.bracount = 0; |
| 9882 | cb.cx = ccontext; |
| 9883 | cb.dupnames = FALSE; |
| 9884 | cb.end_pattern = pattern + patlen; |
| 9885 | cb.erroroffset = 0; |
| 9886 | cb.external_flags = 0; |
| 9887 | cb.external_options = options; |
| 9888 | cb.groupinfo = stack_groupinfo; |
| 9889 | cb.had_recurse = FALSE; |
| 9890 | cb.lastcapture = 0; |
| 9891 | cb.max_lookbehind = 0; |
| 9892 | cb.name_entry_size = 0; |
| 9893 | cb.name_table = NULL; |
| 9894 | cb.named_groups = named_groups; |
| 9895 | cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; |
| 9896 | cb.names_found = 0; |
| 9897 | cb.open_caps = NULL; |
| 9898 | cb.parens_depth = 0; |
| 9899 | cb.parsed_pattern = stack_parsed_pattern; |
| 9900 | cb.req_varyopt = 0; |
| 9901 | cb.start_code = cworkspace; |
| 9902 | cb.start_pattern = pattern; |
| 9903 | cb.start_workspace = cworkspace; |
| 9904 | cb.workspace_size = COMPILE_WORK_SIZE; |
| 9905 | |
| 9906 | /* Maximum back reference and backref bitmap. The bitmap records up to 31 back |
| 9907 | references to help in deciding whether (.*) can be treated as anchored or not. |
| 9908 | */ |
| 9909 | |
| 9910 | cb.top_backref = 0; |
| 9911 | cb.backref_map = 0; |
| 9912 | |
| 9913 | /* Escape sequences \1 to \9 are always back references, but as they are only |
| 9914 | two characters long, only two elements can be used in the parsed_pattern |
| 9915 | vector. The first contains the reference, and we'd like to use the second to |
| 9916 | record the offset in the pattern, so that forward references to non-existent |
| 9917 | groups can be diagnosed later with an offset. However, on 64-bit systems, |
| 9918 | PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first |
| 9919 | occurrence of \1 to \9, indexed by the second parsed_pattern value. All other |
| 9920 | references have enough space for the offset to be put into the parsed pattern. |
| 9921 | */ |
| 9922 | |
| 9923 | for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET; |
| 9924 | |
| 9925 | |
| 9926 | /* --------------- Start looking at the pattern --------------- */ |
| 9927 | |
| 9928 | /* Unless PCRE2_LITERAL is set, check for global one-time option settings at |
| 9929 | the start of the pattern, and remember the offset to the actual regex. With |
| 9930 | valgrind support, make the terminator of a zero-terminated pattern |
| 9931 | inaccessible. This catches bugs that would otherwise only show up for |
| 9932 | non-zero-terminated patterns. */ |
| 9933 | |
| 9934 | #ifdef SUPPORT_VALGRIND |
| 9935 | if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1)); |
| 9936 | #endif |
| 9937 | |
| 9938 | ptr = pattern; |
| 9939 | skipatstart = 0; |
| 9940 | |
| 9941 | if ((options & PCRE2_LITERAL) == 0) |
| 9942 | { |
| 9943 | while (patlen - skipatstart >= 2 && |
| 9944 | ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && |
| 9945 | ptr[skipatstart+1] == CHAR_ASTERISK) |
| 9946 | { |
| 9947 | for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) |
| 9948 | { |
| 9949 | uint32_t c, pp; |
| 9950 | pso *p = pso_list + i; |
| 9951 | |
| 9952 | if (patlen - skipatstart - 2 >= p->length && |
| 9953 | PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name), |
| 9954 | p->length) == 0) |
| 9955 | { |
| 9956 | skipatstart += p->length + 2; |
| 9957 | switch(p->type) |
| 9958 | { |
| 9959 | case PSO_OPT: |
| 9960 | cb.external_options |= p->value; |
| 9961 | break; |
| 9962 | |
| 9963 | case PSO_FLG: |
| 9964 | setflags |= p->value; |
| 9965 | break; |
| 9966 | |
| 9967 | case PSO_NL: |
| 9968 | newline = p->value; |
| 9969 | setflags |= PCRE2_NL_SET; |
| 9970 | break; |
| 9971 | |
| 9972 | case PSO_BSR: |
| 9973 | bsr = p->value; |
| 9974 | setflags |= PCRE2_BSR_SET; |
| 9975 | break; |
| 9976 | |
| 9977 | case PSO_LIMM: |
| 9978 | case PSO_LIMD: |
| 9979 | case PSO_LIMH: |
| 9980 | c = 0; |
| 9981 | pp = skipatstart; |
| 9982 | if (!IS_DIGIT(ptr[pp])) |
| 9983 | { |
| 9984 | errorcode = ERR60; |
| 9985 | ptr += pp; |
| 9986 | goto HAD_EARLY_ERROR; |
| 9987 | } |
| 9988 | while (IS_DIGIT(ptr[pp])) |
| 9989 | { |
| 9990 | if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ |
| 9991 | c = c*10 + (ptr[pp++] - CHAR_0); |
| 9992 | } |
| 9993 | if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) |
| 9994 | { |
| 9995 | errorcode = ERR60; |
| 9996 | ptr += pp; |
| 9997 | goto HAD_EARLY_ERROR; |
| 9998 | } |
| 9999 | if (p->type == PSO_LIMH) limit_heap = c; |
| 10000 | else if (p->type == PSO_LIMM) limit_match = c; |
| 10001 | else limit_depth = c; |
| 10002 | skipatstart += pp - skipatstart; |
| 10003 | break; |
| 10004 | } |
| 10005 | break; /* Out of the table scan loop */ |
| 10006 | } |
| 10007 | } |
| 10008 | if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ |
| 10009 | } |
| 10010 | } |
| 10011 | |
| 10012 | /* End of pattern-start options; advance to start of real regex. */ |
| 10013 | |
| 10014 | ptr += skipatstart; |
| 10015 | |
| 10016 | /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */ |
| 10017 | |
| 10018 | #ifndef SUPPORT_UNICODE |
| 10019 | if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0) |
| 10020 | { |
| 10021 | errorcode = ERR32; |
| 10022 | goto HAD_EARLY_ERROR; |
| 10023 | } |
| 10024 | #endif |
| 10025 | |
| 10026 | /* Check UTF. We have the original options in 'options', with that value as |
| 10027 | modified by (*UTF) etc in cb->external_options. The extra option |
| 10028 | PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the |
| 10029 | surrogate code points cannot be represented in UTF-16. */ |
| 10030 | |
| 10031 | utf = (cb.external_options & PCRE2_UTF) != 0; |
| 10032 | if (utf) |
| 10033 | { |
| 10034 | if ((options & PCRE2_NEVER_UTF) != 0) |
| 10035 | { |
| 10036 | errorcode = ERR74; |
| 10037 | goto HAD_EARLY_ERROR; |
| 10038 | } |
| 10039 | if ((options & PCRE2_NO_UTF_CHECK) == 0 && |
| 10040 | (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) |
| 10041 | goto HAD_ERROR; /* Offset was set by valid_utf() */ |
| 10042 | |
| 10043 | #if PCRE2_CODE_UNIT_WIDTH == 16 |
| 10044 | if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0) |
| 10045 | { |
| 10046 | errorcode = ERR91; |
| 10047 | goto HAD_EARLY_ERROR; |
| 10048 | } |
| 10049 | #endif |
| 10050 | } |
| 10051 | |
| 10052 | /* Check UCP lockout. */ |
| 10053 | |
| 10054 | ucp = (cb.external_options & PCRE2_UCP) != 0; |
| 10055 | if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0) |
| 10056 | { |
| 10057 | errorcode = ERR75; |
| 10058 | goto HAD_EARLY_ERROR; |
| 10059 | } |
| 10060 | |
| 10061 | /* Process the BSR setting. */ |
| 10062 | |
| 10063 | if (bsr == 0) bsr = ccontext->bsr_convention; |
| 10064 | |
| 10065 | /* Process the newline setting. */ |
| 10066 | |
| 10067 | if (newline == 0) newline = ccontext->newline_convention; |
| 10068 | cb.nltype = NLTYPE_FIXED; |
| 10069 | switch(newline) |
| 10070 | { |
| 10071 | case PCRE2_NEWLINE_CR: |
| 10072 | cb.nllen = 1; |
| 10073 | cb.nl[0] = CHAR_CR; |
| 10074 | break; |
| 10075 | |
| 10076 | case PCRE2_NEWLINE_LF: |
| 10077 | cb.nllen = 1; |
| 10078 | cb.nl[0] = CHAR_NL; |
| 10079 | break; |
| 10080 | |
| 10081 | case PCRE2_NEWLINE_NUL: |
| 10082 | cb.nllen = 1; |
| 10083 | cb.nl[0] = CHAR_NUL; |
| 10084 | break; |
| 10085 | |
| 10086 | case PCRE2_NEWLINE_CRLF: |
| 10087 | cb.nllen = 2; |
| 10088 | cb.nl[0] = CHAR_CR; |
| 10089 | cb.nl[1] = CHAR_NL; |
| 10090 | break; |
| 10091 | |
| 10092 | case PCRE2_NEWLINE_ANY: |
| 10093 | cb.nltype = NLTYPE_ANY; |
| 10094 | break; |
| 10095 | |
| 10096 | case PCRE2_NEWLINE_ANYCRLF: |
| 10097 | cb.nltype = NLTYPE_ANYCRLF; |
| 10098 | break; |
| 10099 | |
| 10100 | default: |
| 10101 | errorcode = ERR56; |
| 10102 | goto HAD_EARLY_ERROR; |
| 10103 | } |
| 10104 | |
| 10105 | /* Pre-scan the pattern to do two things: (1) Discover the named groups and |
| 10106 | their numerical equivalents, so that this information is always available for |
| 10107 | the remaining processing. (2) At the same time, parse the pattern and put a |
| 10108 | processed version into the parsed_pattern vector. This has escapes interpreted |
| 10109 | and comments removed (amongst other things). |
| 10110 | |
| 10111 | In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned |
| 10112 | 32-bit ints in the parsed pattern is bounded by the length of the pattern plus |
| 10113 | one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is |
| 10114 | set. The exceptional case is when running in 32-bit, non-UTF mode, when literal |
| 10115 | characters greater than META_END (0x80000000) have to be coded as two units. In |
| 10116 | this case, therefore, we scan the pattern to check for such values. */ |
| 10117 | |
| 10118 | #if PCRE2_CODE_UNIT_WIDTH == 32 |
| 10119 | if (!utf) |
| 10120 | { |
| 10121 | PCRE2_SPTR p; |
| 10122 | for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++; |
| 10123 | } |
| 10124 | #endif |
| 10125 | |
| 10126 | /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT |
| 10127 | is set we have to assume a numerical callout (4 elements) for each character |
| 10128 | plus one at the end. This is overkill, but memory is plentiful these days. For |
| 10129 | many smaller patterns the vector on the stack (which was set up above) can be |
| 10130 | used. */ |
| 10131 | |
| 10132 | parsed_size_needed = patlen - skipatstart + big32count; |
| 10133 | |
| 10134 | if ((ccontext->extra_options & |
| 10135 | (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0) |
| 10136 | parsed_size_needed += 4; |
| 10137 | |
| 10138 | if ((options & PCRE2_AUTO_CALLOUT) != 0) |
| 10139 | parsed_size_needed = (parsed_size_needed + 1) * 5; |
| 10140 | |
| 10141 | if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE) |
| 10142 | { |
| 10143 | uint32_t *heap_parsed_pattern = ccontext->memctl.malloc( |
| 10144 | (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data); |
| 10145 | if (heap_parsed_pattern == NULL) |
| 10146 | { |
| 10147 | *errorptr = ERR21; |
| 10148 | goto EXIT; |
| 10149 | } |
| 10150 | cb.parsed_pattern = heap_parsed_pattern; |
| 10151 | } |
| 10152 | cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1; |
| 10153 | |
| 10154 | /* Do the parsing scan. */ |
| 10155 | |
| 10156 | errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb); |
| 10157 | if (errorcode != 0) goto HAD_CB_ERROR; |
| 10158 | |
| 10159 | /* Workspace is needed to remember information about numbered groups: whether a |
| 10160 | group can match an empty string and what its fixed length is. This is done to |
| 10161 | avoid the possibility of recursive references causing very long compile times |
| 10162 | when checking these features. Unnumbered groups do not have this exposure since |
| 10163 | they cannot be referenced. We use an indexed vector for this purpose. If there |
| 10164 | are sufficiently few groups, the default vector on the stack, as set up above, |
| 10165 | can be used. Otherwise we have to get/free a special vector. The vector must be |
| 10166 | initialized to zero. */ |
| 10167 | |
| 10168 | if (cb.bracount >= GROUPINFO_DEFAULT_SIZE) |
| 10169 | { |
| 10170 | cb.groupinfo = ccontext->memctl.malloc( |
| 10171 | (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data); |
| 10172 | if (cb.groupinfo == NULL) |
| 10173 | { |
| 10174 | errorcode = ERR21; |
| 10175 | cb.erroroffset = 0; |
| 10176 | goto HAD_CB_ERROR; |
| 10177 | } |
| 10178 | } |
| 10179 | memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t)); |
| 10180 | |
| 10181 | /* If there were any lookbehinds, scan the parsed pattern to figure out their |
| 10182 | lengths. */ |
| 10183 | |
| 10184 | if (has_lookbehind) |
| 10185 | { |
| 10186 | int loopcount = 0; |
| 10187 | errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount); |
| 10188 | if (errorcode != 0) goto HAD_CB_ERROR; |
| 10189 | } |
| 10190 | |
| 10191 | /* For debugging, there is a function that shows the parsed data vector. */ |
| 10192 | |
| 10193 | #ifdef DEBUG_SHOW_PARSED |
| 10194 | fprintf(stderr, "+++ Pre-scan complete:\n"); |
| 10195 | show_parsed(&cb); |
| 10196 | #endif |
| 10197 | |
| 10198 | /* For debugging capturing information this code can be enabled. */ |
| 10199 | |
| 10200 | #ifdef DEBUG_SHOW_CAPTURES |
| 10201 | { |
| 10202 | named_group *ng = cb.named_groups; |
| 10203 | fprintf(stderr, "+++Captures: %d\n", cb.bracount); |
| 10204 | for (i = 0; i < cb.names_found; i++, ng++) |
| 10205 | { |
| 10206 | fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name); |
| 10207 | } |
| 10208 | } |
| 10209 | #endif |
| 10210 | |
| 10211 | /* Pretend to compile the pattern while actually just accumulating the amount |
| 10212 | of memory required in the 'length' variable. This behaviour is triggered by |
| 10213 | passing a non-NULL final argument to compile_regex(). We pass a block of |
| 10214 | workspace (cworkspace) for it to compile parts of the pattern into; the |
| 10215 | compiled code is discarded when it is no longer needed, so hopefully this |
| 10216 | workspace will never overflow, though there is a test for its doing so. |
| 10217 | |
| 10218 | On error, errorcode will be set non-zero, so we don't need to look at the |
| 10219 | result of the function. The initial options have been put into the cb block, |
| 10220 | but we still have to pass a separate options variable (the first argument) |
| 10221 | because the options may change as the pattern is processed. */ |
| 10222 | |
| 10223 | cb.erroroffset = patlen; /* For any subsequent errors that do not set it */ |
| 10224 | pptr = cb.parsed_pattern; |
| 10225 | code = cworkspace; |
| 10226 | *code = OP_BRA; |
| 10227 | |
| 10228 | (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu, |
| 10229 | &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length); |
| 10230 | |
| 10231 | if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */ |
| 10232 | |
| 10233 | /* This should be caught in compile_regex(), but just in case... */ |
| 10234 | |
| 10235 | if (length > MAX_PATTERN_SIZE) |
| 10236 | { |
| 10237 | errorcode = ERR20; |
| 10238 | goto HAD_CB_ERROR; |
| 10239 | } |
| 10240 | |
| 10241 | /* Compute the size of, and then get and initialize, the data block for storing |
| 10242 | the compiled pattern and names table. Integer overflow should no longer be |
| 10243 | possible because nowadays we limit the maximum value of cb.names_found and |
| 10244 | cb.name_entry_size. */ |
| 10245 | |
| 10246 | re_blocksize = sizeof(pcre2_real_code) + |
| 10247 | CU2BYTES(length + |
| 10248 | (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size); |
| 10249 | re = (pcre2_real_code *) |
| 10250 | ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data); |
| 10251 | if (re == NULL) |
| 10252 | { |
| 10253 | errorcode = ERR21; |
| 10254 | goto HAD_CB_ERROR; |
| 10255 | } |
| 10256 | |
| 10257 | /* The compiler may put padding at the end of the pcre2_real_code structure in |
| 10258 | order to round it up to a multiple of 4 or 8 bytes. This means that when a |
| 10259 | compiled pattern is copied (for example, when serialized) undefined bytes are |
| 10260 | read, and this annoys debuggers such as valgrind. To avoid this, we explicitly |
| 10261 | write to the last 8 bytes of the structure before setting the fields. */ |
| 10262 | |
| 10263 | memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8); |
| 10264 | re->memctl = ccontext->memctl; |
| 10265 | re->tables = tables; |
| 10266 | re->executable_jit = NULL; |
| 10267 | memset(re->start_bitmap, 0, 32 * sizeof(uint8_t)); |
| 10268 | re->blocksize = re_blocksize; |
| 10269 | re->magic_number = MAGIC_NUMBER; |
| 10270 | re->compile_options = options; |
| 10271 | re->overall_options = cb.external_options; |
| 10272 | re->extra_options = ccontext->extra_options; |
| 10273 | re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags; |
| 10274 | re->limit_heap = limit_heap; |
| 10275 | re->limit_match = limit_match; |
| 10276 | re->limit_depth = limit_depth; |
| 10277 | re->first_codeunit = 0; |
| 10278 | re->last_codeunit = 0; |
| 10279 | re->bsr_convention = bsr; |
| 10280 | re->newline_convention = newline; |
| 10281 | re->max_lookbehind = 0; |
| 10282 | re->minlength = 0; |
| 10283 | re->top_bracket = 0; |
| 10284 | re->top_backref = 0; |
| 10285 | re->name_entry_size = cb.name_entry_size; |
| 10286 | re->name_count = cb.names_found; |
| 10287 | |
| 10288 | /* The basic block is immediately followed by the name table, and the compiled |
| 10289 | code follows after that. */ |
| 10290 | |
| 10291 | codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + |
| 10292 | re->name_entry_size * re->name_count; |
| 10293 | |
| 10294 | /* Update the compile data block for the actual compile. The starting points of |
| 10295 | the name/number translation table and of the code are passed around in the |
| 10296 | compile data block. The start/end pattern and initial options are already set |
| 10297 | from the pre-compile phase, as is the name_entry_size field. */ |
| 10298 | |
| 10299 | cb.parens_depth = 0; |
| 10300 | cb.assert_depth = 0; |
| 10301 | cb.lastcapture = 0; |
| 10302 | cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); |
| 10303 | cb.start_code = codestart; |
| 10304 | cb.req_varyopt = 0; |
| 10305 | cb.had_accept = FALSE; |
| 10306 | cb.had_pruneorskip = FALSE; |
| 10307 | cb.open_caps = NULL; |
| 10308 | |
| 10309 | /* If any named groups were found, create the name/number table from the list |
| 10310 | created in the pre-pass. */ |
| 10311 | |
| 10312 | if (cb.names_found > 0) |
| 10313 | { |
| 10314 | named_group *ng = cb.named_groups; |
| 10315 | for (i = 0; i < cb.names_found; i++, ng++) |
| 10316 | add_name_to_table(&cb, ng->name, ng->length, ng->number, i); |
| 10317 | } |
| 10318 | |
| 10319 | /* Set up a starting, non-extracting bracket, then compile the expression. On |
| 10320 | error, errorcode will be set non-zero, so we don't need to look at the result |
| 10321 | of the function here. */ |
| 10322 | |
| 10323 | pptr = cb.parsed_pattern; |
| 10324 | code = (PCRE2_UCHAR *)codestart; |
| 10325 | *code = OP_BRA; |
| 10326 | regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0, |
| 10327 | &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL); |
| 10328 | if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY; |
| 10329 | re->top_bracket = cb.bracount; |
| 10330 | re->top_backref = cb.top_backref; |
| 10331 | re->max_lookbehind = cb.max_lookbehind; |
| 10332 | |
| 10333 | if (cb.had_accept) |
| 10334 | { |
| 10335 | reqcu = 0; /* Must disable after (*ACCEPT) */ |
| 10336 | reqcuflags = REQ_NONE; |
| 10337 | re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */ |
| 10338 | } |
| 10339 | |
| 10340 | /* Fill in the final opcode and check for disastrous overflow. If no overflow, |
| 10341 | but the estimated length exceeds the really used length, adjust the value of |
| 10342 | re->blocksize, and if valgrind support is configured, mark the extra allocated |
| 10343 | memory as unaddressable, so that any out-of-bound reads can be detected. */ |
| 10344 | |
| 10345 | *code++ = OP_END; |
| 10346 | usedlength = code - codestart; |
| 10347 | if (usedlength > length) errorcode = ERR23; else |
| 10348 | { |
| 10349 | re->blocksize -= CU2BYTES(length - usedlength); |
| 10350 | #ifdef SUPPORT_VALGRIND |
| 10351 | VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength)); |
| 10352 | #endif |
| 10353 | } |
| 10354 | |
| 10355 | /* Scan the pattern for recursion/subroutine calls and convert the group |
| 10356 | numbers into offsets. Maintain a small cache so that repeated groups containing |
| 10357 | recursions are efficiently handled. */ |
| 10358 | |
| 10359 | #define RSCAN_CACHE_SIZE 8 |
| 10360 | |
| 10361 | if (errorcode == 0 && cb.had_recurse) |
| 10362 | { |
| 10363 | PCRE2_UCHAR *rcode; |
| 10364 | PCRE2_SPTR rgroup; |
| 10365 | unsigned int ccount = 0; |
| 10366 | int start = RSCAN_CACHE_SIZE; |
| 10367 | recurse_cache rc[RSCAN_CACHE_SIZE]; |
| 10368 | |
| 10369 | for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf); |
| 10370 | rcode != NULL; |
| 10371 | rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf)) |
| 10372 | { |
| 10373 | int p, groupnumber; |
| 10374 | |
| 10375 | groupnumber = (int)GET(rcode, 1); |
| 10376 | if (groupnumber == 0) rgroup = codestart; else |
| 10377 | { |
| 10378 | PCRE2_SPTR search_from = codestart; |
| 10379 | rgroup = NULL; |
| 10380 | for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) |
| 10381 | { |
| 10382 | if (groupnumber == rc[p].groupnumber) |
| 10383 | { |
| 10384 | rgroup = rc[p].group; |
| 10385 | break; |
| 10386 | } |
| 10387 | |
| 10388 | /* Group n+1 must always start to the right of group n, so we can save |
| 10389 | search time below when the new group number is greater than any of the |
| 10390 | previously found groups. */ |
| 10391 | |
| 10392 | if (groupnumber > rc[p].groupnumber) search_from = rc[p].group; |
| 10393 | } |
| 10394 | |
| 10395 | if (rgroup == NULL) |
| 10396 | { |
| 10397 | rgroup = PRIV(find_bracket)(search_from, utf, groupnumber); |
| 10398 | if (rgroup == NULL) |
| 10399 | { |
| 10400 | errorcode = ERR53; |
| 10401 | break; |
| 10402 | } |
| 10403 | if (--start < 0) start = RSCAN_CACHE_SIZE - 1; |
| 10404 | rc[start].groupnumber = groupnumber; |
| 10405 | rc[start].group = rgroup; |
| 10406 | if (ccount < RSCAN_CACHE_SIZE) ccount++; |
| 10407 | } |
| 10408 | } |
| 10409 | |
| 10410 | PUT(rcode, 1, rgroup - codestart); |
| 10411 | } |
| 10412 | } |
| 10413 | |
| 10414 | /* In rare debugging situations we sometimes need to look at the compiled code |
| 10415 | at this stage. */ |
| 10416 | |
| 10417 | #ifdef DEBUG_CALL_PRINTINT |
| 10418 | pcre2_printint(re, stderr, TRUE); |
| 10419 | fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength); |
| 10420 | #endif |
| 10421 | |
| 10422 | /* Unless disabled, check whether any single character iterators can be |
| 10423 | auto-possessified. The function overwrites the appropriate opcode values, so |
| 10424 | the type of the pointer must be cast. NOTE: the intermediate variable "temp" is |
| 10425 | used in this code because at least one compiler gives a warning about loss of |
| 10426 | "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the |
| 10427 | function call. */ |
| 10428 | |
| 10429 | if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) |
| 10430 | { |
| 10431 | PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; |
| 10432 | if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80; |
| 10433 | } |
| 10434 | |
| 10435 | /* Failed to compile, or error while post-processing. */ |
| 10436 | |
| 10437 | if (errorcode != 0) goto HAD_CB_ERROR; |
| 10438 | |
| 10439 | /* Successful compile. If the anchored option was not passed, set it if |
| 10440 | we can determine that the pattern is anchored by virtue of ^ characters or \A |
| 10441 | or anything else, such as starting with non-atomic .* when DOTALL is set and |
| 10442 | there are no occurrences of *PRUNE or *SKIP (though there is an option to |
| 10443 | disable this case). */ |
| 10444 | |
| 10445 | if ((re->overall_options & PCRE2_ANCHORED) == 0 && |
| 10446 | is_anchored(codestart, 0, &cb, 0, FALSE)) |
| 10447 | re->overall_options |= PCRE2_ANCHORED; |
| 10448 | |
| 10449 | /* Set up the first code unit or startline flag, the required code unit, and |
| 10450 | then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE |
| 10451 | is set, as the data it would create will not be used. Note that a first code |
| 10452 | unit (but not the startline flag) is useful for anchored patterns because it |
| 10453 | can still give a quick "no match" and also avoid searching for a last code |
| 10454 | unit. */ |
| 10455 | |
| 10456 | if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) |
| 10457 | { |
| 10458 | int minminlength = 0; /* For minimal minlength from first/required CU */ |
| 10459 | |
| 10460 | /* If we do not have a first code unit, see if there is one that is asserted |
| 10461 | (these are not saved during the compile because they can cause conflicts with |
| 10462 | actual literals that follow). */ |
| 10463 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 10464 | if (firstcuflags >= REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 10465 | firstcu = find_firstassertedcu(codestart, &firstcuflags, 0); |
| 10466 | |
| 10467 | /* Save the data for a first code unit. The existence of one means the |
| 10468 | minimum length must be at least 1. */ |
| 10469 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 10470 | if (firstcuflags < REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 10471 | { |
| 10472 | re->first_codeunit = firstcu; |
| 10473 | re->flags |= PCRE2_FIRSTSET; |
| 10474 | minminlength++; |
| 10475 | |
| 10476 | /* Handle caseless first code units. */ |
| 10477 | |
| 10478 | if ((firstcuflags & REQ_CASELESS) != 0) |
| 10479 | { |
| 10480 | if (firstcu < 128 || (!utf && !ucp && firstcu < 255)) |
| 10481 | { |
| 10482 | if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; |
| 10483 | } |
| 10484 | |
| 10485 | /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise. |
| 10486 | In 8-bit UTF mode, codepoints in the range 128-255 are introductory code |
| 10487 | points and cannot have another case, but if UCP is set they may do. */ |
| 10488 | |
| 10489 | #ifdef SUPPORT_UNICODE |
| 10490 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 10491 | else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu) |
| 10492 | re->flags |= PCRE2_FIRSTCASELESS; |
| 10493 | #else |
| 10494 | else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT && |
| 10495 | UCD_OTHERCASE(firstcu) != firstcu) |
| 10496 | re->flags |= PCRE2_FIRSTCASELESS; |
| 10497 | #endif |
| 10498 | #endif /* SUPPORT_UNICODE */ |
| 10499 | } |
| 10500 | } |
| 10501 | |
| 10502 | /* When there is no first code unit, for non-anchored patterns, see if we can |
| 10503 | set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all |
| 10504 | branches start with ^ and also when all branches start with non-atomic .* for |
| 10505 | non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option |
| 10506 | that disables this case.) */ |
| 10507 | |
| 10508 | else if ((re->overall_options & PCRE2_ANCHORED) == 0 && |
| 10509 | is_startline(codestart, 0, &cb, 0, FALSE)) |
| 10510 | re->flags |= PCRE2_STARTLINE; |
| 10511 | |
| 10512 | /* Handle the "required code unit", if one is set. In the UTF case we can |
| 10513 | increment the minimum minimum length only if we are sure this really is a |
| 10514 | different character and not a non-starting code unit of the first character, |
| 10515 | because the minimum length count is in characters, not code units. */ |
| 10516 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 10517 | if (reqcuflags < REQ_NONE) |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 10518 | { |
| 10519 | #if PCRE2_CODE_UNIT_WIDTH == 16 |
| 10520 | if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 10521 | firstcuflags >= REQ_NONE || /* First not set */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 10522 | (firstcu & 0xf800) != 0xd800 || /* First not surrogate */ |
| 10523 | (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */ |
| 10524 | #elif PCRE2_CODE_UNIT_WIDTH == 8 |
| 10525 | if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 10526 | firstcuflags >= REQ_NONE || /* First not set */ |
Elliott Hughes | 5b80804 | 2021-10-01 10:56:10 -0700 | [diff] [blame] | 10527 | (firstcu & 0x80) == 0 || /* First is ASCII */ |
| 10528 | (reqcu & 0x80) == 0) /* Req is ASCII */ |
| 10529 | #endif |
| 10530 | { |
| 10531 | minminlength++; |
| 10532 | } |
| 10533 | |
| 10534 | /* In the case of an anchored pattern, set up the value only if it follows |
| 10535 | a variable length item in the pattern. */ |
| 10536 | |
| 10537 | if ((re->overall_options & PCRE2_ANCHORED) == 0 || |
| 10538 | (reqcuflags & REQ_VARY) != 0) |
| 10539 | { |
| 10540 | re->last_codeunit = reqcu; |
| 10541 | re->flags |= PCRE2_LASTSET; |
| 10542 | |
| 10543 | /* Handle caseless required code units as for first code units (above). */ |
| 10544 | |
| 10545 | if ((reqcuflags & REQ_CASELESS) != 0) |
| 10546 | { |
| 10547 | if (reqcu < 128 || (!utf && !ucp && reqcu < 255)) |
| 10548 | { |
| 10549 | if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; |
| 10550 | } |
| 10551 | #ifdef SUPPORT_UNICODE |
| 10552 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 10553 | else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu) |
| 10554 | re->flags |= PCRE2_LASTCASELESS; |
| 10555 | #else |
| 10556 | else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT && |
| 10557 | UCD_OTHERCASE(reqcu) != reqcu) |
| 10558 | re->flags |= PCRE2_LASTCASELESS; |
| 10559 | #endif |
| 10560 | #endif /* SUPPORT_UNICODE */ |
| 10561 | } |
| 10562 | } |
| 10563 | } |
| 10564 | |
| 10565 | /* Study the compiled pattern to set up information such as a bitmap of |
| 10566 | starting code units and a minimum matching length. */ |
| 10567 | |
| 10568 | if (PRIV(study)(re) != 0) |
| 10569 | { |
| 10570 | errorcode = ERR31; |
| 10571 | goto HAD_CB_ERROR; |
| 10572 | } |
| 10573 | |
| 10574 | /* If study() set a bitmap of starting code units, it implies a minimum |
| 10575 | length of at least one. */ |
| 10576 | |
| 10577 | if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0) |
| 10578 | minminlength = 1; |
| 10579 | |
| 10580 | /* If the minimum length set (or not set) by study() is less than the minimum |
| 10581 | implied by required code units, override it. */ |
| 10582 | |
| 10583 | if (re->minlength < minminlength) re->minlength = minminlength; |
| 10584 | } /* End of start-of-match optimizations. */ |
| 10585 | |
| 10586 | /* Control ends up here in all cases. When running under valgrind, make a |
| 10587 | pattern's terminating zero defined again. If memory was obtained for the parsed |
| 10588 | version of the pattern, free it before returning. Also free the list of named |
| 10589 | groups if a larger one had to be obtained, and likewise the group information |
| 10590 | vector. */ |
| 10591 | |
| 10592 | EXIT: |
| 10593 | #ifdef SUPPORT_VALGRIND |
| 10594 | if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1)); |
| 10595 | #endif |
| 10596 | if (cb.parsed_pattern != stack_parsed_pattern) |
| 10597 | ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data); |
| 10598 | if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) |
| 10599 | ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); |
| 10600 | if (cb.groupinfo != stack_groupinfo) |
| 10601 | ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data); |
| 10602 | return re; /* Will be NULL after an error */ |
| 10603 | |
| 10604 | /* Errors discovered in parse_regex() set the offset value in the compile |
| 10605 | block. Errors discovered before it is called must compute it from the ptr |
| 10606 | value. After parse_regex() is called, the offset in the compile block is set to |
| 10607 | the end of the pattern, but certain errors in compile_regex() may reset it if |
| 10608 | an offset is available in the parsed pattern. */ |
| 10609 | |
| 10610 | HAD_CB_ERROR: |
| 10611 | ptr = pattern + cb.erroroffset; |
| 10612 | |
| 10613 | HAD_EARLY_ERROR: |
| 10614 | *erroroffset = ptr - pattern; |
| 10615 | |
| 10616 | HAD_ERROR: |
| 10617 | *errorptr = errorcode; |
| 10618 | pcre2_code_free(re); |
| 10619 | re = NULL; |
| 10620 | goto EXIT; |
| 10621 | } |
| 10622 | |
| 10623 | /* End of pcre2_compile.c */ |