Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 1 | /************************************************* |
| 2 | * Perl-Compatible Regular Expressions * |
| 3 | *************************************************/ |
| 4 | |
| 5 | /* PCRE is a library of functions to support regular expressions whose syntax |
| 6 | and semantics are as close as possible to those of the Perl 5 language. |
| 7 | |
| 8 | Written by Philip Hazel |
| 9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 10 | New API code Copyright (c) 2016-2018 University of Cambridge |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 11 | |
| 12 | ----------------------------------------------------------------------------- |
| 13 | Redistribution and use in source and binary forms, with or without |
| 14 | modification, are permitted provided that the following conditions are met: |
| 15 | |
| 16 | * Redistributions of source code must retain the above copyright notice, |
| 17 | this list of conditions and the following disclaimer. |
| 18 | |
| 19 | * Redistributions in binary form must reproduce the above copyright |
| 20 | notice, this list of conditions and the following disclaimer in the |
| 21 | documentation and/or other materials provided with the distribution. |
| 22 | |
| 23 | * Neither the name of the University of Cambridge nor the names of its |
| 24 | contributors may be used to endorse or promote products derived from |
| 25 | this software without specific prior written permission. |
| 26 | |
| 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 37 | POSSIBILITY OF SUCH DAMAGE. |
| 38 | ----------------------------------------------------------------------------- |
| 39 | */ |
| 40 | |
| 41 | |
| 42 | /* This module contains a single function that scans through a compiled pattern |
| 43 | until it finds a capturing bracket with the given number, or, if the number is |
| 44 | negative, an instance of OP_REVERSE for a lookbehind. The function is called |
| 45 | from pcre2_compile.c and also from pcre2_study.c when finding the minimum |
| 46 | matching length. */ |
| 47 | |
| 48 | |
| 49 | #ifdef HAVE_CONFIG_H |
| 50 | #include "config.h" |
| 51 | #endif |
| 52 | |
| 53 | #include "pcre2_internal.h" |
| 54 | |
| 55 | |
| 56 | /************************************************* |
| 57 | * Scan compiled regex for specific bracket * |
| 58 | *************************************************/ |
| 59 | |
| 60 | /* |
| 61 | Arguments: |
| 62 | code points to start of expression |
| 63 | utf TRUE in UTF mode |
| 64 | number the required bracket number or negative to find a lookbehind |
| 65 | |
| 66 | Returns: pointer to the opcode for the bracket, or NULL if not found |
| 67 | */ |
| 68 | |
| 69 | PCRE2_SPTR |
| 70 | PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) |
| 71 | { |
| 72 | for (;;) |
| 73 | { |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 74 | PCRE2_UCHAR c = *code; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 75 | |
| 76 | if (c == OP_END) return NULL; |
| 77 | |
| 78 | /* XCLASS is used for classes that cannot be represented just by a bit map. |
| 79 | This includes negated single high-valued characters. CALLOUT_STR is used for |
| 80 | callouts with string arguments. In both cases the length in the table is |
| 81 | zero; the actual length is stored in the compiled code. */ |
| 82 | |
| 83 | if (c == OP_XCLASS) code += GET(code, 1); |
| 84 | else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); |
| 85 | |
| 86 | /* Handle lookbehind */ |
| 87 | |
| 88 | else if (c == OP_REVERSE) |
| 89 | { |
| 90 | if (number < 0) return (PCRE2_UCHAR *)code; |
| 91 | code += PRIV(OP_lengths)[c]; |
| 92 | } |
| 93 | |
| 94 | /* Handle capturing bracket */ |
| 95 | |
| 96 | else if (c == OP_CBRA || c == OP_SCBRA || |
| 97 | c == OP_CBRAPOS || c == OP_SCBRAPOS) |
| 98 | { |
| 99 | int n = (int)GET2(code, 1+LINK_SIZE); |
| 100 | if (n == number) return (PCRE2_UCHAR *)code; |
| 101 | code += PRIV(OP_lengths)[c]; |
| 102 | } |
| 103 | |
| 104 | /* Otherwise, we can get the item's length from the table, except that for |
| 105 | repeated character types, we have to test for \p and \P, which have an extra |
| 106 | two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we |
| 107 | must add in its length. */ |
| 108 | |
| 109 | else |
| 110 | { |
| 111 | switch(c) |
| 112 | { |
| 113 | case OP_TYPESTAR: |
| 114 | case OP_TYPEMINSTAR: |
| 115 | case OP_TYPEPLUS: |
| 116 | case OP_TYPEMINPLUS: |
| 117 | case OP_TYPEQUERY: |
| 118 | case OP_TYPEMINQUERY: |
| 119 | case OP_TYPEPOSSTAR: |
| 120 | case OP_TYPEPOSPLUS: |
| 121 | case OP_TYPEPOSQUERY: |
| 122 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| 123 | break; |
| 124 | |
| 125 | case OP_TYPEUPTO: |
| 126 | case OP_TYPEMINUPTO: |
| 127 | case OP_TYPEEXACT: |
| 128 | case OP_TYPEPOSUPTO: |
| 129 | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
| 130 | code += 2; |
| 131 | break; |
| 132 | |
| 133 | case OP_MARK: |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 134 | case OP_COMMIT_ARG: |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 135 | case OP_PRUNE_ARG: |
| 136 | case OP_SKIP_ARG: |
| 137 | case OP_THEN_ARG: |
| 138 | code += code[1]; |
| 139 | break; |
| 140 | } |
| 141 | |
| 142 | /* Add in the fixed length from the table */ |
| 143 | |
| 144 | code += PRIV(OP_lengths)[c]; |
| 145 | |
| 146 | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be |
| 147 | followed by a multi-byte character. The length in the table is a minimum, so |
| 148 | we have to arrange to skip the extra bytes. */ |
| 149 | |
| 150 | #ifdef MAYBE_UTF_MULTI |
| 151 | if (utf) switch(c) |
| 152 | { |
| 153 | case OP_CHAR: |
| 154 | case OP_CHARI: |
| 155 | case OP_NOT: |
| 156 | case OP_NOTI: |
| 157 | case OP_EXACT: |
| 158 | case OP_EXACTI: |
| 159 | case OP_NOTEXACT: |
| 160 | case OP_NOTEXACTI: |
| 161 | case OP_UPTO: |
| 162 | case OP_UPTOI: |
| 163 | case OP_NOTUPTO: |
| 164 | case OP_NOTUPTOI: |
| 165 | case OP_MINUPTO: |
| 166 | case OP_MINUPTOI: |
| 167 | case OP_NOTMINUPTO: |
| 168 | case OP_NOTMINUPTOI: |
| 169 | case OP_POSUPTO: |
| 170 | case OP_POSUPTOI: |
| 171 | case OP_NOTPOSUPTO: |
| 172 | case OP_NOTPOSUPTOI: |
| 173 | case OP_STAR: |
| 174 | case OP_STARI: |
| 175 | case OP_NOTSTAR: |
| 176 | case OP_NOTSTARI: |
| 177 | case OP_MINSTAR: |
| 178 | case OP_MINSTARI: |
| 179 | case OP_NOTMINSTAR: |
| 180 | case OP_NOTMINSTARI: |
| 181 | case OP_POSSTAR: |
| 182 | case OP_POSSTARI: |
| 183 | case OP_NOTPOSSTAR: |
| 184 | case OP_NOTPOSSTARI: |
| 185 | case OP_PLUS: |
| 186 | case OP_PLUSI: |
| 187 | case OP_NOTPLUS: |
| 188 | case OP_NOTPLUSI: |
| 189 | case OP_MINPLUS: |
| 190 | case OP_MINPLUSI: |
| 191 | case OP_NOTMINPLUS: |
| 192 | case OP_NOTMINPLUSI: |
| 193 | case OP_POSPLUS: |
| 194 | case OP_POSPLUSI: |
| 195 | case OP_NOTPOSPLUS: |
| 196 | case OP_NOTPOSPLUSI: |
| 197 | case OP_QUERY: |
| 198 | case OP_QUERYI: |
| 199 | case OP_NOTQUERY: |
| 200 | case OP_NOTQUERYI: |
| 201 | case OP_MINQUERY: |
| 202 | case OP_MINQUERYI: |
| 203 | case OP_NOTMINQUERY: |
| 204 | case OP_NOTMINQUERYI: |
| 205 | case OP_POSQUERY: |
| 206 | case OP_POSQUERYI: |
| 207 | case OP_NOTPOSQUERY: |
| 208 | case OP_NOTPOSQUERYI: |
| 209 | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
| 210 | break; |
| 211 | } |
| 212 | #else |
| 213 | (void)(utf); /* Keep compiler happy by referencing function argument */ |
| 214 | #endif /* MAYBE_UTF_MULTI */ |
| 215 | } |
| 216 | } |
| 217 | } |
| 218 | |
| 219 | /* End of pcre2_find_bracket.c */ |