Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 1 | /************************************************* |
| 2 | * Perl-Compatible Regular Expressions * |
| 3 | *************************************************/ |
| 4 | |
| 5 | /* PCRE is a library of functions to support regular expressions whose syntax |
| 6 | and semantics are as close as possible to those of the Perl 5 language. |
| 7 | |
| 8 | Written by Philip Hazel |
| 9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 10 | New API code Copyright (c) 2016-2021 University of Cambridge |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 11 | |
| 12 | ----------------------------------------------------------------------------- |
| 13 | Redistribution and use in source and binary forms, with or without |
| 14 | modification, are permitted provided that the following conditions are met: |
| 15 | |
| 16 | * Redistributions of source code must retain the above copyright notice, |
| 17 | this list of conditions and the following disclaimer. |
| 18 | |
| 19 | * Redistributions in binary form must reproduce the above copyright |
| 20 | notice, this list of conditions and the following disclaimer in the |
| 21 | documentation and/or other materials provided with the distribution. |
| 22 | |
| 23 | * Neither the name of the University of Cambridge nor the names of its |
| 24 | contributors may be used to endorse or promote products derived from |
| 25 | this software without specific prior written permission. |
| 26 | |
| 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 37 | POSSIBILITY OF SUCH DAMAGE. |
| 38 | ----------------------------------------------------------------------------- |
| 39 | */ |
| 40 | |
| 41 | |
| 42 | #ifdef HAVE_CONFIG_H |
| 43 | #include "config.h" |
| 44 | #endif |
| 45 | |
| 46 | #include "pcre2_internal.h" |
| 47 | |
/* Number of entries in the pointer stack used while processing a replacement
string. Each nested ${name:+...} / ${name:-...} text that is reprocessed pushes
two pointers (the resume position and the saved end of the replacement), so
this allows PTR_STACK_SIZE/2 levels of nesting. */
| 48 | #define PTR_STACK_SIZE 20 |
| 49 | |
/* Mask of all the option bits that belong to pcre2_substitute() itself. These
bits are saved separately and removed from the options word before it is
passed on to pcre2_match(). */
| 50 | #define SUBSTITUTE_OPTIONS \ |
| 51 | (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 52 | PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \ |
| 53 | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \ |
| 54 | PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 55 | |
| 56 | |
| 57 | |
| 58 | /************************************************* |
| 59 | * Find end of substitute text * |
| 60 | *************************************************/ |
| 61 | |
| 62 | /* In extended mode, we recognize ${name:+set text:unset text} and similar |
| 63 | constructions. This requires the identification of unescaped : and } |
| 64 | characters. This function scans for such. It must deal with nested ${ |
| 65 | constructions. The pointer to the text is updated, either to the required end |
| 66 | character, or to where an error was detected. |
| 67 | |
| 68 | Arguments: |
| 69 | code points to the compiled expression (for options) |
| 70 | ptrptr points to the pointer to the start of the text (updated) |
| 71 | ptrend end of the whole string |
| 72 | last TRUE if the last expected string (only } recognized) |
| 73 | |
| 74 | Returns: 0 on success |
| 75 | negative error code on failure |
| 76 | */ |
| 77 | |
| 78 | static int |
| 79 | find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, |
| 80 | BOOL last) |
| 81 | { |
| 82 | int rc = 0; |
| 83 | uint32_t nestlevel = 0; |
| 84 | BOOL literal = FALSE; |
| 85 | PCRE2_SPTR ptr = *ptrptr; |
| 86 | |
| 87 | for (; ptr < ptrend; ptr++) |
| 88 | { |
| 89 | if (literal) |
| 90 | { |
| 91 | if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) |
| 92 | { |
| 93 | literal = FALSE; |
| 94 | ptr += 1; |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
| 99 | { |
| 100 | if (nestlevel == 0) goto EXIT; |
| 101 | nestlevel--; |
| 102 | } |
| 103 | |
| 104 | else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; |
| 105 | |
| 106 | else if (*ptr == CHAR_DOLLAR_SIGN) |
| 107 | { |
| 108 | if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
| 109 | { |
| 110 | nestlevel++; |
| 111 | ptr += 1; |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | else if (*ptr == CHAR_BACKSLASH) |
| 116 | { |
| 117 | int erc; |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 118 | int errorcode; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 119 | uint32_t ch; |
| 120 | |
| 121 | if (ptr < ptrend - 1) switch (ptr[1]) |
| 122 | { |
| 123 | case CHAR_L: |
| 124 | case CHAR_l: |
| 125 | case CHAR_U: |
| 126 | case CHAR_u: |
| 127 | ptr += 1; |
| 128 | continue; |
| 129 | } |
| 130 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 131 | ptr += 1; /* Must point after \ */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 132 | erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 133 | code->overall_options, code->extra_options, FALSE, NULL); |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 134 | ptr -= 1; /* Back to last code unit of escape */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 135 | if (errorcode != 0) |
| 136 | { |
| 137 | rc = errorcode; |
| 138 | goto EXIT; |
| 139 | } |
| 140 | |
| 141 | switch(erc) |
| 142 | { |
| 143 | case 0: /* Data character */ |
| 144 | case ESC_E: /* Isolated \E is ignored */ |
| 145 | break; |
| 146 | |
| 147 | case ESC_Q: |
| 148 | literal = TRUE; |
| 149 | break; |
| 150 | |
| 151 | default: |
| 152 | rc = PCRE2_ERROR_BADREPESCAPE; |
| 153 | goto EXIT; |
| 154 | } |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ |
| 159 | |
| 160 | EXIT: |
| 161 | *ptrptr = ptr; |
| 162 | return rc; |
| 163 | } |
| 164 | |
| 165 | |
| 166 | |
| 167 | /************************************************* |
| 168 | * Match and substitute * |
| 169 | *************************************************/ |
| 170 | |
| 171 | /* This function applies a compiled re to a subject string and creates a new |
| 172 | string with substitutions. The first 7 arguments are the same as for |
| 173 | pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. |
| 174 | |
| 175 | Arguments: |
| 176 | code points to the compiled expression |
| 177 | subject points to the subject string |
| 178 | length length of subject string (may contain binary zeros) |
| 179 | start_offset where to start in the subject string |
| 180 | options option bits |
| 181 | match_data points to a match_data block, or is NULL |
| 182 | context points to a PCRE2 context |
| 183 | replacement points to the replacement string |
| 184 | rlength length of replacement string |
| 185 | buffer where to put the substituted string |
| 186 | blength points to length of buffer; updated to length of string |
| 187 | |
| 188 | Returns: >= 0 number of substitutions made |
| 189 | < 0 an error code |
| 190 | PCRE2_ERROR_BADREPLACEMENT means invalid use of $ |
| 191 | */ |
| 192 | |
/* This macro checks for space in the buffer before copying into it. On
overflow, either give an error immediately (by jumping to NOROOM), or, if
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set in suboptions, keep on without copying,
accumulating in extra_needed the additional length that would be required.

The macro is not self-contained: it uses buffer, buff_offset, lengthleft,
overflowed, extra_needed and suboptions, and the label NOROOM, all of which
must be in scope where it is invoked.

Arguments:
  from      where to copy from
  length    number of code units to copy

Both arguments are parenthesized wherever they are used, so any expression may
be given (for example a conditional expression). Note, however, that "length"
is evaluated more than once, so it must not have side effects. The expansion is
a braced block, so it can be used as the body of an "if" with or without a
following semicolon. */

#define CHECKMEMCPY(from,length) \
  { \
  if (!overflowed && lengthleft < (length)) \
    { \
    if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
    overflowed = TRUE; \
    extra_needed = (length) - lengthleft; \
    } \
  else if (overflowed) \
    { \
    extra_needed += (length); \
    } \
  else \
    { \
    memcpy(buffer + buff_offset, (from), CU2BYTES((length))); \
    buff_offset += (length); \
    lengthleft -= (length); \
    } \
  }
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 216 | |
| 217 | /* Here's the function */ |
| 218 | |
| 219 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
| 220 | pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
| 221 | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
| 222 | pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, |
| 223 | PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) |
| 224 | { |
| 225 | int rc; |
| 226 | int subs; |
| 227 | int forcecase = 0; |
| 228 | int forcecasereset = 0; |
| 229 | uint32_t ovector_count; |
| 230 | uint32_t goptions = 0; |
| 231 | uint32_t suboptions; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 232 | pcre2_match_data *internal_match_data = NULL; |
| 233 | BOOL escaped_literal = FALSE; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 234 | BOOL overflowed = FALSE; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 235 | BOOL use_existing_match; |
| 236 | BOOL replacement_only; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 237 | #ifdef SUPPORT_UNICODE |
| 238 | BOOL utf = (code->overall_options & PCRE2_UTF) != 0; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 239 | BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 240 | #endif |
| 241 | PCRE2_UCHAR temp[6]; |
| 242 | PCRE2_SPTR ptr; |
| 243 | PCRE2_SPTR repend; |
| 244 | PCRE2_SIZE extra_needed = 0; |
| 245 | PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; |
| 246 | PCRE2_SIZE *ovector; |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 247 | PCRE2_SIZE ovecsave[3]; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 248 | pcre2_substitute_callout_block scb; |
| 249 | |
| 250 | /* General initialization */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 251 | |
| 252 | buff_offset = 0; |
| 253 | lengthleft = buff_length = *blength; |
| 254 | *blength = PCRE2_UNSET; |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 255 | ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 256 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 257 | /* Partial matching is not valid. This must come after setting *blength to |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 258 | PCRE2_UNSET, so as not to imply an offset in the replacement. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 259 | |
| 260 | if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) |
| 261 | return PCRE2_ERROR_BADOPTION; |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 262 | |
| 263 | /* Validate length and find the end of the replacement. A NULL replacement of |
| 264 | zero length is interpreted as an empty string. */ |
| 265 | |
| 266 | if (replacement == NULL) |
| 267 | { |
| 268 | if (rlength != 0) return PCRE2_ERROR_NULL; |
| 269 | replacement = (PCRE2_SPTR)""; |
| 270 | } |
| 271 | |
| 272 | if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); |
| 273 | repend = replacement + rlength; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 274 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 275 | /* Check for using a match that has already happened. Note that the subject |
| 276 | pointer in the match data may be NULL after a no-match. */ |
| 277 | |
| 278 | use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); |
| 279 | replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); |
| 280 | |
| 281 | /* If starting from an existing match, there must be an externally provided |
| 282 | match data block. We create an internal match_data block in two cases: (a) an |
| 283 | external one is not supplied (and we are not starting from an existing match); |
| 284 | (b) an existing match is to be used for the first substitution. In the latter |
| 285 | case, we copy the existing match into the internal block. This ensures that no |
| 286 | changes are made to the existing match data block. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 287 | |
| 288 | if (match_data == NULL) |
| 289 | { |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 290 | pcre2_general_context *gcontext; |
| 291 | if (use_existing_match) return PCRE2_ERROR_NULL; |
| 292 | gcontext = (mcontext == NULL)? |
| 293 | (pcre2_general_context *)code : |
| 294 | (pcre2_general_context *)mcontext; |
| 295 | match_data = internal_match_data = |
| 296 | pcre2_match_data_create_from_pattern(code, gcontext); |
| 297 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
| 298 | } |
| 299 | |
| 300 | else if (use_existing_match) |
| 301 | { |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 302 | pcre2_general_context *gcontext = (mcontext == NULL)? |
| 303 | (pcre2_general_context *)code : |
| 304 | (pcre2_general_context *)mcontext; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 305 | int pairs = (code->top_bracket + 1 < match_data->oveccount)? |
| 306 | code->top_bracket + 1 : match_data->oveccount; |
| 307 | internal_match_data = pcre2_match_data_create(match_data->oveccount, |
| 308 | gcontext); |
| 309 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
| 310 | memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) |
| 311 | + 2*pairs*sizeof(PCRE2_SIZE)); |
| 312 | match_data = internal_match_data; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 313 | } |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 314 | |
| 315 | /* Remember ovector details */ |
| 316 | |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 317 | ovector = pcre2_get_ovector_pointer(match_data); |
| 318 | ovector_count = pcre2_get_ovector_count(match_data); |
| 319 | |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 320 | /* Fixed things in the callout block */ |
| 321 | |
| 322 | scb.version = 0; |
| 323 | scb.input = subject; |
| 324 | scb.output = (PCRE2_SPTR)buffer; |
| 325 | scb.ovector = ovector; |
| 326 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 327 | /* A NULL subject of zero length is treated as an empty string. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 328 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 329 | if (subject == NULL) |
| 330 | { |
| 331 | if (length != 0) return PCRE2_ERROR_NULL; |
| 332 | subject = (PCRE2_SPTR)""; |
| 333 | } |
| 334 | |
| 335 | /* Find length of zero-terminated subject */ |
| 336 | |
| 337 | if (length == PCRE2_ZERO_TERMINATED) |
| 338 | length = subject? PRIV(strlen)(subject) : 0; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 339 | |
| 340 | /* Check UTF replacement string if necessary. */ |
| 341 | |
| 342 | #ifdef SUPPORT_UNICODE |
| 343 | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) |
| 344 | { |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 345 | rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 346 | if (rc != 0) |
| 347 | { |
| 348 | match_data->leftchar = 0; |
| 349 | goto EXIT; |
| 350 | } |
| 351 | } |
| 352 | #endif /* SUPPORT_UNICODE */ |
| 353 | |
| 354 | /* Save the substitute options and remove them from the match options. */ |
| 355 | |
| 356 | suboptions = options & SUBSTITUTE_OPTIONS; |
| 357 | options &= ~SUBSTITUTE_OPTIONS; |
| 358 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 359 | /* Error if the start match offset is greater than the length of the subject. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 360 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 361 | if (start_offset > length) |
| 362 | { |
| 363 | match_data->leftchar = 0; |
| 364 | rc = PCRE2_ERROR_BADOFFSET; |
| 365 | goto EXIT; |
| 366 | } |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 367 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 368 | /* Copy up to the start offset, unless only the replacement is required. */ |
| 369 | |
| 370 | if (!replacement_only) CHECKMEMCPY(subject, start_offset); |
| 371 | |
| 372 | /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first |
| 373 | match is taken from the match_data that was passed in. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 374 | |
| 375 | subs = 0; |
| 376 | do |
| 377 | { |
| 378 | PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; |
| 379 | uint32_t ptrstackptr = 0; |
| 380 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 381 | if (use_existing_match) |
| 382 | { |
| 383 | rc = match_data->rc; |
| 384 | use_existing_match = FALSE; |
| 385 | } |
| 386 | else rc = pcre2_match(code, subject, length, start_offset, options|goptions, |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 387 | match_data, mcontext); |
| 388 | |
| 389 | #ifdef SUPPORT_UNICODE |
| 390 | if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ |
| 391 | #endif |
| 392 | |
| 393 | /* Any error other than no match returns the error code. No match when not |
| 394 | doing the special after-empty-match global rematch, or when at the end of the |
| 395 | subject, breaks the global loop. Otherwise, advance the starting point by one |
| 396 | character, copying it to the output, and try again. */ |
| 397 | |
| 398 | if (rc < 0) |
| 399 | { |
| 400 | PCRE2_SIZE save_start; |
| 401 | |
| 402 | if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; |
| 403 | if (goptions == 0 || start_offset >= length) break; |
| 404 | |
| 405 | /* Advance by one code point. Then, if CRLF is a valid newline sequence and |
| 406 | we have advanced into the middle of it, advance one more code point. In |
| 407 | other words, do not start in the middle of CRLF, even if CR and LF on their |
| 408 | own are valid newlines. */ |
| 409 | |
| 410 | save_start = start_offset++; |
| 411 | if (subject[start_offset-1] == CHAR_CR && |
| 412 | code->newline_convention != PCRE2_NEWLINE_CR && |
| 413 | code->newline_convention != PCRE2_NEWLINE_LF && |
| 414 | start_offset < length && |
| 415 | subject[start_offset] == CHAR_LF) |
| 416 | start_offset++; |
| 417 | |
| 418 | /* Otherwise, in UTF mode, advance past any secondary code points. */ |
| 419 | |
| 420 | else if ((code->overall_options & PCRE2_UTF) != 0) |
| 421 | { |
| 422 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 423 | while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) |
| 424 | start_offset++; |
| 425 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| 426 | while (start_offset < length && |
| 427 | (subject[start_offset] & 0xfc00) == 0xdc00) |
| 428 | start_offset++; |
| 429 | #endif |
| 430 | } |
| 431 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 432 | /* Copy what we have advanced past (unless not required), reset the special |
| 433 | global options, and continue to the next match. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 434 | |
| 435 | fraglength = start_offset - save_start; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 436 | if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 437 | goptions = 0; |
| 438 | continue; |
| 439 | } |
| 440 | |
| 441 | /* Handle a successful match. Matches that use \K to end before they start |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 442 | or start before the current point in the subject are not supported. */ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 443 | |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 444 | if (ovector[1] < ovector[0] || ovector[0] < start_offset) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 445 | { |
| 446 | rc = PCRE2_ERROR_BADSUBSPATTERN; |
| 447 | goto EXIT; |
| 448 | } |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 449 | |
| 450 | /* Check for the same match as previous. This is legitimate after matching an |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 451 | empty string that starts after the initial match offset. We have tried again |
| 452 | at the match point in case the pattern is one like /(?<=\G.)/ which can never |
| 453 | match at its starting point, so running the match achieves the bumpalong. If |
| 454 | we do get the same (null) match at the original match point, it isn't such a |
| 455 | pattern, so we now do the empty string magic. In all other cases, a repeat |
| 456 | match should never occur. */ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 457 | |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 458 | if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 459 | { |
| 460 | if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) |
| 461 | { |
| 462 | goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; |
| 463 | ovecsave[2] = start_offset; |
| 464 | continue; /* Back to the top of the loop */ |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 465 | } |
| 466 | rc = PCRE2_ERROR_INTERNAL_DUPMATCH; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 467 | goto EXIT; |
| 468 | } |
| 469 | |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 470 | /* Count substitutions with a paranoid check for integer overflow; surely no |
| 471 | real call to this function would ever hit this! */ |
| 472 | |
| 473 | if (subs == INT_MAX) |
| 474 | { |
| 475 | rc = PCRE2_ERROR_TOOMANYREPLACE; |
| 476 | goto EXIT; |
| 477 | } |
| 478 | subs++; |
| 479 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 480 | /* Copy the text leading up to the match (unless not required), and remember |
| 481 | where the insert begins and how many ovector pairs are set. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 482 | |
| 483 | if (rc == 0) rc = ovector_count; |
| 484 | fraglength = ovector[0] - start_offset; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 485 | if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 486 | scb.output_offsets[0] = buff_offset; |
| 487 | scb.oveccount = rc; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 488 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 489 | /* Process the replacement string. If the entire replacement is literal, just |
| 490 | copy it with length check. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 491 | |
| 492 | ptr = replacement; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 493 | if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) |
| 494 | { |
| 495 | CHECKMEMCPY(ptr, rlength); |
| 496 | } |
| 497 | |
| 498 | /* Within a non-literal replacement, which must be scanned character by |
| 499 | character, local literal mode can be set by \Q, but only in extended mode |
| 500 | when backslashes are being interpreted. In extended mode we must handle |
| 501 | nested substrings that are to be reprocessed. */ |
| 502 | |
| 503 | else for (;;) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 504 | { |
| 505 | uint32_t ch; |
| 506 | unsigned int chlen; |
| 507 | |
| 508 | /* If at the end of a nested substring, pop the stack. */ |
| 509 | |
| 510 | if (ptr >= repend) |
| 511 | { |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 512 | if (ptrstackptr == 0) break; /* End of replacement string */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 513 | repend = ptrstack[--ptrstackptr]; |
| 514 | ptr = ptrstack[--ptrstackptr]; |
| 515 | continue; |
| 516 | } |
| 517 | |
| 518 | /* Handle the next character */ |
| 519 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 520 | if (escaped_literal) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 521 | { |
| 522 | if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) |
| 523 | { |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 524 | escaped_literal = FALSE; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 525 | ptr += 2; |
| 526 | continue; |
| 527 | } |
| 528 | goto LOADLITERAL; |
| 529 | } |
| 530 | |
| 531 | /* Not in literal mode. */ |
| 532 | |
| 533 | if (*ptr == CHAR_DOLLAR_SIGN) |
| 534 | { |
| 535 | int group, n; |
| 536 | uint32_t special = 0; |
| 537 | BOOL inparens; |
| 538 | BOOL star; |
| 539 | PCRE2_SIZE sublength; |
| 540 | PCRE2_SPTR text1_start = NULL; |
| 541 | PCRE2_SPTR text1_end = NULL; |
| 542 | PCRE2_SPTR text2_start = NULL; |
| 543 | PCRE2_SPTR text2_end = NULL; |
| 544 | PCRE2_UCHAR next; |
| 545 | PCRE2_UCHAR name[33]; |
| 546 | |
| 547 | if (++ptr >= repend) goto BAD; |
| 548 | if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; |
| 549 | |
| 550 | group = -1; |
| 551 | n = 0; |
| 552 | inparens = FALSE; |
| 553 | star = FALSE; |
| 554 | |
| 555 | if (next == CHAR_LEFT_CURLY_BRACKET) |
| 556 | { |
| 557 | if (++ptr >= repend) goto BAD; |
| 558 | next = *ptr; |
| 559 | inparens = TRUE; |
| 560 | } |
| 561 | |
| 562 | if (next == CHAR_ASTERISK) |
| 563 | { |
| 564 | if (++ptr >= repend) goto BAD; |
| 565 | next = *ptr; |
| 566 | star = TRUE; |
| 567 | } |
| 568 | |
| 569 | if (!star && next >= CHAR_0 && next <= CHAR_9) |
| 570 | { |
| 571 | group = next - CHAR_0; |
| 572 | while (++ptr < repend) |
| 573 | { |
| 574 | next = *ptr; |
| 575 | if (next < CHAR_0 || next > CHAR_9) break; |
| 576 | group = group * 10 + next - CHAR_0; |
| 577 | |
| 578 | /* A check for a number greater than the highest captured group |
| 579 | is sufficient here; no need for a separate overflow check. If unknown |
| 580 | groups are to be treated as unset, just skip over any remaining |
| 581 | digits and carry on. */ |
| 582 | |
| 583 | if (group > code->top_bracket) |
| 584 | { |
| 585 | if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
| 586 | { |
| 587 | while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); |
| 588 | break; |
| 589 | } |
| 590 | else |
| 591 | { |
| 592 | rc = PCRE2_ERROR_NOSUBSTRING; |
| 593 | goto PTREXIT; |
| 594 | } |
| 595 | } |
| 596 | } |
| 597 | } |
| 598 | else |
| 599 | { |
| 600 | const uint8_t *ctypes = code->tables + ctypes_offset; |
| 601 | while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) |
| 602 | { |
| 603 | name[n++] = next; |
| 604 | if (n > 32) goto BAD; |
| 605 | if (++ptr >= repend) break; |
| 606 | next = *ptr; |
| 607 | } |
| 608 | if (n == 0) goto BAD; |
| 609 | name[n] = 0; |
| 610 | } |
| 611 | |
| 612 | /* In extended mode we recognize ${name:+set text:unset text} and |
| 613 | ${name:-default text}. */ |
| 614 | |
| 615 | if (inparens) |
| 616 | { |
| 617 | if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
| 618 | !star && ptr < repend - 2 && next == CHAR_COLON) |
| 619 | { |
| 620 | special = *(++ptr); |
| 621 | if (special != CHAR_PLUS && special != CHAR_MINUS) |
| 622 | { |
| 623 | rc = PCRE2_ERROR_BADSUBSTITUTION; |
| 624 | goto PTREXIT; |
| 625 | } |
| 626 | |
| 627 | text1_start = ++ptr; |
| 628 | rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); |
| 629 | if (rc != 0) goto PTREXIT; |
| 630 | text1_end = ptr; |
| 631 | |
| 632 | if (special == CHAR_PLUS && *ptr == CHAR_COLON) |
| 633 | { |
| 634 | text2_start = ++ptr; |
| 635 | rc = find_text_end(code, &ptr, repend, TRUE); |
| 636 | if (rc != 0) goto PTREXIT; |
| 637 | text2_end = ptr; |
| 638 | } |
| 639 | } |
| 640 | |
| 641 | else |
| 642 | { |
| 643 | if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) |
| 644 | { |
| 645 | rc = PCRE2_ERROR_REPMISSINGBRACE; |
| 646 | goto PTREXIT; |
| 647 | } |
| 648 | } |
| 649 | |
| 650 | ptr++; |
| 651 | } |
| 652 | |
| 653 | /* Have found a syntactically correct group number or name, or *name. |
| 654 | Only *MARK is currently recognized. */ |
| 655 | |
| 656 | if (star) |
| 657 | { |
| 658 | if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) |
| 659 | { |
| 660 | PCRE2_SPTR mark = pcre2_get_mark(match_data); |
| 661 | if (mark != NULL) |
| 662 | { |
| 663 | PCRE2_SPTR mark_start = mark; |
| 664 | while (*mark != 0) mark++; |
| 665 | fraglength = mark - mark_start; |
| 666 | CHECKMEMCPY(mark_start, fraglength); |
| 667 | } |
| 668 | } |
| 669 | else goto BAD; |
| 670 | } |
| 671 | |
| 672 | /* Substitute the contents of a group. We don't use substring_copy |
| 673 | functions any more, in order to support case forcing. */ |
| 674 | |
| 675 | else |
| 676 | { |
| 677 | PCRE2_SPTR subptr, subptrend; |
| 678 | |
| 679 | /* Find a number for a named group. In case there are duplicate names, |
| 680 | search for the first one that is set. If the name is not found when |
| 681 | PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a |
| 682 | non-existent group. */ |
| 683 | |
| 684 | if (group < 0) |
| 685 | { |
| 686 | PCRE2_SPTR first, last, entry; |
| 687 | rc = pcre2_substring_nametable_scan(code, name, &first, &last); |
| 688 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
| 689 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
| 690 | { |
| 691 | group = code->top_bracket + 1; |
| 692 | } |
| 693 | else |
| 694 | { |
| 695 | if (rc < 0) goto PTREXIT; |
| 696 | for (entry = first; entry <= last; entry += rc) |
| 697 | { |
| 698 | uint32_t ng = GET2(entry, 0); |
| 699 | if (ng < ovector_count) |
| 700 | { |
| 701 | if (group < 0) group = ng; /* First in ovector */ |
| 702 | if (ovector[ng*2] != PCRE2_UNSET) |
| 703 | { |
| 704 | group = ng; /* First that is set */ |
| 705 | break; |
| 706 | } |
| 707 | } |
| 708 | } |
| 709 | |
| 710 | /* If group is still negative, it means we did not find a group |
| 711 | that is in the ovector. Just set the first group. */ |
| 712 | |
| 713 | if (group < 0) group = GET2(first, 0); |
| 714 | } |
| 715 | } |
| 716 | |
| 717 | /* We now have a group that is identified by number. Find the length of |
| 718 | the captured string. If a group in a non-special substitution is unset |
| 719 | when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ |
| 720 | |
| 721 | rc = pcre2_substring_length_bynumber(match_data, group, &sublength); |
| 722 | if (rc < 0) |
| 723 | { |
| 724 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
| 725 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
| 726 | { |
| 727 | rc = PCRE2_ERROR_UNSET; |
| 728 | } |
| 729 | if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ |
| 730 | if (special == 0) /* Plain substitution */ |
| 731 | { |
| 732 | if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; |
| 733 | goto PTREXIT; /* Else error */ |
| 734 | } |
| 735 | } |
| 736 | |
| 737 | /* If special is '+' we have a 'set' and possibly an 'unset' text, |
| 738 | both of which are reprocessed when used. If special is '-' we have a |
| 739 | default text for when the group is unset; it must be reprocessed. */ |
| 740 | |
| 741 | if (special != 0) |
| 742 | { |
| 743 | if (special == CHAR_MINUS) |
| 744 | { |
| 745 | if (rc == 0) goto LITERAL_SUBSTITUTE; |
| 746 | text2_start = text1_start; |
| 747 | text2_end = text1_end; |
| 748 | } |
| 749 | |
| 750 | if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; |
| 751 | ptrstack[ptrstackptr++] = ptr; |
| 752 | ptrstack[ptrstackptr++] = repend; |
| 753 | |
| 754 | if (rc == 0) |
| 755 | { |
| 756 | ptr = text1_start; |
| 757 | repend = text1_end; |
| 758 | } |
| 759 | else |
| 760 | { |
| 761 | ptr = text2_start; |
| 762 | repend = text2_end; |
| 763 | } |
| 764 | continue; |
| 765 | } |
| 766 | |
| 767 | /* Otherwise we have a literal substitution of a group's contents. */ |
| 768 | |
| 769 | LITERAL_SUBSTITUTE: |
| 770 | subptr = subject + ovector[group*2]; |
| 771 | subptrend = subject + ovector[group*2 + 1]; |
| 772 | |
| 773 | /* Substitute a literal string, possibly forcing alphabetic case. */ |
| 774 | |
| 775 | while (subptr < subptrend) |
| 776 | { |
| 777 | GETCHARINCTEST(ch, subptr); |
| 778 | if (forcecase != 0) |
| 779 | { |
| 780 | #ifdef SUPPORT_UNICODE |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 781 | if (utf || ucp) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 782 | { |
| 783 | uint32_t type = UCD_CHARTYPE(ch); |
| 784 | if (PRIV(ucp_gentype)[type] == ucp_L && |
| 785 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
| 786 | ch = UCD_OTHERCASE(ch); |
| 787 | } |
| 788 | else |
| 789 | #endif |
| 790 | { |
| 791 | if (((code->tables + cbits_offset + |
| 792 | ((forcecase > 0)? cbit_upper:cbit_lower) |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 793 | )[ch/8] & (1u << (ch%8))) == 0) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 794 | ch = (code->tables + fcc_offset)[ch]; |
| 795 | } |
| 796 | forcecase = forcecasereset; |
| 797 | } |
| 798 | |
| 799 | #ifdef SUPPORT_UNICODE |
| 800 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
| 801 | #endif |
| 802 | { |
| 803 | temp[0] = ch; |
| 804 | chlen = 1; |
| 805 | } |
| 806 | CHECKMEMCPY(temp, chlen); |
| 807 | } |
| 808 | } |
| 809 | } |
| 810 | |
| 811 | /* Handle an escape sequence in extended mode. We can use check_escape() |
| 812 | to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but |
| 813 | the case-forcing escapes are not supported in pcre2_compile() so must be |
| 814 | recognized here. */ |
| 815 | |
| 816 | else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
| 817 | *ptr == CHAR_BACKSLASH) |
| 818 | { |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 819 | int errorcode; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 820 | |
| 821 | if (ptr < repend - 1) switch (ptr[1]) |
| 822 | { |
| 823 | case CHAR_L: |
| 824 | forcecase = forcecasereset = -1; |
| 825 | ptr += 2; |
| 826 | continue; |
| 827 | |
| 828 | case CHAR_l: |
| 829 | forcecase = -1; |
| 830 | forcecasereset = 0; |
| 831 | ptr += 2; |
| 832 | continue; |
| 833 | |
| 834 | case CHAR_U: |
| 835 | forcecase = forcecasereset = 1; |
| 836 | ptr += 2; |
| 837 | continue; |
| 838 | |
| 839 | case CHAR_u: |
| 840 | forcecase = 1; |
| 841 | forcecasereset = 0; |
| 842 | ptr += 2; |
| 843 | continue; |
| 844 | |
| 845 | default: |
| 846 | break; |
| 847 | } |
| 848 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 849 | ptr++; /* Point after \ */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 850 | rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 851 | code->overall_options, code->extra_options, FALSE, NULL); |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 852 | if (errorcode != 0) goto BADESCAPE; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 853 | |
| 854 | switch(rc) |
| 855 | { |
| 856 | case ESC_E: |
| 857 | forcecase = forcecasereset = 0; |
| 858 | continue; |
| 859 | |
| 860 | case ESC_Q: |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 861 | escaped_literal = TRUE; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 862 | continue; |
| 863 | |
| 864 | case 0: /* Data character */ |
| 865 | goto LITERAL; |
| 866 | |
| 867 | default: |
| 868 | goto BADESCAPE; |
| 869 | } |
| 870 | } |
| 871 | |
| 872 | /* Handle a literal code unit */ |
| 873 | |
| 874 | else |
| 875 | { |
| 876 | LOADLITERAL: |
| 877 | GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ |
| 878 | |
| 879 | LITERAL: |
| 880 | if (forcecase != 0) |
| 881 | { |
| 882 | #ifdef SUPPORT_UNICODE |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 883 | if (utf || ucp) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 884 | { |
| 885 | uint32_t type = UCD_CHARTYPE(ch); |
| 886 | if (PRIV(ucp_gentype)[type] == ucp_L && |
| 887 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
| 888 | ch = UCD_OTHERCASE(ch); |
| 889 | } |
| 890 | else |
| 891 | #endif |
| 892 | { |
| 893 | if (((code->tables + cbits_offset + |
| 894 | ((forcecase > 0)? cbit_upper:cbit_lower) |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 895 | )[ch/8] & (1u << (ch%8))) == 0) |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 896 | ch = (code->tables + fcc_offset)[ch]; |
| 897 | } |
| 898 | forcecase = forcecasereset; |
| 899 | } |
| 900 | |
| 901 | #ifdef SUPPORT_UNICODE |
| 902 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
| 903 | #endif |
| 904 | { |
| 905 | temp[0] = ch; |
| 906 | chlen = 1; |
| 907 | } |
| 908 | CHECKMEMCPY(temp, chlen); |
| 909 | } /* End handling a literal code unit */ |
| 910 | } /* End of loop for scanning the replacement. */ |
| 911 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 912 | /* The replacement has been copied to the output, or its size has been |
| 913 | remembered. Do the callout if there is one and we have done an actual |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 914 | replacement. */ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 915 | |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 916 | if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) |
| 917 | { |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 918 | scb.subscount = subs; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 919 | scb.output_offsets[1] = buff_offset; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 920 | rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 921 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 922 | /* A non-zero return means cancel this substitution. Instead, copy the |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 923 | matched string fragment. */ |
| 924 | |
| 925 | if (rc != 0) |
| 926 | { |
| 927 | PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; |
| 928 | PCRE2_SIZE oldlength = ovector[1] - ovector[0]; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 929 | |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 930 | buff_offset -= newlength; |
| 931 | lengthleft += newlength; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 932 | if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); |
| 933 | |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 934 | /* A negative return means do not do any more. */ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 935 | |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 936 | if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); |
| 937 | } |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 938 | } |
| 939 | |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 940 | /* Save the details of this match. See above for how this data is used. If we |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 941 | matched an empty string, do the magic for global matches. Update the start |
| 942 | offset to point to the rest of the subject string. If we re-used an existing |
| 943 | match for the first match, switch to the internal match data block. */ |
| 944 | |
| 945 | ovecsave[0] = ovector[0]; |
| 946 | ovecsave[1] = ovector[1]; |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 947 | ovecsave[2] = start_offset; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 948 | |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 949 | goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 950 | PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 951 | start_offset = ovector[1]; |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 952 | } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ |
| 953 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 954 | /* Copy the rest of the subject unless not required, and terminate the output |
| 955 | with a binary zero. */ |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 956 | |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 957 | if (!replacement_only) |
| 958 | { |
| 959 | fraglength = length - start_offset; |
| 960 | CHECKMEMCPY(subject + start_offset, fraglength); |
| 961 | } |
| 962 | |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 963 | temp[0] = 0; |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 964 | CHECKMEMCPY(temp, 1); |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 965 | |
| 966 | /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, |
| 967 | and matching has carried on after a full buffer, in order to compute the length |
| 968 | needed. Otherwise, an overflow generates an immediate error return. */ |
| 969 | |
| 970 | if (overflowed) |
| 971 | { |
| 972 | rc = PCRE2_ERROR_NOMEMORY; |
| 973 | *blength = buff_length + extra_needed; |
| 974 | } |
| 975 | |
| 976 | /* After a successful execution, return the number of substitutions and set the |
| 977 | length of buffer used, excluding the trailing zero. */ |
| 978 | |
| 979 | else |
| 980 | { |
| 981 | rc = subs; |
| 982 | *blength = buff_offset - 1; |
| 983 | } |
| 984 | |
| 985 | EXIT: |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 986 | if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); |
Janis Danisevskis | 112c9cc | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 987 | else match_data->rc = rc; |
| 988 | return rc; |
| 989 | |
| 990 | NOROOM: |
| 991 | rc = PCRE2_ERROR_NOMEMORY; |
| 992 | goto EXIT; |
| 993 | |
| 994 | BAD: |
| 995 | rc = PCRE2_ERROR_BADREPLACEMENT; |
| 996 | goto PTREXIT; |
| 997 | |
| 998 | BADESCAPE: |
| 999 | rc = PCRE2_ERROR_BADREPESCAPE; |
| 1000 | |
| 1001 | PTREXIT: |
| 1002 | *blength = (PCRE2_SIZE)(ptr - replacement); |
| 1003 | goto EXIT; |
| 1004 | } |
| 1005 | |
| 1006 | /* End of pcre2_substitute.c */ |