blob: 8b2c369cccd71d15ccc1eb61beea9915106f227d [file] [log] [blame]
Janis Danisevskis112c9cc2016-03-31 13:35:25 +01001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010 New API code Copyright (c) 2016-2021 University of Cambridge
Janis Danisevskis112c9cc2016-03-31 13:35:25 +010011
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42#ifdef HAVE_CONFIG_H
43#include "config.h"
44#endif
45
46#include "pcre2_internal.h"
47
/* Number of slots in the pointer stack used while processing nested
replacement text. Each nesting level pushes two pointers (the resume position
and the saved end of the replacement). */

#define PTR_STACK_SIZE 20

/* All option bits that belong to pcre2_substitute() itself. They are saved
separately and removed from the options that are handed on to pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  ( PCRE2_SUBSTITUTE_EXTENDED         | \
    PCRE2_SUBSTITUTE_GLOBAL           | \
    PCRE2_SUBSTITUTE_LITERAL          | \
    PCRE2_SUBSTITUTE_MATCHED          | \
    PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  | \
    PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | \
    PCRE2_SUBSTITUTE_UNKNOWN_UNSET    | \
    PCRE2_SUBSTITUTE_UNSET_EMPTY )
Janis Danisevskis112c9cc2016-03-31 13:35:25 +010055
56
57
/*************************************************
*           Find end of substitute text          *
*************************************************/

/* In extended mode, we recognize ${name:+set text:unset text} and similar
constructions. This requires the identification of unescaped : and }
characters. This function scans for them. It must deal with nested ${
constructions. The pointer to the text is updated, either to the required end
character, or to where an error was detected.

Arguments:
  code      points to the compiled expression (for options)
  ptrptr    points to the pointer to the start of the text (updated)
  ptrend    end of the whole string
  last      TRUE if this is the last expected string (only } is recognized
              as a terminator)

Returns:    0 on success
            negative error code on failure
*/
77
78static int
79find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80 BOOL last)
81{
82int rc = 0;
83uint32_t nestlevel = 0;
84BOOL literal = FALSE;
85PCRE2_SPTR ptr = *ptrptr;
86
87for (; ptr < ptrend; ptr++)
88 {
89 if (literal)
90 {
91 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92 {
93 literal = FALSE;
94 ptr += 1;
95 }
96 }
97
98 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99 {
100 if (nestlevel == 0) goto EXIT;
101 nestlevel--;
102 }
103
104 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105
106 else if (*ptr == CHAR_DOLLAR_SIGN)
107 {
108 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109 {
110 nestlevel++;
111 ptr += 1;
112 }
113 }
114
115 else if (*ptr == CHAR_BACKSLASH)
116 {
117 int erc;
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700118 int errorcode;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100119 uint32_t ch;
120
121 if (ptr < ptrend - 1) switch (ptr[1])
122 {
123 case CHAR_L:
124 case CHAR_l:
125 case CHAR_U:
126 case CHAR_u:
127 ptr += 1;
128 continue;
129 }
130
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700131 ptr += 1; /* Must point after \ */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100132 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
Elliott Hughes0c26e192019-08-07 12:24:46 -0700133 code->overall_options, code->extra_options, FALSE, NULL);
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700134 ptr -= 1; /* Back to last code unit of escape */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100135 if (errorcode != 0)
136 {
137 rc = errorcode;
138 goto EXIT;
139 }
140
141 switch(erc)
142 {
143 case 0: /* Data character */
144 case ESC_E: /* Isolated \E is ignored */
145 break;
146
147 case ESC_Q:
148 literal = TRUE;
149 break;
150
151 default:
152 rc = PCRE2_ERROR_BADREPESCAPE;
153 goto EXIT;
154 }
155 }
156 }
157
158rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
159
160EXIT:
161*ptrptr = ptr;
162return rc;
163}
164
165
166
/*************************************************
*              Match and substitute              *
*************************************************/

/* This function applies a compiled re to a subject string and creates a new
string with substitutions. The first 7 arguments are the same as for
pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.

Arguments:
  code          points to the compiled expression
  subject       points to the subject string
  length        length of subject string (may contain binary zeros)
  start_offset  where to start in the subject string
  options       option bits
  match_data    points to a match_data block, or is NULL
  context       points to a PCRE2 match context, or is NULL
  replacement   points to the replacement string
  rlength       length of replacement string
  buffer        where to put the substituted string
  blength       points to length of buffer; updated to length of string

Returns:        >= 0 number of substitutions made
                < 0 an error code
                PCRE2_ERROR_BADREPLACEMENT means invalid use of $
*/
192
/* This macro checks for space in the buffer before copying into it. On
overflow, it either gives an error immediately or carries on without copying,
accumulating the extra length that would be needed. */
196
/* CHECKMEMCPY(from, length) appends "length" code units starting at "from" to
the output buffer. It relies on these names being in scope at the point of
use: buffer, buff_offset, lengthleft, overflowed, extra_needed, suboptions,
and the label NOROOM.

  - If there is not enough room and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is not
    set, control jumps to NOROOM.
  - If there is not enough room and the option is set, nothing more is copied
    from then on; extra_needed accumulates the shortfall.
  - Otherwise the data is copied and the offsets are advanced.

The body is wrapped in do { } while (0) so that an invocation followed by a
semicolon is a single statement (safe in if/else), and the arguments are
parenthesized so that expressions with low-precedence operators can be passed.
Note that "length" is evaluated more than once, so it must have no side
effects. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100216
217/* Here's the function */
218
219PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
220pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224{
225int rc;
226int subs;
227int forcecase = 0;
228int forcecasereset = 0;
229uint32_t ovector_count;
230uint32_t goptions = 0;
231uint32_t suboptions;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700232pcre2_match_data *internal_match_data = NULL;
233BOOL escaped_literal = FALSE;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100234BOOL overflowed = FALSE;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700235BOOL use_existing_match;
236BOOL replacement_only;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100237#ifdef SUPPORT_UNICODE
238BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700239BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100240#endif
241PCRE2_UCHAR temp[6];
242PCRE2_SPTR ptr;
243PCRE2_SPTR repend;
244PCRE2_SIZE extra_needed = 0;
245PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246PCRE2_SIZE *ovector;
Elliott Hughes653c2102019-01-09 15:41:36 -0800247PCRE2_SIZE ovecsave[3];
Elliott Hughes0c26e192019-08-07 12:24:46 -0700248pcre2_substitute_callout_block scb;
249
250/* General initialization */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100251
252buff_offset = 0;
253lengthleft = buff_length = *blength;
254*blength = PCRE2_UNSET;
Elliott Hughes653c2102019-01-09 15:41:36 -0800255ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100256
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700257/* Partial matching is not valid. This must come after setting *blength to
Elliott Hughes0c26e192019-08-07 12:24:46 -0700258PCRE2_UNSET, so as not to imply an offset in the replacement. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100259
260if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261 return PCRE2_ERROR_BADOPTION;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700262
263/* Validate length and find the end of the replacement. A NULL replacement of
264zero length is interpreted as an empty string. */
265
266if (replacement == NULL)
267 {
268 if (rlength != 0) return PCRE2_ERROR_NULL;
269 replacement = (PCRE2_SPTR)"";
270 }
271
272if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
273repend = replacement + rlength;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100274
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700275/* Check for using a match that has already happened. Note that the subject
276pointer in the match data may be NULL after a no-match. */
277
278use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
279replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
280
281/* If starting from an existing match, there must be an externally provided
282match data block. We create an internal match_data block in two cases: (a) an
283external one is not supplied (and we are not starting from an existing match);
284(b) an existing match is to be used for the first substitution. In the latter
285case, we copy the existing match into the internal block. This ensures that no
286changes are made to the existing match data block. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100287
288if (match_data == NULL)
289 {
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700290 pcre2_general_context *gcontext;
291 if (use_existing_match) return PCRE2_ERROR_NULL;
292 gcontext = (mcontext == NULL)?
293 (pcre2_general_context *)code :
294 (pcre2_general_context *)mcontext;
295 match_data = internal_match_data =
296 pcre2_match_data_create_from_pattern(code, gcontext);
297 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
298 }
299
300else if (use_existing_match)
301 {
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100302 pcre2_general_context *gcontext = (mcontext == NULL)?
303 (pcre2_general_context *)code :
304 (pcre2_general_context *)mcontext;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700305 int pairs = (code->top_bracket + 1 < match_data->oveccount)?
306 code->top_bracket + 1 : match_data->oveccount;
307 internal_match_data = pcre2_match_data_create(match_data->oveccount,
308 gcontext);
309 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
310 memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
311 + 2*pairs*sizeof(PCRE2_SIZE));
312 match_data = internal_match_data;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100313 }
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700314
315/* Remember ovector details */
316
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100317ovector = pcre2_get_ovector_pointer(match_data);
318ovector_count = pcre2_get_ovector_count(match_data);
319
Elliott Hughes0c26e192019-08-07 12:24:46 -0700320/* Fixed things in the callout block */
321
322scb.version = 0;
323scb.input = subject;
324scb.output = (PCRE2_SPTR)buffer;
325scb.ovector = ovector;
326
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700327/* A NULL subject of zero length is treated as an empty string. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100328
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700329if (subject == NULL)
330 {
331 if (length != 0) return PCRE2_ERROR_NULL;
332 subject = (PCRE2_SPTR)"";
333 }
334
335/* Find length of zero-terminated subject */
336
337if (length == PCRE2_ZERO_TERMINATED)
338 length = subject? PRIV(strlen)(subject) : 0;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100339
340/* Check UTF replacement string if necessary. */
341
342#ifdef SUPPORT_UNICODE
343if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
344 {
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700345 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100346 if (rc != 0)
347 {
348 match_data->leftchar = 0;
349 goto EXIT;
350 }
351 }
352#endif /* SUPPORT_UNICODE */
353
354/* Save the substitute options and remove them from the match options. */
355
356suboptions = options & SUBSTITUTE_OPTIONS;
357options &= ~SUBSTITUTE_OPTIONS;
358
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700359/* Error if the start match offset is greater than the length of the subject. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100360
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700361if (start_offset > length)
362 {
363 match_data->leftchar = 0;
364 rc = PCRE2_ERROR_BADOFFSET;
365 goto EXIT;
366 }
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100367
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700368/* Copy up to the start offset, unless only the replacement is required. */
369
370if (!replacement_only) CHECKMEMCPY(subject, start_offset);
371
372/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
373match is taken from the match_data that was passed in. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100374
375subs = 0;
376do
377 {
378 PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
379 uint32_t ptrstackptr = 0;
380
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700381 if (use_existing_match)
382 {
383 rc = match_data->rc;
384 use_existing_match = FALSE;
385 }
386 else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100387 match_data, mcontext);
388
389#ifdef SUPPORT_UNICODE
390 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
391#endif
392
393 /* Any error other than no match returns the error code. No match when not
394 doing the special after-empty-match global rematch, or when at the end of the
395 subject, breaks the global loop. Otherwise, advance the starting point by one
396 character, copying it to the output, and try again. */
397
398 if (rc < 0)
399 {
400 PCRE2_SIZE save_start;
401
402 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
403 if (goptions == 0 || start_offset >= length) break;
404
405 /* Advance by one code point. Then, if CRLF is a valid newline sequence and
406 we have advanced into the middle of it, advance one more code point. In
407 other words, do not start in the middle of CRLF, even if CR and LF on their
408 own are valid newlines. */
409
410 save_start = start_offset++;
411 if (subject[start_offset-1] == CHAR_CR &&
412 code->newline_convention != PCRE2_NEWLINE_CR &&
413 code->newline_convention != PCRE2_NEWLINE_LF &&
414 start_offset < length &&
415 subject[start_offset] == CHAR_LF)
416 start_offset++;
417
418 /* Otherwise, in UTF mode, advance past any secondary code points. */
419
420 else if ((code->overall_options & PCRE2_UTF) != 0)
421 {
422#if PCRE2_CODE_UNIT_WIDTH == 8
423 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
424 start_offset++;
425#elif PCRE2_CODE_UNIT_WIDTH == 16
426 while (start_offset < length &&
427 (subject[start_offset] & 0xfc00) == 0xdc00)
428 start_offset++;
429#endif
430 }
431
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700432 /* Copy what we have advanced past (unless not required), reset the special
433 global options, and continue to the next match. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100434
435 fraglength = start_offset - save_start;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700436 if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100437 goptions = 0;
438 continue;
439 }
440
441 /* Handle a successful match. Matches that use \K to end before they start
Elliott Hughes653c2102019-01-09 15:41:36 -0800442 or start before the current point in the subject are not supported. */
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700443
Elliott Hughes653c2102019-01-09 15:41:36 -0800444 if (ovector[1] < ovector[0] || ovector[0] < start_offset)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100445 {
446 rc = PCRE2_ERROR_BADSUBSPATTERN;
447 goto EXIT;
448 }
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700449
450 /* Check for the same match as previous. This is legitimate after matching an
Elliott Hughes653c2102019-01-09 15:41:36 -0800451 empty string that starts after the initial match offset. We have tried again
452 at the match point in case the pattern is one like /(?<=\G.)/ which can never
453 match at its starting point, so running the match achieves the bumpalong. If
454 we do get the same (null) match at the original match point, it isn't such a
455 pattern, so we now do the empty string magic. In all other cases, a repeat
456 match should never occur. */
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700457
Elliott Hughes653c2102019-01-09 15:41:36 -0800458 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700459 {
460 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
461 {
462 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
463 ovecsave[2] = start_offset;
464 continue; /* Back to the top of the loop */
Elliott Hughes653c2102019-01-09 15:41:36 -0800465 }
466 rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700467 goto EXIT;
468 }
469
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100470 /* Count substitutions with a paranoid check for integer overflow; surely no
471 real call to this function would ever hit this! */
472
473 if (subs == INT_MAX)
474 {
475 rc = PCRE2_ERROR_TOOMANYREPLACE;
476 goto EXIT;
477 }
478 subs++;
479
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700480 /* Copy the text leading up to the match (unless not required), and remember
481 where the insert begins and how many ovector pairs are set. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100482
483 if (rc == 0) rc = ovector_count;
484 fraglength = ovector[0] - start_offset;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700485 if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
Elliott Hughes0c26e192019-08-07 12:24:46 -0700486 scb.output_offsets[0] = buff_offset;
487 scb.oveccount = rc;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100488
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700489 /* Process the replacement string. If the entire replacement is literal, just
490 copy it with length check. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100491
492 ptr = replacement;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700493 if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
494 {
495 CHECKMEMCPY(ptr, rlength);
496 }
497
498 /* Within a non-literal replacement, which must be scanned character by
499 character, local literal mode can be set by \Q, but only in extended mode
500 when backslashes are being interpreted. In extended mode we must handle
501 nested substrings that are to be reprocessed. */
502
503 else for (;;)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100504 {
505 uint32_t ch;
506 unsigned int chlen;
507
508 /* If at the end of a nested substring, pop the stack. */
509
510 if (ptr >= repend)
511 {
Elliott Hughes0c26e192019-08-07 12:24:46 -0700512 if (ptrstackptr == 0) break; /* End of replacement string */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100513 repend = ptrstack[--ptrstackptr];
514 ptr = ptrstack[--ptrstackptr];
515 continue;
516 }
517
518 /* Handle the next character */
519
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700520 if (escaped_literal)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100521 {
522 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
523 {
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700524 escaped_literal = FALSE;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100525 ptr += 2;
526 continue;
527 }
528 goto LOADLITERAL;
529 }
530
531 /* Not in literal mode. */
532
533 if (*ptr == CHAR_DOLLAR_SIGN)
534 {
535 int group, n;
536 uint32_t special = 0;
537 BOOL inparens;
538 BOOL star;
539 PCRE2_SIZE sublength;
540 PCRE2_SPTR text1_start = NULL;
541 PCRE2_SPTR text1_end = NULL;
542 PCRE2_SPTR text2_start = NULL;
543 PCRE2_SPTR text2_end = NULL;
544 PCRE2_UCHAR next;
545 PCRE2_UCHAR name[33];
546
547 if (++ptr >= repend) goto BAD;
548 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
549
550 group = -1;
551 n = 0;
552 inparens = FALSE;
553 star = FALSE;
554
555 if (next == CHAR_LEFT_CURLY_BRACKET)
556 {
557 if (++ptr >= repend) goto BAD;
558 next = *ptr;
559 inparens = TRUE;
560 }
561
562 if (next == CHAR_ASTERISK)
563 {
564 if (++ptr >= repend) goto BAD;
565 next = *ptr;
566 star = TRUE;
567 }
568
569 if (!star && next >= CHAR_0 && next <= CHAR_9)
570 {
571 group = next - CHAR_0;
572 while (++ptr < repend)
573 {
574 next = *ptr;
575 if (next < CHAR_0 || next > CHAR_9) break;
576 group = group * 10 + next - CHAR_0;
577
578 /* A check for a number greater than the hightest captured group
579 is sufficient here; no need for a separate overflow check. If unknown
580 groups are to be treated as unset, just skip over any remaining
581 digits and carry on. */
582
583 if (group > code->top_bracket)
584 {
585 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
586 {
587 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
588 break;
589 }
590 else
591 {
592 rc = PCRE2_ERROR_NOSUBSTRING;
593 goto PTREXIT;
594 }
595 }
596 }
597 }
598 else
599 {
600 const uint8_t *ctypes = code->tables + ctypes_offset;
601 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
602 {
603 name[n++] = next;
604 if (n > 32) goto BAD;
605 if (++ptr >= repend) break;
606 next = *ptr;
607 }
608 if (n == 0) goto BAD;
609 name[n] = 0;
610 }
611
612 /* In extended mode we recognize ${name:+set text:unset text} and
613 ${name:-default text}. */
614
615 if (inparens)
616 {
617 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
618 !star && ptr < repend - 2 && next == CHAR_COLON)
619 {
620 special = *(++ptr);
621 if (special != CHAR_PLUS && special != CHAR_MINUS)
622 {
623 rc = PCRE2_ERROR_BADSUBSTITUTION;
624 goto PTREXIT;
625 }
626
627 text1_start = ++ptr;
628 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
629 if (rc != 0) goto PTREXIT;
630 text1_end = ptr;
631
632 if (special == CHAR_PLUS && *ptr == CHAR_COLON)
633 {
634 text2_start = ++ptr;
635 rc = find_text_end(code, &ptr, repend, TRUE);
636 if (rc != 0) goto PTREXIT;
637 text2_end = ptr;
638 }
639 }
640
641 else
642 {
643 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
644 {
645 rc = PCRE2_ERROR_REPMISSINGBRACE;
646 goto PTREXIT;
647 }
648 }
649
650 ptr++;
651 }
652
653 /* Have found a syntactically correct group number or name, or *name.
654 Only *MARK is currently recognized. */
655
656 if (star)
657 {
658 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
659 {
660 PCRE2_SPTR mark = pcre2_get_mark(match_data);
661 if (mark != NULL)
662 {
663 PCRE2_SPTR mark_start = mark;
664 while (*mark != 0) mark++;
665 fraglength = mark - mark_start;
666 CHECKMEMCPY(mark_start, fraglength);
667 }
668 }
669 else goto BAD;
670 }
671
672 /* Substitute the contents of a group. We don't use substring_copy
673 functions any more, in order to support case forcing. */
674
675 else
676 {
677 PCRE2_SPTR subptr, subptrend;
678
679 /* Find a number for a named group. In case there are duplicate names,
680 search for the first one that is set. If the name is not found when
681 PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
682 non-existent group. */
683
684 if (group < 0)
685 {
686 PCRE2_SPTR first, last, entry;
687 rc = pcre2_substring_nametable_scan(code, name, &first, &last);
688 if (rc == PCRE2_ERROR_NOSUBSTRING &&
689 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
690 {
691 group = code->top_bracket + 1;
692 }
693 else
694 {
695 if (rc < 0) goto PTREXIT;
696 for (entry = first; entry <= last; entry += rc)
697 {
698 uint32_t ng = GET2(entry, 0);
699 if (ng < ovector_count)
700 {
701 if (group < 0) group = ng; /* First in ovector */
702 if (ovector[ng*2] != PCRE2_UNSET)
703 {
704 group = ng; /* First that is set */
705 break;
706 }
707 }
708 }
709
710 /* If group is still negative, it means we did not find a group
711 that is in the ovector. Just set the first group. */
712
713 if (group < 0) group = GET2(first, 0);
714 }
715 }
716
717 /* We now have a group that is identified by number. Find the length of
718 the captured string. If a group in a non-special substitution is unset
719 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
720
721 rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
722 if (rc < 0)
723 {
724 if (rc == PCRE2_ERROR_NOSUBSTRING &&
725 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
726 {
727 rc = PCRE2_ERROR_UNSET;
728 }
729 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
730 if (special == 0) /* Plain substitution */
731 {
732 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
733 goto PTREXIT; /* Else error */
734 }
735 }
736
737 /* If special is '+' we have a 'set' and possibly an 'unset' text,
738 both of which are reprocessed when used. If special is '-' we have a
739 default text for when the group is unset; it must be reprocessed. */
740
741 if (special != 0)
742 {
743 if (special == CHAR_MINUS)
744 {
745 if (rc == 0) goto LITERAL_SUBSTITUTE;
746 text2_start = text1_start;
747 text2_end = text1_end;
748 }
749
750 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
751 ptrstack[ptrstackptr++] = ptr;
752 ptrstack[ptrstackptr++] = repend;
753
754 if (rc == 0)
755 {
756 ptr = text1_start;
757 repend = text1_end;
758 }
759 else
760 {
761 ptr = text2_start;
762 repend = text2_end;
763 }
764 continue;
765 }
766
767 /* Otherwise we have a literal substitution of a group's contents. */
768
769 LITERAL_SUBSTITUTE:
770 subptr = subject + ovector[group*2];
771 subptrend = subject + ovector[group*2 + 1];
772
773 /* Substitute a literal string, possibly forcing alphabetic case. */
774
775 while (subptr < subptrend)
776 {
777 GETCHARINCTEST(ch, subptr);
778 if (forcecase != 0)
779 {
780#ifdef SUPPORT_UNICODE
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700781 if (utf || ucp)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100782 {
783 uint32_t type = UCD_CHARTYPE(ch);
784 if (PRIV(ucp_gentype)[type] == ucp_L &&
785 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
786 ch = UCD_OTHERCASE(ch);
787 }
788 else
789#endif
790 {
791 if (((code->tables + cbits_offset +
792 ((forcecase > 0)? cbit_upper:cbit_lower)
Elliott Hughes0c26e192019-08-07 12:24:46 -0700793 )[ch/8] & (1u << (ch%8))) == 0)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100794 ch = (code->tables + fcc_offset)[ch];
795 }
796 forcecase = forcecasereset;
797 }
798
799#ifdef SUPPORT_UNICODE
800 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
801#endif
802 {
803 temp[0] = ch;
804 chlen = 1;
805 }
806 CHECKMEMCPY(temp, chlen);
807 }
808 }
809 }
810
811 /* Handle an escape sequence in extended mode. We can use check_escape()
812 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
813 the case-forcing escapes are not supported in pcre2_compile() so must be
814 recognized here. */
815
816 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
817 *ptr == CHAR_BACKSLASH)
818 {
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700819 int errorcode;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100820
821 if (ptr < repend - 1) switch (ptr[1])
822 {
823 case CHAR_L:
824 forcecase = forcecasereset = -1;
825 ptr += 2;
826 continue;
827
828 case CHAR_l:
829 forcecase = -1;
830 forcecasereset = 0;
831 ptr += 2;
832 continue;
833
834 case CHAR_U:
835 forcecase = forcecasereset = 1;
836 ptr += 2;
837 continue;
838
839 case CHAR_u:
840 forcecase = 1;
841 forcecasereset = 0;
842 ptr += 2;
843 continue;
844
845 default:
846 break;
847 }
848
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700849 ptr++; /* Point after \ */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100850 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
Elliott Hughes0c26e192019-08-07 12:24:46 -0700851 code->overall_options, code->extra_options, FALSE, NULL);
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100852 if (errorcode != 0) goto BADESCAPE;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100853
854 switch(rc)
855 {
856 case ESC_E:
857 forcecase = forcecasereset = 0;
858 continue;
859
860 case ESC_Q:
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700861 escaped_literal = TRUE;
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100862 continue;
863
864 case 0: /* Data character */
865 goto LITERAL;
866
867 default:
868 goto BADESCAPE;
869 }
870 }
871
872 /* Handle a literal code unit */
873
874 else
875 {
876 LOADLITERAL:
877 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
878
879 LITERAL:
880 if (forcecase != 0)
881 {
882#ifdef SUPPORT_UNICODE
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700883 if (utf || ucp)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100884 {
885 uint32_t type = UCD_CHARTYPE(ch);
886 if (PRIV(ucp_gentype)[type] == ucp_L &&
887 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
888 ch = UCD_OTHERCASE(ch);
889 }
890 else
891#endif
892 {
893 if (((code->tables + cbits_offset +
894 ((forcecase > 0)? cbit_upper:cbit_lower)
Elliott Hughes0c26e192019-08-07 12:24:46 -0700895 )[ch/8] & (1u << (ch%8))) == 0)
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100896 ch = (code->tables + fcc_offset)[ch];
897 }
898 forcecase = forcecasereset;
899 }
900
901#ifdef SUPPORT_UNICODE
902 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
903#endif
904 {
905 temp[0] = ch;
906 chlen = 1;
907 }
908 CHECKMEMCPY(temp, chlen);
909 } /* End handling a literal code unit */
910 } /* End of loop for scanning the replacement. */
911
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700912 /* The replacement has been copied to the output, or its size has been
913 remembered. Do the callout if there is one and we have done an actual
Elliott Hughes0c26e192019-08-07 12:24:46 -0700914 replacement. */
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700915
Elliott Hughes0c26e192019-08-07 12:24:46 -0700916 if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
917 {
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700918 scb.subscount = subs;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700919 scb.output_offsets[1] = buff_offset;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700920 rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
Elliott Hughes0c26e192019-08-07 12:24:46 -0700921
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700922 /* A non-zero return means cancel this substitution. Instead, copy the
Elliott Hughes0c26e192019-08-07 12:24:46 -0700923 matched string fragment. */
924
925 if (rc != 0)
926 {
927 PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
928 PCRE2_SIZE oldlength = ovector[1] - ovector[0];
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700929
Elliott Hughes0c26e192019-08-07 12:24:46 -0700930 buff_offset -= newlength;
931 lengthleft += newlength;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700932 if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
933
Elliott Hughes0c26e192019-08-07 12:24:46 -0700934 /* A negative return means do not do any more. */
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700935
Elliott Hughes0c26e192019-08-07 12:24:46 -0700936 if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
937 }
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700938 }
939
Elliott Hughes0c26e192019-08-07 12:24:46 -0700940 /* Save the details of this match. See above for how this data is used. If we
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700941 matched an empty string, do the magic for global matches. Update the start
942 offset to point to the rest of the subject string. If we re-used an existing
943 match for the first match, switch to the internal match data block. */
944
945 ovecsave[0] = ovector[0];
946 ovecsave[1] = ovector[1];
Elliott Hughes653c2102019-01-09 15:41:36 -0800947 ovecsave[2] = start_offset;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700948
Elliott Hughes653c2102019-01-09 15:41:36 -0800949 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100950 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
Elliott Hughes653c2102019-01-09 15:41:36 -0800951 start_offset = ovector[1];
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100952 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
953
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700954/* Copy the rest of the subject unless not required, and terminate the output
955with a binary zero. */
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100956
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700957if (!replacement_only)
958 {
959 fraglength = length - start_offset;
960 CHECKMEMCPY(subject + start_offset, fraglength);
961 }
962
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100963temp[0] = 0;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700964CHECKMEMCPY(temp, 1);
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100965
966/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
967and matching has carried on after a full buffer, in order to compute the length
968needed. Otherwise, an overflow generates an immediate error return. */
969
970if (overflowed)
971 {
972 rc = PCRE2_ERROR_NOMEMORY;
973 *blength = buff_length + extra_needed;
974 }
975
976/* After a successful execution, return the number of substitutions and set the
977length of buffer used, excluding the trailing zero. */
978
979else
980 {
981 rc = subs;
982 *blength = buff_offset - 1;
983 }
984
985EXIT:
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700986if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100987 else match_data->rc = rc;
988return rc;
989
990NOROOM:
991rc = PCRE2_ERROR_NOMEMORY;
992goto EXIT;
993
994BAD:
995rc = PCRE2_ERROR_BADREPLACEMENT;
996goto PTREXIT;
997
998BADESCAPE:
999rc = PCRE2_ERROR_BADREPESCAPE;
1000
1001PTREXIT:
1002*blength = (PCRE2_SIZE)(ptr - replacement);
1003goto EXIT;
1004}
1005
1006/* End of pcre2_substitute.c */