blob: 6354e1bb9e2bc999331f66377f8ca2cb2a623e71 [file] [log] [blame]
Elliott Hughes5b808042021-10-01 10:56:10 -07001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010 New API code Copyright (c) 2015-2022 University of Cambridge
Elliott Hughes5b808042021-10-01 10:56:10 -070011
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42#ifdef HAVE_CONFIG_H
43#include "config.h"
44#endif
45
46/* These defines enable debugging code */
47
48/* #define DEBUG_FRAMES_DISPLAY */
49/* #define DEBUG_SHOW_OPS */
50/* #define DEBUG_SHOW_RMATCH */
51
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070052#ifdef DEBUG_FRAMES_DISPLAY
Elliott Hughes5b808042021-10-01 10:56:10 -070053#include <stdarg.h>
54#endif
55
56/* These defines identify the name of the block containing "static"
57information, and fields within it. */
58
/* NLBLOCK is dereferenced as NLBLOCK->nltype, NLBLOCK->nllen and NLBLOCK->nl
in match(), where "mb" is the match block parameter. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */

#include "pcre2_internal.h"

/* Value held in a frame's current_recurse field while no pattern recursion is
in progress; match() initialises the first frame with it. */

#define RECURSE_UNSET 0xffffffffu  /* Bigger than max group number */
66
67/* Masks for identifying the public options that are permitted at match time. */
68
#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT)

/* The list above minus PCRE2_ANCHORED, PCRE2_ENDANCHORED and PCRE2_NO_JIT.
NOTE(review): presumably the options for which a JIT-compiled match may be
used; the use site is not in this part of the file - confirm there. */

#define PUBLIC_JIT_MATCH_OPTIONS \
   (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
    PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\
    PCRE2_COPY_MATCHED_SUBJECT)
78
79/* Non-error returns from and within the match() function. Error returns are
80externally defined PCRE2_ERROR_xxx codes, which are all negative. */
81
#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns used in the match() function. Make them
sufficiently negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. */
#define MATCH_COMMIT       (-997)
#define MATCH_PRUNE        (-996)
#define MATCH_SKIP         (-995)
#define MATCH_SKIP_ARG     (-994)
#define MATCH_THEN         (-993)
/* Inclusive bounds of that range: MIN is the most negative value (COMMIT) and
MAX the least negative (THEN). */
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
99
100/* Group frame type values. Zero means the frame is not a group frame. The
101lower 16 bits are used for data (e.g. the capture number). Group frames are
102used for most groups so that information about the start is easily available at
103the end without having to scan back through intermediate frames (backtrack
104points). */
105
/* The identity of a group frame lives in the upper 16 bits; match() combines
it with the data using "|", for example (GF_CAPTURE | number). */

#define GF_CAPTURE     0x00010000u
#define GF_NOCAPTURE   0x00020000u
#define GF_CONDASSERT  0x00030000u
#define GF_RECURSE     0x00040000u

/* Masks for the identity and data parts of the group frame type. */

#define GF_IDMASK(a)   ((a) & 0xffff0000u)  /* Keeps only the GF_xxx part */
#define GF_DATAMASK(a) ((a) & 0x0000ffffu)  /* Keeps only the low 16 data bits */
115
116/* Repetition types */
117
enum {
  REPTYPE_MIN,   /* Minimizing (lazy) repeat */
  REPTYPE_MAX,   /* Maximizing (greedy) repeat */
  REPTYPE_POS    /* Possessive repeat */
};

/* Lower bounds for the common repeats. The two range slots are placeholders
only; there is no slot for the possessive range. */

static const uint32_t rep_min[] = {
  0,            /*  0: *  */
  0,            /*  1: *? */
  1,            /*  2: +  */
  1,            /*  3: +? */
  0,            /*  4: ?  */
  0,            /*  5: ?? */
  0,            /*  6: dummy placefiller for OP_CRRANGE */
  0,            /*  7: dummy placefiller for OP_CRMINRANGE */
  0,            /*  8: OP_CRPOSSTAR */
  1,            /*  9: OP_CRPOSPLUS */
  0             /* 10: OP_CRPOSQUERY */
};

/* Upper bounds for the common repeats; UINT32_MAX stands for "no limit". */

static const uint32_t rep_max[] = {
  UINT32_MAX,   /*  0: *  */
  UINT32_MAX,   /*  1: *? */
  UINT32_MAX,   /*  2: +  */
  UINT32_MAX,   /*  3: +? */
  1,            /*  4: ?  */
  1,            /*  5: ?? */
  0,            /*  6: dummy placefiller for OP_CRRANGE */
  0,            /*  7: dummy placefiller for OP_CRMINRANGE */
  UINT32_MAX,   /*  8: OP_CRPOSSTAR */
  UINT32_MAX,   /*  9: OP_CRPOSPLUS */
  1             /* 10: OP_CRPOSQUERY */
};

/* Repetition type for each repeat. Unlike the two tables above, this one has
an entry for OP_CRPOSRANGE as well. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,  /*  0: *  */
  REPTYPE_MIN,  /*  1: *? */
  REPTYPE_MAX,  /*  2: +  */
  REPTYPE_MIN,  /*  3: +? */
  REPTYPE_MAX,  /*  4: ?  */
  REPTYPE_MIN,  /*  5: ?? */
  REPTYPE_MAX,  /*  6: OP_CRRANGE */
  REPTYPE_MIN,  /*  7: OP_CRMINRANGE */
  REPTYPE_POS,  /*  8: OP_CRPOSSTAR */
  REPTYPE_POS,  /*  9: OP_CRPOSPLUS */
  REPTYPE_POS,  /* 10: OP_CRPOSQUERY */
  REPTYPE_POS   /* 11: OP_CRPOSRANGE */
};
146
147/* Numbers for RMATCH calls at backtracking points. When these lists are
148changed, the code at RETURN_SWITCH below must be updated in sync. */
149
/* General backtracking points: RM1 to RM36, numbered consecutively from 1. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36
};

/* Backtracking points that exist only with wide-character support. */

#ifdef SUPPORT_WIDE_CHARS
enum {
  RM100 = 100,
  RM101
};
#endif

/* Backtracking points that exist only with Unicode support: RM200 to RM225. */

#ifdef SUPPORT_UNICODE
enum {
  RM200 = 200, RM201, RM202, RM203, RM204, RM205,
  RM206,       RM207, RM208, RM209, RM210, RM211,
  RM212,       RM213, RM214, RM215, RM216, RM217,
  RM218,       RM219, RM220, RM221, RM222, RM223,
  RM224,       RM225
};
#endif
165
166/* Define short names for general fields in the current backtrack frame, which
167is always pointed to by the F variable. Occasional references to fields in
168other frames are written out explicitly. There are also some fields in the
169current frame whose names start with "temp" that are used for short-term,
170localised backtracking memory. These are #defined with Lxxx names at the point
171of use and undefined afterwards. */
172
/* Each name below expands to a member access through a pointer called F, so
it can be used only where a heapframe pointer with that name is in scope (a
parameter in do_callout() and match_ref(), a local variable in match()). */

#define Fback_frame        F->back_frame
#define Fcapture_last      F->capture_last
#define Fcurrent_recurse   F->current_recurse
#define Fecode             F->ecode
#define Feptr              F->eptr
#define Fgroup_frame_type  F->group_frame_type
#define Flast_group_offset F->last_group_offset
#define Flength            F->length
#define Fmark              F->mark
#define Frdepth            F->rdepth
#define Fstart_match       F->start_match
#define Foffset_top        F->offset_top
#define Foccu              F->occu
#define Fop                F->op
#define Fovector           F->ovector
#define Freturn_id         F->return_id
189
190
191#ifdef DEBUG_FRAMES_DISPLAY
192/*************************************************
193* Display current frames and contents *
194*************************************************/
195
196/* This debugging function displays the current set of frames and their
197contents. It is not called automatically from anywhere, the intention being
198that calls can be inserted where necessary when debugging frame-related
199problems.
200
201Arguments:
202 f the file to write to
203 F the current top frame
204 P a previous frame of interest
205 frame_size the frame size
206 mb points to the match block
207 s identification text
208
209Returns: nothing
210*/
211
212static void
213display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
214 match_block *mb, const char *s, ...)
215{
216uint32_t i;
217heapframe *Q;
218va_list ap;
219va_start(ap, s);
220
221fprintf(f, "FRAMES ");
222vfprintf(f, s, ap);
223va_end(ap);
224
225if (P != NULL) fprintf(f, " P=%lu",
226 ((char *)P - (char *)(mb->match_frames))/frame_size);
227fprintf(f, "\n");
228
229for (i = 0, Q = mb->match_frames;
230 Q <= F;
231 i++, Q = (heapframe *)((char *)Q + frame_size))
232 {
233 fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
234 i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),
235 Q->back_frame, Q->return_id);
236
237 if (Q->last_group_offset == PCRE2_UNSET)
238 fprintf(f, " lgoffset=unset\n");
239 else
240 fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);
241 }
242}
243
244#endif
245
246
247
248/*************************************************
249* Process a callout *
250*************************************************/
251
252/* This function is called for all callouts, whether "standalone" or at the
253start of a conditional group. Feptr will be pointing to either OP_CALLOUT or
254OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
255with fixed values.
256
257Arguments:
258 F points to the current backtracking frame
259 mb points to the match block
260 lengthptr where to return the length of the callout item
261
262Returns: the return from the callout
263 or 0 if no callout function exists
264*/
265
266static int
267do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
268{
269int rc;
270PCRE2_SIZE save0, save1;
271PCRE2_SIZE *callout_ovector;
272pcre2_callout_block *cb;
273
274*lengthptr = (*Fecode == OP_CALLOUT)?
275 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
276
277if (mb->callout == NULL) return 0; /* No callout function provided */
278
279/* The original matching code (pre 10.30) worked directly with the ovector
280passed by the user, and this was passed to callouts. Now that the working
281ovector is in the backtracking frame, it no longer needs to reserve space for
282the overall match offsets (which would waste space in the frame). For backward
283compatibility, however, we pass capture_top and offset_vector to the callout as
284if for the extended ovector, and we ensure that the first two slots are unset
285by preserving and restoring their current contents. Picky compilers complain if
286references such as Fovector[-2] are use directly, so we set up a separate
287pointer. */
288
289callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
290
291/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
292are set externally. The first 3 never change; the last is updated for each
293bumpalong. */
294
295cb = mb->cb;
296cb->capture_top = (uint32_t)Foffset_top/2 + 1;
297cb->capture_last = Fcapture_last;
298cb->offset_vector = callout_ovector;
299cb->mark = mb->nomatch_mark;
300cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
301cb->pattern_position = GET(Fecode, 1);
302cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
303
304if (*Fecode == OP_CALLOUT) /* Numerical callout */
305 {
306 cb->callout_number = Fecode[1 + 2*LINK_SIZE];
307 cb->callout_string_offset = 0;
308 cb->callout_string = NULL;
309 cb->callout_string_length = 0;
310 }
311else /* String callout */
312 {
313 cb->callout_number = 0;
314 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
315 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
316 cb->callout_string_length =
317 *lengthptr - (1 + 4*LINK_SIZE) - 2;
318 }
319
320save0 = callout_ovector[0];
321save1 = callout_ovector[1];
322callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
323rc = mb->callout(cb, mb->callout_data);
324callout_ovector[0] = save0;
325callout_ovector[1] = save1;
326cb->callout_flags = 0;
327return rc;
328}
329
330
331
332/*************************************************
333* Match a back-reference *
334*************************************************/
335
336/* This function is called only when it is known that the offset lies within
337the offsets that have so far been used in the match. Note that in caseless
338UTF-8 mode, the number of subject bytes matched may be different to the number
339of reference bytes. (In theory this could also happen in UTF-16 mode, but it
340seems unlikely.)
341
342Arguments:
343 offset index into the offset vector
344 caseless TRUE if caseless
345 F the current backtracking frame pointer
346 mb points to match block
347 lengthptr pointer for returning the length matched
348
349Returns: = 0 sucessful match; number of code units matched is set
350 < 0 no match
351 > 0 partial match
352*/
353
354static int
355match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
356 PCRE2_SIZE *lengthptr)
357{
358PCRE2_SPTR p;
359PCRE2_SIZE length;
360PCRE2_SPTR eptr;
361PCRE2_SPTR eptr_start;
362
363/* Deal with an unset group. The default is no match, but there is an option to
364match an empty string. */
365
366if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
367 {
368 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
369 {
370 *lengthptr = 0;
371 return 0; /* Match */
372 }
373 else return -1; /* No match */
374 }
375
376/* Separate the caseless and UTF cases for speed. */
377
378eptr = eptr_start = Feptr;
379p = mb->start_subject + Fovector[offset];
380length = Fovector[offset+1] - Fovector[offset];
381
382if (caseless)
383 {
384#if defined SUPPORT_UNICODE
385 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
386
387 if (utf || (mb->poptions & PCRE2_UCP) != 0)
388 {
389 PCRE2_SPTR endptr = p + length;
390
391 /* Match characters up to the end of the reference. NOTE: the number of
392 code units matched may differ, because in UTF-8 there are some characters
393 whose upper and lower case codes have different numbers of bytes. For
394 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
395 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
396 sequence of two of the latter. It is important, therefore, to check the
397 length along the reference, not along the subject (earlier code did this
398 wrong). UCP without uses Unicode properties but without UTF encoding. */
399
400 while (p < endptr)
401 {
402 uint32_t c, d;
403 const ucd_record *ur;
404 if (eptr >= mb->end_subject) return 1; /* Partial match */
405
406 if (utf)
407 {
408 GETCHARINC(c, eptr);
409 GETCHARINC(d, p);
410 }
411 else
412 {
413 c = *eptr++;
414 d = *p++;
415 }
416
417 ur = GET_UCD(d);
418 if (c != d && c != (uint32_t)((int)d + ur->other_case))
419 {
420 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
421 for (;;)
422 {
423 if (c < *pp) return -1; /* No match */
424 if (c == *pp++) break;
425 }
426 }
427 }
428 }
429 else
430#endif
431
432 /* Not in UTF or UCP mode */
433 {
434 for (; length > 0; length--)
435 {
436 uint32_t cc, cp;
437 if (eptr >= mb->end_subject) return 1; /* Partial match */
438 cc = UCHAR21TEST(eptr);
439 cp = UCHAR21TEST(p);
440 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
441 return -1; /* No match */
442 p++;
443 eptr++;
444 }
445 }
446 }
447
448/* In the caseful case, we can just compare the code units, whether or not we
449are in UTF and/or UCP mode. When partial matching, we have to do this unit by
450unit. */
451
452else
453 {
454 if (mb->partial != 0)
455 {
456 for (; length > 0; length--)
457 {
458 if (eptr >= mb->end_subject) return 1; /* Partial match */
459 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
460 }
461 }
462
463 /* Not partial matching */
464
465 else
466 {
467 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
468 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
469 eptr += length;
470 }
471 }
472
473*lengthptr = eptr - eptr_start;
474return 0; /* Match */
475}
476
477
478
479/******************************************************************************
480*******************************************************************************
481 "Recursion" in the match() function
482
483The original match() function was highly recursive, but this proved to be the
484source of a number of problems over the years, mostly because of the relatively
485small system stacks that are commonly found. As new features were added to
486patterns, various kludges were invented to reduce the amount of stack used,
487making the code hard to understand in places.
488
489A version did exist that used individual frames on the heap instead of calling
490match() recursively, but this ran substantially slower. The current version is
491a refactoring that uses a vector of frames to remember backtracking points.
492This runs no slower, and possibly even a bit faster than the original recursive
493implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
49450 frames) is allocated on the system stack. If this is not big enough, the
495heap is used for a larger vector.
496
497*******************************************************************************
498******************************************************************************/
499
500
501
502
503/*************************************************
504* Macros for the match() function *
505*************************************************/
506
507/* These macros pack up tests that are used for partial matching several times
508in the code. The second one is used when we already know we are past the end of
509the subject. We set the "hit end" flag if the pointer is at the end of the
510subject and either (a) the pointer is past the earliest inspected character
511(i.e. something has been matched, even if not part of the actual matched
512string), or (b) the pattern contains a lookbehind. These are the conditions for
513which adding more characters may allow the current match to continue.
514
515For hard partial matching, we immediately return a partial match. Otherwise,
516carrying on means that a complete match on the current subject will be sought.
517A partial match is returned only if no complete match can be found. */
518
/* CHECK_PARTIAL() applies SCHECK_PARTIAL() only when Feptr has reached (or
passed) the end of the subject. It expands to a bare "if" statement rather than
a do/while(0) wrapper, so it must not be used as the unbraced body of an "if"
that has an "else". */

#define CHECK_PARTIAL()\
  if (Feptr >= mb->end_subject) \
    { \
    SCHECK_PARTIAL(); \
    }

/* SCHECK_PARTIAL() does not test the subject position itself. When partial
matching is enabled and either Feptr is beyond mb->start_used_ptr or
mb->allowemptypartial is set, it sets mb->hitend and, if mb->partial is greater
than 1, returns PCRE2_ERROR_PARTIAL from the function in which it is expanded.
Like CHECK_PARTIAL(), it is a bare "if" statement. */

#define SCHECK_PARTIAL()\
  if (mb->partial != 0 && \
      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }
532
533
534/* These macros are used to implement backtracking. They simulate a recursive
535call to the match() function by means of a local vector of frames which
536remember the backtracking points. */
537
/* RMATCH(ra,rb) asks for a new backtracking frame. ra is the code pointer at
which processing of the new frame is to start; rb is one of the RMxx
identifiers. rb is stored in the current frame's return_id field and is also
pasted into the label L_<rb>, which marks the point immediately after the macro
expansion. Control leaves through "goto MATCH_RECURSE".
NOTE(review): the jump back to L_<rb> is presumably made by the code at
RETURN_SWITCH, which is not in this part of the file - confirm there. Because
rb becomes part of a label, each RMxx value can be used at only one place. */

#define RMATCH(ra,rb)\
  {\
  start_ecode = ra;\
  Freturn_id = rb;\
  goto MATCH_RECURSE;\
  L_##rb:;\
  }

/* RRETURN(ra) ends processing of the current frame with result ra: the value
is placed in rrc and control jumps to RETURN_SWITCH. */

#define RRETURN(ra)\
  {\
  rrc = ra;\
  goto RETURN_SWITCH;\
  }
551
552
553
554/*************************************************
555* Match from current position *
556*************************************************/
557
558/* This function is called to run one match attempt at a single starting point
559in the subject.
560
561Performance note: It might be tempting to extract commonly used fields from the
562mb structure (e.g. end_subject) into individual variables to improve
563performance. Tests using gcc on a SPARC disproved this; in the first case, it
564made performance worse.
565
566Arguments:
567 start_eptr starting character in subject
568 start_ecode starting position in compiled code
569 ovector pointer to the final output vector
570 oveccount number of pairs in ovector
571 top_bracket number of capturing parentheses in the pattern
572 frame_size size of each backtracking frame
573 mb pointer to "static" variables block
574
575Returns: MATCH_MATCH if matched ) these values are >= 0
576 MATCH_NOMATCH if failed to match )
577 negative MATCH_xxx value for PRUNE, SKIP, etc
578 negative PCRE2_ERROR_xxx value if aborted by an error condition
579 (e.g. stopped by repeated call or depth limit)
580*/
581
582static int
583match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
584 uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
585 match_block *mb)
586{
587/* Frame-handling variables */
588
589heapframe *F; /* Current frame pointer */
590heapframe *N = NULL; /* Temporary frame pointers */
591heapframe *P = NULL;
592heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
593PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
594
/* Local variables that do not need to be preserved over calls to RMATCH(). */
596
597PCRE2_SPTR bracode; /* Temp pointer to start of group */
598PCRE2_SIZE offset; /* Used for group offsets */
599PCRE2_SIZE length; /* Used for various length calculations */
600
601int rrc; /* Return from functions & backtracking "recursions" */
602#ifdef SUPPORT_UNICODE
603int proptype; /* Type of character property */
604#endif
605
606uint32_t i; /* Used for local loops */
607uint32_t fc; /* Character values */
608uint32_t number; /* Used for group and other numbers */
609uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
610uint32_t group_frame_type; /* Specifies type for new group frames */
611
612BOOL condition; /* Used in conditional groups */
613BOOL cur_is_word; /* Used in "word" tests */
614BOOL prev_is_word; /* Used in "word" tests */
615
616/* UTF and UCP flags */
617
618#ifdef SUPPORT_UNICODE
619BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
620BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
621#else
622BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
623#endif
624
625/* This is the length of the last part of a backtracking frame that must be
626copied when a new frame is created. */
627
628frame_copy_size = frame_size - offsetof(heapframe, eptr);
629
630/* Set up the first current frame at the start of the vector, and initialize
631fields that are not reset for new frames. */
632
633F = mb->match_frames;
634Frdepth = 0; /* "Recursion" depth */
635Fcapture_last = 0; /* Number of most recent capture */
636Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
637Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
638Fmark = NULL; /* Most recent mark */
639Foffset_top = 0; /* End of captures within the frame */
640Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
641group_frame_type = 0; /* Not a start of group frame */
642goto NEW_FRAME; /* Start processing with this frame */
643
644/* Come back here when we want to create a new frame for remembering a
645backtracking point. */
646
647MATCH_RECURSE:
648
649/* Set up a new backtracking frame. If the vector is full, get a new one
650on the heap, doubling the size, but constrained by the heap limit. */
651
652N = (heapframe *)((char *)F + frame_size);
653if (N >= mb->match_frames_top)
654 {
655 PCRE2_SIZE newsize = mb->frame_vector_size * 2;
656 heapframe *new;
657
658 if ((newsize / 1024) > mb->heap_limit)
659 {
660 PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
661 if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
662 newsize = maxsize;
663 }
664
665 new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
666 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
667 memcpy(new, mb->match_frames, mb->frame_vector_size);
668
669 F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
670 N = (heapframe *)((char *)F + frame_size);
671
672 if (mb->match_frames != mb->stack_frames)
673 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
674 mb->match_frames = new;
675 mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
676 mb->frame_vector_size = newsize;
677 }
678
679#ifdef DEBUG_SHOW_RMATCH
680fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
681if (group_frame_type != 0)
682 {
683 fprintf(stderr, " type=%x ", group_frame_type);
684 switch (GF_IDMASK(group_frame_type))
685 {
686 case GF_CAPTURE:
687 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
688 break;
689
690 case GF_NOCAPTURE:
691 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
692 break;
693
694 case GF_CONDASSERT:
695 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
696 break;
697
698 case GF_RECURSE:
699 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
700 break;
701
702 default:
703 fprintf(stderr, "*** unknown ***");
704 break;
705 }
706 }
707fprintf(stderr, "\n");
708#endif
709
710/* Copy those fields that must be copied into the new frame, increase the
711"recursion" depth (i.e. the new frame's index) and then make the new frame
712current. */
713
714memcpy((char *)N + offsetof(heapframe, eptr),
715 (char *)F + offsetof(heapframe, eptr),
716 frame_copy_size);
717
718N->rdepth = Frdepth + 1;
719F = N;
720
721/* Carry on processing with a new frame. */
722
723NEW_FRAME:
724Fgroup_frame_type = group_frame_type;
725Fecode = start_ecode; /* Starting code pointer */
726Fback_frame = frame_size; /* Default is go back one frame */
727
728/* If this is a special type of group frame, remember its offset for quick
729access at the end of the group. If this is a recursion, set a new current
730recursion value. */
731
732if (group_frame_type != 0)
733 {
734 Flast_group_offset = (char *)F - (char *)mb->match_frames;
735 if (GF_IDMASK(group_frame_type) == GF_RECURSE)
736 Fcurrent_recurse = GF_DATAMASK(group_frame_type);
737 group_frame_type = 0;
738 }
739
740
741/* ========================================================================= */
742/* This is the main processing loop. First check that we haven't recorded too
743many backtracks (search tree is too large), or that we haven't exceeded the
744recursive depth limit (used too many backtracking frames). If not, process the
745opcodes. */
746
747if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
748if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
749
750for (;;)
751 {
752#ifdef DEBUG_SHOW_OPS
753fprintf(stderr, "++ op=%d\n", *Fecode);
754#endif
755
756 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
757 switch(Fop)
758 {
759 /* ===================================================================== */
760 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
761 any currently open capturing brackets. Unlike reaching the end of a group,
762 where we know the starting frame is at the top of the chained frames, in
763 this case we have to search back for the relevant frame in case other types
764 of group that use chained frames have intervened. Multiple OP_CLOSEs always
765 come innermost first, which matches the chain order. We can ignore this in
766 a recursion, because captures are not passed out of recursions. */
767
768 case OP_CLOSE:
769 if (Fcurrent_recurse == RECURSE_UNSET)
770 {
771 number = GET2(Fecode, 1);
772 offset = Flast_group_offset;
773 for(;;)
774 {
775 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
776 N = (heapframe *)((char *)mb->match_frames + offset);
777 P = (heapframe *)((char *)N - frame_size);
778 if (N->group_frame_type == (GF_CAPTURE | number)) break;
779 offset = P->last_group_offset;
780 }
781 offset = (number << 1) - 2;
782 Fcapture_last = number;
783 Fovector[offset] = P->eptr - mb->start_subject;
784 Fovector[offset+1] = Feptr - mb->start_subject;
785 if (offset >= Foffset_top) Foffset_top = offset + 2;
786 }
787 Fecode += PRIV(OP_lengths)[*Fecode];
788 break;
789
790
791 /* ===================================================================== */
792 /* Real or forced end of the pattern, assertion, or recursion. In an
793 assertion ACCEPT, update the last used pointer and remember the current
794 frame so that the captures and mark can be fished out of it. */
795
796 case OP_ASSERT_ACCEPT:
797 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
798 assert_accept_frame = F;
799 RRETURN(MATCH_ACCEPT);
800
801 /* If recursing, we have to find the most recent recursion. */
802
803 case OP_ACCEPT:
804 case OP_END:
805
806 /* Handle end of a recursion. */
807
808 if (Fcurrent_recurse != RECURSE_UNSET)
809 {
810 offset = Flast_group_offset;
811 for(;;)
812 {
813 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
814 N = (heapframe *)((char *)mb->match_frames + offset);
815 P = (heapframe *)((char *)N - frame_size);
816 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
817 offset = P->last_group_offset;
818 }
819
820 /* N is now the frame of the recursion; the previous frame is at the
821 OP_RECURSE position. Go back there, copying the current subject position
822 and mark, and the start_match position (\K might have changed it), and
823 then move on past the OP_RECURSE. */
824
825 P->eptr = Feptr;
826 P->mark = Fmark;
827 P->start_match = Fstart_match;
828 F = P;
829 Fecode += 1 + LINK_SIZE;
830 continue;
831 }
832
833 /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
834 is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
835 start of the subject. In both cases, backtracking will then try other
836 alternatives, if any. */
837
838 if (Feptr == Fstart_match &&
839 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
840 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
841 Fstart_match == mb->start_subject + mb->start_offset)))
842 RRETURN(MATCH_NOMATCH);
843
844 /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
845 the end of the subject. After (*ACCEPT) we fail the entire match (at this
846 position) but backtrack on reaching the end of the pattern. */
847
848 if (Feptr < mb->end_subject &&
849 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
850 {
851 if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
852 return MATCH_NOMATCH;
853 }
854
855 /* We have a successful match of the whole pattern. Record the result and
856 then do a direct return from the function. If there is space in the offset
857 vector, set any pairs that follow the highest-numbered captured string but
858 are less than the number of capturing groups in the pattern to PCRE2_UNSET.
859 It is documented that this happens. "Gaps" are set to PCRE2_UNSET
860 dynamically. It is only those at the end that need setting here. */
861
862 mb->end_match_ptr = Feptr; /* Record where we ended */
863 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
864 mb->mark = Fmark; /* and the last success mark */
865 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
866
867 ovector[0] = Fstart_match - mb->start_subject;
868 ovector[1] = Feptr - mb->start_subject;
869
870 /* Set i to the smaller of the sizes of the external and frame ovectors. */
871
872 i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
873 memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
874 while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
875 return MATCH_MATCH; /* Note: NOT RRETURN */
876
877
878 /*===================================================================== */
879 /* Match any single character type except newline; have to take care with
880 CRLF newlines and partial matching. */
881
882 case OP_ANY:
883 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
884 if (mb->partial != 0 &&
885 Feptr == mb->end_subject - 1 &&
886 NLBLOCK->nltype == NLTYPE_FIXED &&
887 NLBLOCK->nllen == 2 &&
888 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
889 {
890 mb->hitend = TRUE;
891 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
892 }
893 /* Fall through */
894
895 /* Match any single character whatsoever. */
896
897 case OP_ALLANY:
898 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
899 { /* not be updated before SCHECK_PARTIAL. */
900 SCHECK_PARTIAL();
901 RRETURN(MATCH_NOMATCH);
902 }
903 Feptr++;
904#ifdef SUPPORT_UNICODE
905 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
906#endif
907 Fecode++;
908 break;
909
910
911 /* ===================================================================== */
912 /* Match a single code unit, even in UTF mode. This opcode really does
913 match any code unit, even newline. (It really should be called ANYCODEUNIT,
914 of course - the byte name is from pre-16 bit days.) */
915
916 case OP_ANYBYTE:
917 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
918 { /* not be updated before SCHECK_PARTIAL. */
919 SCHECK_PARTIAL();
920 RRETURN(MATCH_NOMATCH);
921 }
922 Feptr++;
923 Fecode++;
924 break;
925
926
927 /* ===================================================================== */
928 /* Match a single character, casefully */
929
930 case OP_CHAR:
931#ifdef SUPPORT_UNICODE
932 if (utf)
933 {
934 Flength = 1;
935 Fecode++;
936 GETCHARLEN(fc, Fecode, Flength);
937 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
938 {
939 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
940 RRETURN(MATCH_NOMATCH);
941 }
942 for (; Flength > 0; Flength--)
943 {
944 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
945 }
946 }
947 else
948#endif
949
950 /* Not UTF mode */
951 {
952 if (mb->end_subject - Feptr < 1)
953 {
954 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
955 RRETURN(MATCH_NOMATCH);
956 }
957 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
958 Fecode += 2;
959 }
960 break;
961
962
963 /* ===================================================================== */
964 /* Match a single character, caselessly. If we are at the end of the
965 subject, give up immediately. We get here only when the pattern character
966 has at most one other case. Characters with more than two cases are coded
967 as OP_PROP with the pseudo-property PT_CLIST. */
968
969 case OP_CHARI:
970 if (Feptr >= mb->end_subject)
971 {
972 SCHECK_PARTIAL();
973 RRETURN(MATCH_NOMATCH);
974 }
975
976#ifdef SUPPORT_UNICODE
977 if (utf)
978 {
979 Flength = 1;
980 Fecode++;
981 GETCHARLEN(fc, Fecode, Flength);
982
983 /* If the pattern character's value is < 128, we know that its other case
984 (if any) is also < 128 (and therefore only one code unit long in all
985 code-unit widths), so we can use the fast lookup table. We checked above
986 that there is at least one character left in the subject. */
987
988 if (fc < 128)
989 {
990 uint32_t cc = UCHAR21(Feptr);
991 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
992 Fecode++;
993 Feptr++;
994 }
995
996 /* Otherwise we must pick up the subject character and use Unicode
997 property support to test its other case. Note that we cannot use the
998 value of "Flength" to check for sufficient bytes left, because the other
999 case of the character may have more or fewer code units. */
1000
1001 else
1002 {
1003 uint32_t dc;
1004 GETCHARINC(dc, Feptr);
1005 Fecode += Flength;
1006 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1007 }
1008 }
1009
1010 /* If UCP is set without UTF we must do the same as above, but with one
1011 character per code unit. */
1012
1013 else if (ucp)
1014 {
1015 uint32_t cc = UCHAR21(Feptr);
1016 fc = Fecode[1];
1017 if (fc < 128)
1018 {
1019 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1020 }
1021 else
1022 {
1023 if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1024 }
1025 Feptr++;
1026 Fecode += 2;
1027 }
1028
1029 else
1030#endif /* SUPPORT_UNICODE */
1031
1032 /* Not UTF or UCP mode; use the table for characters < 256. */
1033 {
1034 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
1035 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
1036 Feptr++;
1037 Fecode += 2;
1038 }
1039 break;
1040
1041
1042 /* ===================================================================== */
1043 /* Match a single character that is not the given pattern character (caselessly for OP_NOTI). */
1044
1045 case OP_NOT:
1046 case OP_NOTI:
1047 if (Feptr >= mb->end_subject)
1048 {
1049 SCHECK_PARTIAL();
1050 RRETURN(MATCH_NOMATCH);
1051 }
1052
1053#ifdef SUPPORT_UNICODE
1054 if (utf)
1055 {
1056 uint32_t ch;
1057 Fecode++;
1058 GETCHARINC(ch, Fecode);
1059 GETCHARINC(fc, Feptr);
1060 if (ch == fc)
1061 {
1062 RRETURN(MATCH_NOMATCH); /* Caseful match */
1063 }
1064 else if (Fop == OP_NOTI) /* If caseless */
1065 {
1066 if (ch > 127)
1067 ch = UCD_OTHERCASE(ch);
1068 else
1069 ch = (mb->fcc)[ch];
1070 if (ch == fc) RRETURN(MATCH_NOMATCH);
1071 }
1072 }
1073
1074 /* UCP without UTF is as above, but with one character per code unit. */
1075
1076 else if (ucp)
1077 {
1078 uint32_t ch;
1079 fc = UCHAR21INC(Feptr);
1080 ch = Fecode[1];
1081 Fecode += 2;
1082
1083 if (ch == fc)
1084 {
1085 RRETURN(MATCH_NOMATCH); /* Caseful match */
1086 }
1087 else if (Fop == OP_NOTI) /* If caseless */
1088 {
1089 if (ch > 127)
1090 ch = UCD_OTHERCASE(ch);
1091 else
1092 ch = (mb->fcc)[ch];
1093 if (ch == fc) RRETURN(MATCH_NOMATCH);
1094 }
1095 }
1096
1097 else
1098#endif /* SUPPORT_UNICODE */
1099
1100 /* Neither UTF nor UCP is set */
1101
1102 {
1103 uint32_t ch = Fecode[1];
1104 fc = UCHAR21INC(Feptr);
1105 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1106 RRETURN(MATCH_NOMATCH);
1107 Fecode += 2;
1108 }
1109 break;
1110
1111
1112 /* ===================================================================== */
1113 /* Match a single character repeatedly. */
1114
/* Local aliases for fields of the current frame F, used by the repeated
single-character opcodes that follow and removed again by the matching #undef
lines after that code. The code below calls these "preserved" variables;
presumably that means they keep their values across RMATCH() - confirm against
the macro definition.

  Loclength    length in code units of the other-case encoding of a
               multi-code-unit character; 0 means there is no other case to try
  Lstart_eptr  subject position at which a maximizing scan started; backtracking
               stops when Feptr gets back to it
  Lcharptr     pointer to the pattern character's code units, compared against
               the subject with memcmp()
  Lmin, Lmax   repeat bounds; Lmax is UINT32_MAX for an unlimited repeat
  Lc, Loc      the single-code-unit pattern character and its other case (Loc is
               set only for the caseless opcodes) */

1115#define Loclength F->temp_size
1116#define Lstart_eptr F->temp_sptr[0]
1117#define Lcharptr F->temp_sptr[1]
1118#define Lmin F->temp_32[0]
1119#define Lmax F->temp_32[1]
1120#define Lc F->temp_32[2]
1121#define Loc F->temp_32[3]
1122
1123 case OP_EXACT:
1124 case OP_EXACTI:
1125 Lmin = Lmax = GET2(Fecode, 1);
1126 Fecode += 1 + IMM2_SIZE;
1127 goto REPEATCHAR;
1128
1129 case OP_POSUPTO:
1130 case OP_POSUPTOI:
1131 reptype = REPTYPE_POS;
1132 Lmin = 0;
1133 Lmax = GET2(Fecode, 1);
1134 Fecode += 1 + IMM2_SIZE;
1135 goto REPEATCHAR;
1136
1137 case OP_UPTO:
1138 case OP_UPTOI:
1139 reptype = REPTYPE_MAX;
1140 Lmin = 0;
1141 Lmax = GET2(Fecode, 1);
1142 Fecode += 1 + IMM2_SIZE;
1143 goto REPEATCHAR;
1144
1145 case OP_MINUPTO:
1146 case OP_MINUPTOI:
1147 reptype = REPTYPE_MIN;
1148 Lmin = 0;
1149 Lmax = GET2(Fecode, 1);
1150 Fecode += 1 + IMM2_SIZE;
1151 goto REPEATCHAR;
1152
1153 case OP_POSSTAR:
1154 case OP_POSSTARI:
1155 reptype = REPTYPE_POS;
1156 Lmin = 0;
1157 Lmax = UINT32_MAX;
1158 Fecode++;
1159 goto REPEATCHAR;
1160
1161 case OP_POSPLUS:
1162 case OP_POSPLUSI:
1163 reptype = REPTYPE_POS;
1164 Lmin = 1;
1165 Lmax = UINT32_MAX;
1166 Fecode++;
1167 goto REPEATCHAR;
1168
1169 case OP_POSQUERY:
1170 case OP_POSQUERYI:
1171 reptype = REPTYPE_POS;
1172 Lmin = 0;
1173 Lmax = 1;
1174 Fecode++;
1175 goto REPEATCHAR;
1176
1177 case OP_STAR:
1178 case OP_STARI:
1179 case OP_MINSTAR:
1180 case OP_MINSTARI:
1181 case OP_PLUS:
1182 case OP_PLUSI:
1183 case OP_MINPLUS:
1184 case OP_MINPLUSI:
1185 case OP_QUERY:
1186 case OP_QUERYI:
1187 case OP_MINQUERY:
1188 case OP_MINQUERYI:
1189 fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1190 Lmin = rep_min[fc];
1191 Lmax = rep_max[fc];
1192 reptype = rep_typ[fc];
1193
1194 /* Common code for all repeated single-character matches. We first check
1195 for the minimum number of characters. If the minimum equals the maximum, we
1196 are done. Otherwise, if minimizing, check the rest of the pattern for a
1197 match; if there isn't one, advance up to the maximum, one character at a
1198 time.
1199
1200 If maximizing, advance up to the maximum number of matching characters,
1201 until Feptr is past the end of the maximum run. If possessive, we are
1202 then done (no backing up). Otherwise, match at this position; anything
1203 other than no match is immediately returned. For nomatch, back up one
1204 character, unless we are matching \R and the last thing matched was
1205 \r\n, in which case, back up two code units until we reach the first
1206 optional character position.
1207
1208 The various UTF/non-UTF and caseful/caseless cases are handled separately,
1209 for speed. */
1210
1211 REPEATCHAR:
1212#ifdef SUPPORT_UNICODE
1213 if (utf)
1214 {
1215 Flength = 1;
1216 Lcharptr = Fecode;
1217 GETCHARLEN(fc, Fecode, Flength);
1218 Fecode += Flength;
1219
1220 /* Handle multi-code-unit character matching, caseful and caseless. */
1221
1222 if (Flength > 1)
1223 {
1224 uint32_t othercase;
1225
1226 if (Fop >= OP_STARI && /* Caseless */
1227 (othercase = UCD_OTHERCASE(fc)) != fc)
1228 Loclength = PRIV(ord2utf)(othercase, Foccu);
1229 else Loclength = 0;
1230
1231 for (i = 1; i <= Lmin; i++)
1232 {
1233 if (Feptr <= mb->end_subject - Flength &&
1234 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1235 else if (Loclength > 0 &&
1236 Feptr <= mb->end_subject - Loclength &&
1237 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1238 Feptr += Loclength;
1239 else
1240 {
1241 CHECK_PARTIAL();
1242 RRETURN(MATCH_NOMATCH);
1243 }
1244 }
1245
1246 if (Lmin == Lmax) continue;
1247
1248 if (reptype == REPTYPE_MIN)
1249 {
1250 for (;;)
1251 {
1252 RMATCH(Fecode, RM202);
1253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1254 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1255 if (Feptr <= mb->end_subject - Flength &&
1256 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1257 else if (Loclength > 0 &&
1258 Feptr <= mb->end_subject - Loclength &&
1259 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1260 Feptr += Loclength;
1261 else
1262 {
1263 CHECK_PARTIAL();
1264 RRETURN(MATCH_NOMATCH);
1265 }
1266 }
1267 /* Control never gets here */
1268 }
1269
1270 else /* Maximize */
1271 {
1272 Lstart_eptr = Feptr;
1273 for (i = Lmin; i < Lmax; i++)
1274 {
1275 if (Feptr <= mb->end_subject - Flength &&
1276 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
1277 Feptr += Flength;
1278 else if (Loclength > 0 &&
1279 Feptr <= mb->end_subject - Loclength &&
1280 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1281 Feptr += Loclength;
1282 else
1283 {
1284 CHECK_PARTIAL();
1285 break;
1286 }
1287 }
1288
1289 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1290 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1291 go too far. */
1292
1293 if (reptype != REPTYPE_POS) for(;;)
1294 {
1295 if (Feptr <= Lstart_eptr) break;
1296 RMATCH(Fecode, RM203);
1297 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1298 Feptr--;
1299 BACKCHAR(Feptr);
1300 }
1301 }
1302 break; /* End of repeated wide character handling */
1303 }
1304
1305 /* Length of UTF character is 1. Put it into the preserved variable and
1306 fall through to the non-UTF code. */
1307
1308 Lc = fc;
1309 }
1310 else
1311#endif /* SUPPORT_UNICODE */
1312
1313 /* When not in UTF mode, load a single-code-unit character. Then proceed as
1314 above, using Unicode casing if either UTF or UCP is set. */
1315
1316 Lc = *Fecode++;
1317
1318 /* Caseless comparison */
1319
1320 if (Fop >= OP_STARI)
1321 {
1322#if PCRE2_CODE_UNIT_WIDTH == 8
1323#ifdef SUPPORT_UNICODE
1324 if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1325 else
1326#endif /* SUPPORT_UNICODE */
1327 /* Lc will be < 128 in UTF-8 mode. */
1328 Loc = mb->fcc[Lc];
1329#else /* 16-bit & 32-bit */
1330#ifdef SUPPORT_UNICODE
1331 if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1332 else
1333#endif /* SUPPORT_UNICODE */
1334 Loc = TABLE_GET(Lc, mb->fcc, Lc);
1335#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1336
1337 for (i = 1; i <= Lmin; i++)
1338 {
1339 uint32_t cc; /* Faster than PCRE2_UCHAR */
1340 if (Feptr >= mb->end_subject)
1341 {
1342 SCHECK_PARTIAL();
1343 RRETURN(MATCH_NOMATCH);
1344 }
1345 cc = UCHAR21TEST(Feptr);
1346 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1347 Feptr++;
1348 }
1349 if (Lmin == Lmax) continue;
1350
1351 if (reptype == REPTYPE_MIN)
1352 {
1353 for (;;)
1354 {
1355 uint32_t cc; /* Faster than PCRE2_UCHAR */
1356 RMATCH(Fecode, RM25);
1357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1358 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1359 if (Feptr >= mb->end_subject)
1360 {
1361 SCHECK_PARTIAL();
1362 RRETURN(MATCH_NOMATCH);
1363 }
1364 cc = UCHAR21TEST(Feptr);
1365 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1366 Feptr++;
1367 }
1368 /* Control never gets here */
1369 }
1370
1371 else /* Maximize */
1372 {
1373 Lstart_eptr = Feptr;
1374 for (i = Lmin; i < Lmax; i++)
1375 {
1376 uint32_t cc; /* Faster than PCRE2_UCHAR */
1377 if (Feptr >= mb->end_subject)
1378 {
1379 SCHECK_PARTIAL();
1380 break;
1381 }
1382 cc = UCHAR21TEST(Feptr);
1383 if (Lc != cc && Loc != cc) break;
1384 Feptr++;
1385 }
1386 if (reptype != REPTYPE_POS) for (;;)
1387 {
1388 if (Feptr == Lstart_eptr) break;
1389 RMATCH(Fecode, RM26);
1390 Feptr--;
1391 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1392 }
1393 }
1394 }
1395
1396 /* Caseful comparisons (single-code-unit characters only; multi-code-unit UTF characters were fully handled above) */
1397
1398 else
1399 {
1400 for (i = 1; i <= Lmin; i++)
1401 {
1402 if (Feptr >= mb->end_subject)
1403 {
1404 SCHECK_PARTIAL();
1405 RRETURN(MATCH_NOMATCH);
1406 }
1407 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1408 }
1409
1410 if (Lmin == Lmax) continue;
1411
1412 if (reptype == REPTYPE_MIN)
1413 {
1414 for (;;)
1415 {
1416 RMATCH(Fecode, RM27);
1417 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1418 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1419 if (Feptr >= mb->end_subject)
1420 {
1421 SCHECK_PARTIAL();
1422 RRETURN(MATCH_NOMATCH);
1423 }
1424 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1425 }
1426 /* Control never gets here */
1427 }
1428 else /* Maximize */
1429 {
1430 Lstart_eptr = Feptr;
1431 for (i = Lmin; i < Lmax; i++)
1432 {
1433 if (Feptr >= mb->end_subject)
1434 {
1435 SCHECK_PARTIAL();
1436 break;
1437 }
1438
1439 if (Lc != UCHAR21TEST(Feptr)) break;
1440 Feptr++;
1441 }
1442
1443 if (reptype != REPTYPE_POS) for (;;)
1444 {
1445 if (Feptr <= Lstart_eptr) break;
1446 RMATCH(Fecode, RM28);
1447 Feptr--;
1448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1449 }
1450 }
1451 }
1452 break;
1453
1454#undef Loclength
1455#undef Lstart_eptr
1456#undef Lcharptr
1457#undef Lmin
1458#undef Lmax
1459#undef Lc
1460#undef Loc
1461
1462
1463 /* ===================================================================== */
1464 /* Match a negated single character repeatedly. This is almost a
1465 repeat of the code for a repeated single character, but I haven't found a
1466 nice way of commoning these up that doesn't require a test of the
1467 positive/negative option for each character match. Maybe that wouldn't add
1468 very much to the time taken, but character matching *is* what this is all
1469 about... */
1470
/* Local aliases for fields of the current frame F, used by the repeated
negated-character opcodes that follow and removed by the matching #undef lines
after that code.

  Lstart_eptr  subject position at which a maximizing scan started; backtracking
               stops when Feptr gets back to it
  Lmin, Lmax   repeat bounds; Lmax is UINT32_MAX for an unlimited repeat
  Lc           the pattern character that the subject must NOT match
  Loc          the other case of Lc (set only for the caseless opcodes) */

1471#define Lstart_eptr F->temp_sptr[0]
1472#define Lmin F->temp_32[0]
1473#define Lmax F->temp_32[1]
1474#define Lc F->temp_32[2]
1475#define Loc F->temp_32[3]
1476
1477 case OP_NOTEXACT:
1478 case OP_NOTEXACTI:
1479 Lmin = Lmax = GET2(Fecode, 1);
1480 Fecode += 1 + IMM2_SIZE;
1481 goto REPEATNOTCHAR;
1482
1483 case OP_NOTUPTO:
1484 case OP_NOTUPTOI:
1485 Lmin = 0;
1486 Lmax = GET2(Fecode, 1);
1487 reptype = REPTYPE_MAX;
1488 Fecode += 1 + IMM2_SIZE;
1489 goto REPEATNOTCHAR;
1490
1491 case OP_NOTMINUPTO:
1492 case OP_NOTMINUPTOI:
1493 Lmin = 0;
1494 Lmax = GET2(Fecode, 1);
1495 reptype = REPTYPE_MIN;
1496 Fecode += 1 + IMM2_SIZE;
1497 goto REPEATNOTCHAR;
1498
1499 case OP_NOTPOSSTAR:
1500 case OP_NOTPOSSTARI:
1501 reptype = REPTYPE_POS;
1502 Lmin = 0;
1503 Lmax = UINT32_MAX;
1504 Fecode++;
1505 goto REPEATNOTCHAR;
1506
1507 case OP_NOTPOSPLUS:
1508 case OP_NOTPOSPLUSI:
1509 reptype = REPTYPE_POS;
1510 Lmin = 1;
1511 Lmax = UINT32_MAX;
1512 Fecode++;
1513 goto REPEATNOTCHAR;
1514
1515 case OP_NOTPOSQUERY:
1516 case OP_NOTPOSQUERYI:
1517 reptype = REPTYPE_POS;
1518 Lmin = 0;
1519 Lmax = 1;
1520 Fecode++;
1521 goto REPEATNOTCHAR;
1522
1523 case OP_NOTPOSUPTO:
1524 case OP_NOTPOSUPTOI:
1525 reptype = REPTYPE_POS;
1526 Lmin = 0;
1527 Lmax = GET2(Fecode, 1);
1528 Fecode += 1 + IMM2_SIZE;
1529 goto REPEATNOTCHAR;
1530
1531 case OP_NOTSTAR:
1532 case OP_NOTSTARI:
1533 case OP_NOTMINSTAR:
1534 case OP_NOTMINSTARI:
1535 case OP_NOTPLUS:
1536 case OP_NOTPLUSI:
1537 case OP_NOTMINPLUS:
1538 case OP_NOTMINPLUSI:
1539 case OP_NOTQUERY:
1540 case OP_NOTQUERYI:
1541 case OP_NOTMINQUERY:
1542 case OP_NOTMINQUERYI:
1543 fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1544 Lmin = rep_min[fc];
1545 Lmax = rep_max[fc];
1546 reptype = rep_typ[fc];
1547
1548 /* Common code for all repeated single-character non-matches. */
1549
1550 REPEATNOTCHAR:
1551 GETCHARINCTEST(Lc, Fecode);
1552
1553 /* The code is duplicated for the caseless and caseful cases, for speed,
1554 since matching characters is likely to be quite common. First, ensure the
1555 minimum number of matches are present. If Lmin = Lmax, we are done.
1556 Otherwise, if minimizing, keep trying the rest of the expression and
1557 advancing one matching character if failing, up to the maximum.
1558 Alternatively, if maximizing, find the maximum number of characters and
1559 work backwards. */
1560
1561 if (Fop >= OP_NOTSTARI) /* Caseless */
1562 {
1563#ifdef SUPPORT_UNICODE
1564 if ((utf || ucp) && Lc > 127)
1565 Loc = UCD_OTHERCASE(Lc);
1566 else
1567#endif /* SUPPORT_UNICODE */
1568
1569 Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
1570
1571#ifdef SUPPORT_UNICODE
1572 if (utf)
1573 {
1574 uint32_t d;
1575 for (i = 1; i <= Lmin; i++)
1576 {
1577 if (Feptr >= mb->end_subject)
1578 {
1579 SCHECK_PARTIAL();
1580 RRETURN(MATCH_NOMATCH);
1581 }
1582 GETCHARINC(d, Feptr);
1583 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1584 }
1585 }
1586 else
1587#endif /* SUPPORT_UNICODE */
1588
1589 /* Not UTF mode */
1590 {
1591 for (i = 1; i <= Lmin; i++)
1592 {
1593 if (Feptr >= mb->end_subject)
1594 {
1595 SCHECK_PARTIAL();
1596 RRETURN(MATCH_NOMATCH);
1597 }
1598 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1599 Feptr++;
1600 }
1601 }
1602
1603 if (Lmin == Lmax) continue; /* Finished for exact count */
1604
1605 if (reptype == REPTYPE_MIN)
1606 {
1607#ifdef SUPPORT_UNICODE
1608 if (utf)
1609 {
1610 uint32_t d;
1611 for (;;)
1612 {
1613 RMATCH(Fecode, RM204);
1614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1615 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1616 if (Feptr >= mb->end_subject)
1617 {
1618 SCHECK_PARTIAL();
1619 RRETURN(MATCH_NOMATCH);
1620 }
1621 GETCHARINC(d, Feptr);
1622 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1623 }
1624 }
1625 else
1626#endif /*SUPPORT_UNICODE */
1627
1628 /* Not UTF mode */
1629 {
1630 for (;;)
1631 {
1632 RMATCH(Fecode, RM29);
1633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1634 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1635 if (Feptr >= mb->end_subject)
1636 {
1637 SCHECK_PARTIAL();
1638 RRETURN(MATCH_NOMATCH);
1639 }
1640 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1641 Feptr++;
1642 }
1643 }
1644 /* Control never gets here */
1645 }
1646
1647 /* Maximize case */
1648
1649 else
1650 {
1651 Lstart_eptr = Feptr;
1652
1653#ifdef SUPPORT_UNICODE
1654 if (utf)
1655 {
1656 uint32_t d;
1657 for (i = Lmin; i < Lmax; i++)
1658 {
1659 int len = 1;
1660 if (Feptr >= mb->end_subject)
1661 {
1662 SCHECK_PARTIAL();
1663 break;
1664 }
1665 GETCHARLEN(d, Feptr, len);
1666 if (Lc == d || Loc == d) break;
1667 Feptr += len;
1668 }
1669
1670 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1671 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1672 go too far. */
1673
1674 if (reptype != REPTYPE_POS) for(;;)
1675 {
1676 if (Feptr <= Lstart_eptr) break;
1677 RMATCH(Fecode, RM205);
1678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1679 Feptr--;
1680 BACKCHAR(Feptr);
1681 }
1682 }
1683 else
1684#endif /* SUPPORT_UNICODE */
1685
1686 /* Not UTF mode */
1687 {
1688 for (i = Lmin; i < Lmax; i++)
1689 {
1690 if (Feptr >= mb->end_subject)
1691 {
1692 SCHECK_PARTIAL();
1693 break;
1694 }
1695 if (Lc == *Feptr || Loc == *Feptr) break;
1696 Feptr++;
1697 }
1698 if (reptype != REPTYPE_POS) for (;;)
1699 {
1700 if (Feptr == Lstart_eptr) break;
1701 RMATCH(Fecode, RM30);
1702 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1703 Feptr--;
1704 }
1705 }
1706 }
1707 }
1708
1709 /* Caseful comparisons */
1710
1711 else
1712 {
1713#ifdef SUPPORT_UNICODE
1714 if (utf)
1715 {
1716 uint32_t d;
1717 for (i = 1; i <= Lmin; i++)
1718 {
1719 if (Feptr >= mb->end_subject)
1720 {
1721 SCHECK_PARTIAL();
1722 RRETURN(MATCH_NOMATCH);
1723 }
1724 GETCHARINC(d, Feptr);
1725 if (Lc == d) RRETURN(MATCH_NOMATCH);
1726 }
1727 }
1728 else
1729#endif
1730 /* Not UTF mode */
1731 {
1732 for (i = 1; i <= Lmin; i++)
1733 {
1734 if (Feptr >= mb->end_subject)
1735 {
1736 SCHECK_PARTIAL();
1737 RRETURN(MATCH_NOMATCH);
1738 }
1739 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1740 }
1741 }
1742
1743 if (Lmin == Lmax) continue;
1744
1745 if (reptype == REPTYPE_MIN)
1746 {
1747#ifdef SUPPORT_UNICODE
1748 if (utf)
1749 {
1750 uint32_t d;
1751 for (;;)
1752 {
1753 RMATCH(Fecode, RM206);
1754 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1755 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1756 if (Feptr >= mb->end_subject)
1757 {
1758 SCHECK_PARTIAL();
1759 RRETURN(MATCH_NOMATCH);
1760 }
1761 GETCHARINC(d, Feptr);
1762 if (Lc == d) RRETURN(MATCH_NOMATCH);
1763 }
1764 }
1765 else
1766#endif
1767 /* Not UTF mode */
1768 {
1769 for (;;)
1770 {
1771 RMATCH(Fecode, RM31);
1772 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1773 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1774 if (Feptr >= mb->end_subject)
1775 {
1776 SCHECK_PARTIAL();
1777 RRETURN(MATCH_NOMATCH);
1778 }
1779 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1780 }
1781 }
1782 /* Control never gets here */
1783 }
1784
1785 /* Maximize case */
1786
1787 else
1788 {
1789 Lstart_eptr = Feptr;
1790
1791#ifdef SUPPORT_UNICODE
1792 if (utf)
1793 {
1794 uint32_t d;
1795 for (i = Lmin; i < Lmax; i++)
1796 {
1797 int len = 1;
1798 if (Feptr >= mb->end_subject)
1799 {
1800 SCHECK_PARTIAL();
1801 break;
1802 }
1803 GETCHARLEN(d, Feptr, len);
1804 if (Lc == d) break;
1805 Feptr += len;
1806 }
1807
1808 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1809 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1810 go too far. */
1811
1812 if (reptype != REPTYPE_POS) for(;;)
1813 {
1814 if (Feptr <= Lstart_eptr) break;
1815 RMATCH(Fecode, RM207);
1816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1817 Feptr--;
1818 BACKCHAR(Feptr);
1819 }
1820 }
1821 else
1822#endif
1823 /* Not UTF mode */
1824 {
1825 for (i = Lmin; i < Lmax; i++)
1826 {
1827 if (Feptr >= mb->end_subject)
1828 {
1829 SCHECK_PARTIAL();
1830 break;
1831 }
1832 if (Lc == *Feptr) break;
1833 Feptr++;
1834 }
1835 if (reptype != REPTYPE_POS) for (;;)
1836 {
1837 if (Feptr == Lstart_eptr) break;
1838 RMATCH(Fecode, RM32);
1839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840 Feptr--;
1841 }
1842 }
1843 }
1844 }
1845 break;
1846
1847#undef Lstart_eptr
1848#undef Lmin
1849#undef Lmax
1850#undef Lc
1851#undef Loc
1852
1853
1854 /* ===================================================================== */
1855 /* Match a bit-mapped character class, possibly repeatedly. These opcodes
1856 are used when all the characters in the class have values in the range
1857 0-255, and either the matching is caseful, or the characters are in the
1858 range 0-127 when UTF processing is enabled. The only difference between
1859 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1860 encountered. */
1861
/* Local aliases for fields of the current frame F, used by OP_CLASS and
OP_NCLASS and removed by the matching #undef lines after that code.

  Lmin, Lmax         repeat bounds; Lmax is UINT32_MAX for an unlimited repeat
  Lstart_eptr        subject position at which a maximizing scan started
  Lbyte_map_address  pointer to the class bit map that follows the opcode in
                     the compiled pattern (32 bytes, one bit per value 0-255)
  Lbyte_map          the same address viewed as unsigned char, so that bit
                     (fc & 7) of byte fc/8 can be tested in any code-unit
                     width */

1862#define Lmin F->temp_32[0]
1863#define Lmax F->temp_32[1]
1864#define Lstart_eptr F->temp_sptr[0]
1865#define Lbyte_map_address F->temp_sptr[1]
1866#define Lbyte_map ((unsigned char *)Lbyte_map_address)
1867
1868 case OP_NCLASS:
1869 case OP_CLASS:
1870 {
1871 Lbyte_map_address = Fecode + 1; /* Save for matching */
1872 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1873
1874 /* Look past the end of the item to see if there is repeat information
1875 following. Then obey similar code to character type repeats. */
1876
1877 switch (*Fecode)
1878 {
1879 case OP_CRSTAR:
1880 case OP_CRMINSTAR:
1881 case OP_CRPLUS:
1882 case OP_CRMINPLUS:
1883 case OP_CRQUERY:
1884 case OP_CRMINQUERY:
1885 case OP_CRPOSSTAR:
1886 case OP_CRPOSPLUS:
1887 case OP_CRPOSQUERY:
1888 fc = *Fecode++ - OP_CRSTAR;
1889 Lmin = rep_min[fc];
1890 Lmax = rep_max[fc];
1891 reptype = rep_typ[fc];
1892 break;
1893
1894 case OP_CRRANGE:
1895 case OP_CRMINRANGE:
1896 case OP_CRPOSRANGE:
1897 Lmin = GET2(Fecode, 1);
1898 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1899 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1900 reptype = rep_typ[*Fecode - OP_CRSTAR];
1901 Fecode += 1 + 2 * IMM2_SIZE;
1902 break;
1903
1904 default: /* No repeat follows */
1905 Lmin = Lmax = 1;
1906 break;
1907 }
1908
1909 /* First, ensure the minimum number of matches are present. */
1910
1911#ifdef SUPPORT_UNICODE
1912 if (utf)
1913 {
1914 for (i = 1; i <= Lmin; i++)
1915 {
1916 if (Feptr >= mb->end_subject)
1917 {
1918 SCHECK_PARTIAL();
1919 RRETURN(MATCH_NOMATCH);
1920 }
1921 GETCHARINC(fc, Feptr);
1922 if (fc > 255)
1923 {
1924 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1925 }
1926 else
1927 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1928 }
1929 }
1930 else
1931#endif
1932 /* Not UTF mode */
1933 {
1934 for (i = 1; i <= Lmin; i++)
1935 {
1936 if (Feptr >= mb->end_subject)
1937 {
1938 SCHECK_PARTIAL();
1939 RRETURN(MATCH_NOMATCH);
1940 }
1941 fc = *Feptr++;
1942#if PCRE2_CODE_UNIT_WIDTH != 8
1943 if (fc > 255)
1944 {
1945 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1946 }
1947 else
1948#endif
1949 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1950 }
1951 }
1952
1953 /* If Lmax == Lmin we are done. Continue with main loop. */
1954
1955 if (Lmin == Lmax) continue;
1956
1957 /* If minimizing, keep testing the rest of the expression and advancing
1958 the pointer while it matches the class. */
1959
1960 if (reptype == REPTYPE_MIN)
1961 {
1962#ifdef SUPPORT_UNICODE
1963 if (utf)
1964 {
1965 for (;;)
1966 {
1967 RMATCH(Fecode, RM200);
1968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1969 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1970 if (Feptr >= mb->end_subject)
1971 {
1972 SCHECK_PARTIAL();
1973 RRETURN(MATCH_NOMATCH);
1974 }
1975 GETCHARINC(fc, Feptr);
1976 if (fc > 255)
1977 {
1978 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1979 }
1980 else
1981 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1982 }
1983 }
1984 else
1985#endif
1986 /* Not UTF mode */
1987 {
1988 for (;;)
1989 {
1990 RMATCH(Fecode, RM23);
1991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1992 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1993 if (Feptr >= mb->end_subject)
1994 {
1995 SCHECK_PARTIAL();
1996 RRETURN(MATCH_NOMATCH);
1997 }
1998 fc = *Feptr++;
1999#if PCRE2_CODE_UNIT_WIDTH != 8
2000 if (fc > 255)
2001 {
2002 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2003 }
2004 else
2005#endif
2006 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2007 }
2008 }
2009 /* Control never gets here */
2010 }
2011
2012 /* If maximizing, find the longest possible run, then work backwards. */
2013
2014 else
2015 {
2016 Lstart_eptr = Feptr;
2017
2018#ifdef SUPPORT_UNICODE
2019 if (utf)
2020 {
2021 for (i = Lmin; i < Lmax; i++)
2022 {
2023 int len = 1;
2024 if (Feptr >= mb->end_subject)
2025 {
2026 SCHECK_PARTIAL();
2027 break;
2028 }
2029 GETCHARLEN(fc, Feptr, len);
2030 if (fc > 255)
2031 {
2032 if (Fop == OP_CLASS) break;
2033 }
2034 else
2035 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2036 Feptr += len;
2037 }
2038
2039 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2040
2041 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2042 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2043 go too far. */
2044
2045 for (;;)
2046 {
2047 RMATCH(Fecode, RM201);
2048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2049 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2050 BACKCHAR(Feptr);
2051 }
2052 }
2053 else
2054#endif
2055 /* Not UTF mode */
2056 {
2057 for (i = Lmin; i < Lmax; i++)
2058 {
2059 if (Feptr >= mb->end_subject)
2060 {
2061 SCHECK_PARTIAL();
2062 break;
2063 }
2064 fc = *Feptr;
2065#if PCRE2_CODE_UNIT_WIDTH != 8
2066 if (fc > 255)
2067 {
2068 if (Fop == OP_CLASS) break;
2069 }
2070 else
2071#endif
2072 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2073 Feptr++;
2074 }
2075
2076 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2077
2078 while (Feptr >= Lstart_eptr)
2079 {
2080 RMATCH(Fecode, RM24);
2081 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2082 Feptr--;
2083 }
2084 }
2085
2086 RRETURN(MATCH_NOMATCH);
2087 }
2088 }
2089 /* Control never gets here */
2090
2091#undef Lbyte_map_address
2092#undef Lbyte_map
2093#undef Lstart_eptr
2094#undef Lmin
2095#undef Lmax
2096
2097
2098 /* ===================================================================== */
2099 /* Match an extended character class. In the 8-bit library, this opcode is
2100 encountered only when UTF-8 mode is supported. In the 16-bit and
2101 32-bit libraries, codepoints greater than 255 may be encountered even when
2102 UTF is not supported. */
2103
/* Local aliases for fields of the current frame F, used by OP_XCLASS and
removed by the matching #undef lines after that code.

  Lstart_eptr   subject position at which a maximizing scan started;
                backtracking stops when Feptr gets back to it
  Lxclass_data  pointer to the class data that follows the opcode and its link
                field; this is what gets passed to PRIV(xclass)()
  Lmin, Lmax    repeat bounds; Lmax is UINT32_MAX for an unlimited repeat */

2104#define Lstart_eptr F->temp_sptr[0]
2105#define Lxclass_data F->temp_sptr[1]
2106#define Lmin F->temp_32[0]
2107#define Lmax F->temp_32[1]
2108
2109#ifdef SUPPORT_WIDE_CHARS
2110 case OP_XCLASS:
2111 {
2112 Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2113 Fecode += GET(Fecode, 1); /* Advance past the item */
2114
2115 switch (*Fecode)
2116 {
2117 case OP_CRSTAR:
2118 case OP_CRMINSTAR:
2119 case OP_CRPLUS:
2120 case OP_CRMINPLUS:
2121 case OP_CRQUERY:
2122 case OP_CRMINQUERY:
2123 case OP_CRPOSSTAR:
2124 case OP_CRPOSPLUS:
2125 case OP_CRPOSQUERY:
2126 fc = *Fecode++ - OP_CRSTAR;
2127 Lmin = rep_min[fc];
2128 Lmax = rep_max[fc];
2129 reptype = rep_typ[fc];
2130 break;
2131
2132 case OP_CRRANGE:
2133 case OP_CRMINRANGE:
2134 case OP_CRPOSRANGE:
2135 Lmin = GET2(Fecode, 1);
2136 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2137 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2138 reptype = rep_typ[*Fecode - OP_CRSTAR];
2139 Fecode += 1 + 2 * IMM2_SIZE;
2140 break;
2141
2142 default: /* No repeat follows */
2143 Lmin = Lmax = 1;
2144 break;
2145 }
2146
2147 /* First, ensure the minimum number of matches are present. */
2148
2149 for (i = 1; i <= Lmin; i++)
2150 {
2151 if (Feptr >= mb->end_subject)
2152 {
2153 SCHECK_PARTIAL();
2154 RRETURN(MATCH_NOMATCH);
2155 }
2156 GETCHARINCTEST(fc, Feptr);
2157 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2158 }
2159
2160 /* If Lmax == Lmin we can just continue with the main loop. */
2161
2162 if (Lmin == Lmax) continue;
2163
2164 /* If minimizing, keep testing the rest of the expression and advancing
2165 the pointer while it matches the class. */
2166
2167 if (reptype == REPTYPE_MIN)
2168 {
2169 for (;;)
2170 {
2171 RMATCH(Fecode, RM100);
2172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2173 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2174 if (Feptr >= mb->end_subject)
2175 {
2176 SCHECK_PARTIAL();
2177 RRETURN(MATCH_NOMATCH);
2178 }
2179 GETCHARINCTEST(fc, Feptr);
2180 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2181 }
2182 /* Control never gets here */
2183 }
2184
2185 /* If maximizing, find the longest possible run, then work backwards. */
2186
2187 else
2188 {
2189 Lstart_eptr = Feptr;
2190 for (i = Lmin; i < Lmax; i++)
2191 {
2192 int len = 1;
2193 if (Feptr >= mb->end_subject)
2194 {
2195 SCHECK_PARTIAL();
2196 break;
2197 }
2198#ifdef SUPPORT_UNICODE
2199 GETCHARLENTEST(fc, Feptr, len);
2200#else
2201 fc = *Feptr;
2202#endif
2203 if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
2204 Feptr += len;
2205 }
2206
2207 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2208
2209 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2210 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2211 go too far. */
2212
2213 for(;;)
2214 {
2215 RMATCH(Fecode, RM101);
2216 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2217 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2218#ifdef SUPPORT_UNICODE
2219 if (utf) BACKCHAR(Feptr);
2220#endif
2221 }
2222 RRETURN(MATCH_NOMATCH);
2223 }
2224
2225 /* Control never gets here */
2226 }
2227#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */
2228
2229#undef Lstart_eptr
2230#undef Lxclass_data
2231#undef Lmin
2232#undef Lmax
2233
2234
2235 /* ===================================================================== */
2236 /* Match various character types when PCRE2_UCP is not set. These opcodes
2237 are not generated when PCRE2_UCP is set - instead appropriate property
2238 tests are compiled. */
2239
2240 case OP_NOT_DIGIT:
2241 if (Feptr >= mb->end_subject)
2242 {
2243 SCHECK_PARTIAL();
2244 RRETURN(MATCH_NOMATCH);
2245 }
2246 GETCHARINCTEST(fc, Feptr);
2247 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2248 RRETURN(MATCH_NOMATCH);
2249 Fecode++;
2250 break;
2251
2252 case OP_DIGIT:
2253 if (Feptr >= mb->end_subject)
2254 {
2255 SCHECK_PARTIAL();
2256 RRETURN(MATCH_NOMATCH);
2257 }
2258 GETCHARINCTEST(fc, Feptr);
2259 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2260 RRETURN(MATCH_NOMATCH);
2261 Fecode++;
2262 break;
2263
2264 case OP_NOT_WHITESPACE:
2265 if (Feptr >= mb->end_subject)
2266 {
2267 SCHECK_PARTIAL();
2268 RRETURN(MATCH_NOMATCH);
2269 }
2270 GETCHARINCTEST(fc, Feptr);
2271 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2272 RRETURN(MATCH_NOMATCH);
2273 Fecode++;
2274 break;
2275
2276 case OP_WHITESPACE:
2277 if (Feptr >= mb->end_subject)
2278 {
2279 SCHECK_PARTIAL();
2280 RRETURN(MATCH_NOMATCH);
2281 }
2282 GETCHARINCTEST(fc, Feptr);
2283 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2284 RRETURN(MATCH_NOMATCH);
2285 Fecode++;
2286 break;
2287
2288 case OP_NOT_WORDCHAR:
2289 if (Feptr >= mb->end_subject)
2290 {
2291 SCHECK_PARTIAL();
2292 RRETURN(MATCH_NOMATCH);
2293 }
2294 GETCHARINCTEST(fc, Feptr);
2295 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2296 RRETURN(MATCH_NOMATCH);
2297 Fecode++;
2298 break;
2299
2300 case OP_WORDCHAR:
2301 if (Feptr >= mb->end_subject)
2302 {
2303 SCHECK_PARTIAL();
2304 RRETURN(MATCH_NOMATCH);
2305 }
2306 GETCHARINCTEST(fc, Feptr);
2307 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2308 RRETURN(MATCH_NOMATCH);
2309 Fecode++;
2310 break;
2311
2312 case OP_ANYNL:
2313 if (Feptr >= mb->end_subject)
2314 {
2315 SCHECK_PARTIAL();
2316 RRETURN(MATCH_NOMATCH);
2317 }
2318 GETCHARINCTEST(fc, Feptr);
2319 switch(fc)
2320 {
2321 default: RRETURN(MATCH_NOMATCH);
2322
2323 case CHAR_CR:
2324 if (Feptr >= mb->end_subject)
2325 {
2326 SCHECK_PARTIAL();
2327 }
2328 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2329 break;
2330
2331 case CHAR_LF:
2332 break;
2333
2334 case CHAR_VT:
2335 case CHAR_FF:
2336 case CHAR_NEL:
2337#ifndef EBCDIC
2338 case 0x2028:
2339 case 0x2029:
2340#endif /* Not EBCDIC */
2341 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2342 break;
2343 }
2344 Fecode++;
2345 break;
2346
2347 case OP_NOT_HSPACE:
2348 if (Feptr >= mb->end_subject)
2349 {
2350 SCHECK_PARTIAL();
2351 RRETURN(MATCH_NOMATCH);
2352 }
2353 GETCHARINCTEST(fc, Feptr);
2354 switch(fc)
2355 {
2356 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2357 default: break;
2358 }
2359 Fecode++;
2360 break;
2361
2362 case OP_HSPACE:
2363 if (Feptr >= mb->end_subject)
2364 {
2365 SCHECK_PARTIAL();
2366 RRETURN(MATCH_NOMATCH);
2367 }
2368 GETCHARINCTEST(fc, Feptr);
2369 switch(fc)
2370 {
2371 HSPACE_CASES: break; /* Byte and multibyte cases */
2372 default: RRETURN(MATCH_NOMATCH);
2373 }
2374 Fecode++;
2375 break;
2376
2377 case OP_NOT_VSPACE:
2378 if (Feptr >= mb->end_subject)
2379 {
2380 SCHECK_PARTIAL();
2381 RRETURN(MATCH_NOMATCH);
2382 }
2383 GETCHARINCTEST(fc, Feptr);
2384 switch(fc)
2385 {
2386 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2387 default: break;
2388 }
2389 Fecode++;
2390 break;
2391
2392 case OP_VSPACE:
2393 if (Feptr >= mb->end_subject)
2394 {
2395 SCHECK_PARTIAL();
2396 RRETURN(MATCH_NOMATCH);
2397 }
2398 GETCHARINCTEST(fc, Feptr);
2399 switch(fc)
2400 {
2401 VSPACE_CASES: break;
2402 default: RRETURN(MATCH_NOMATCH);
2403 }
2404 Fecode++;
2405 break;
2406
2407
2408#ifdef SUPPORT_UNICODE
2409
2410 /* ===================================================================== */
2411 /* Check the next character by Unicode property. We will get here only
2412 if the support is in the binary; otherwise a compile-time error occurs. */
2413
2414 case OP_PROP:
2415 case OP_NOTPROP:
2416 if (Feptr >= mb->end_subject)
2417 {
2418 SCHECK_PARTIAL();
2419 RRETURN(MATCH_NOMATCH);
2420 }
2421 GETCHARINCTEST(fc, Feptr);
2422 {
2423 const uint32_t *cp;
2424 const ucd_record *prop = GET_UCD(fc);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002425 BOOL notmatch = Fop == OP_NOTPROP;
Elliott Hughes5b808042021-10-01 10:56:10 -07002426
2427 switch(Fecode[1])
2428 {
2429 case PT_ANY:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002430 if (notmatch) RRETURN(MATCH_NOMATCH);
Elliott Hughes5b808042021-10-01 10:56:10 -07002431 break;
2432
2433 case PT_LAMP:
2434 if ((prop->chartype == ucp_Lu ||
2435 prop->chartype == ucp_Ll ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002436 prop->chartype == ucp_Lt) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002437 RRETURN(MATCH_NOMATCH);
2438 break;
2439
2440 case PT_GC:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002441 if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002442 RRETURN(MATCH_NOMATCH);
2443 break;
2444
2445 case PT_PC:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002446 if ((Fecode[2] == prop->chartype) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002447 RRETURN(MATCH_NOMATCH);
2448 break;
2449
2450 case PT_SC:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002451 if ((Fecode[2] == prop->script) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002452 RRETURN(MATCH_NOMATCH);
2453 break;
2454
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002455 case PT_SCX:
2456 {
2457 BOOL ok = (Fecode[2] == prop->script ||
2458 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);
2459 if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2460 }
2461 break;
2462
Elliott Hughes5b808042021-10-01 10:56:10 -07002463 /* These are specials */
2464
2465 case PT_ALNUM:
2466 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002467 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002468 RRETURN(MATCH_NOMATCH);
2469 break;
2470
2471 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2472 which means that Perl space and POSIX space are now identical. PCRE
2473 was changed at release 8.34. */
2474
2475 case PT_SPACE: /* Perl space */
2476 case PT_PXSPACE: /* POSIX space */
2477 switch(fc)
2478 {
2479 HSPACE_CASES:
2480 VSPACE_CASES:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002481 if (notmatch) RRETURN(MATCH_NOMATCH);
Elliott Hughes5b808042021-10-01 10:56:10 -07002482 break;
2483
2484 default:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002485 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)
2486 RRETURN(MATCH_NOMATCH);
Elliott Hughes5b808042021-10-01 10:56:10 -07002487 break;
2488 }
2489 break;
2490
2491 case PT_WORD:
2492 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2493 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002494 fc == CHAR_UNDERSCORE) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002495 RRETURN(MATCH_NOMATCH);
2496 break;
2497
2498 case PT_CLIST:
2499 cp = PRIV(ucd_caseless_sets) + Fecode[2];
2500 for (;;)
2501 {
2502 if (fc < *cp)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002503 { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }
Elliott Hughes5b808042021-10-01 10:56:10 -07002504 if (fc == *cp++)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002505 { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }
Elliott Hughes5b808042021-10-01 10:56:10 -07002506 }
2507 break;
2508
2509 case PT_UCNC:
2510 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2511 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002512 fc >= 0xe000) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002513 RRETURN(MATCH_NOMATCH);
2514 break;
2515
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002516 case PT_BIDICL:
2517 if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)
2518 RRETURN(MATCH_NOMATCH);
2519 break;
2520
2521 case PT_BOOL:
2522 {
2523 BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2524 UCD_BPROPS_PROP(prop), Fecode[2]) != 0;
2525 if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2526 }
2527 break;
2528
Elliott Hughes5b808042021-10-01 10:56:10 -07002529 /* This should never occur */
2530
2531 default:
2532 return PCRE2_ERROR_INTERNAL;
2533 }
2534
2535 Fecode += 3;
2536 }
2537 break;
2538
2539
2540 /* ===================================================================== */
2541 /* Match an extended Unicode sequence. We will get here only if the support
2542 is in the binary; otherwise a compile-time error occurs. */
2543
2544 case OP_EXTUNI:
2545 if (Feptr >= mb->end_subject)
2546 {
2547 SCHECK_PARTIAL();
2548 RRETURN(MATCH_NOMATCH);
2549 }
2550 else
2551 {
2552 GETCHARINCTEST(fc, Feptr);
2553 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2554 NULL);
2555 }
2556 CHECK_PARTIAL();
2557 Fecode++;
2558 break;
2559
2560#endif /* SUPPORT_UNICODE */
2561
2562
2563 /* ===================================================================== */
2564 /* Match a single character type repeatedly. Note that the property type
2565 does not need to be in a stack frame as it is not used within an RMATCH()
2566 loop. */
2567
2568#define Lstart_eptr F->temp_sptr[0]
2569#define Lmin F->temp_32[0]
2570#define Lmax F->temp_32[1]
2571#define Lctype F->temp_32[2]
2572#define Lpropvalue F->temp_32[3]
2573
2574 case OP_TYPEEXACT:
2575 Lmin = Lmax = GET2(Fecode, 1);
2576 Fecode += 1 + IMM2_SIZE;
2577 goto REPEATTYPE;
2578
2579 case OP_TYPEUPTO:
2580 case OP_TYPEMINUPTO:
2581 Lmin = 0;
2582 Lmax = GET2(Fecode, 1);
2583 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2584 Fecode += 1 + IMM2_SIZE;
2585 goto REPEATTYPE;
2586
2587 case OP_TYPEPOSSTAR:
2588 reptype = REPTYPE_POS;
2589 Lmin = 0;
2590 Lmax = UINT32_MAX;
2591 Fecode++;
2592 goto REPEATTYPE;
2593
2594 case OP_TYPEPOSPLUS:
2595 reptype = REPTYPE_POS;
2596 Lmin = 1;
2597 Lmax = UINT32_MAX;
2598 Fecode++;
2599 goto REPEATTYPE;
2600
2601 case OP_TYPEPOSQUERY:
2602 reptype = REPTYPE_POS;
2603 Lmin = 0;
2604 Lmax = 1;
2605 Fecode++;
2606 goto REPEATTYPE;
2607
2608 case OP_TYPEPOSUPTO:
2609 reptype = REPTYPE_POS;
2610 Lmin = 0;
2611 Lmax = GET2(Fecode, 1);
2612 Fecode += 1 + IMM2_SIZE;
2613 goto REPEATTYPE;
2614
2615 case OP_TYPESTAR:
2616 case OP_TYPEMINSTAR:
2617 case OP_TYPEPLUS:
2618 case OP_TYPEMINPLUS:
2619 case OP_TYPEQUERY:
2620 case OP_TYPEMINQUERY:
2621 fc = *Fecode++ - OP_TYPESTAR;
2622 Lmin = rep_min[fc];
2623 Lmax = rep_max[fc];
2624 reptype = rep_typ[fc];
2625
2626 /* Common code for all repeated character type matches. */
2627
2628 REPEATTYPE:
2629 Lctype = *Fecode++; /* Code for the character type */
2630
2631#ifdef SUPPORT_UNICODE
2632 if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2633 {
2634 proptype = *Fecode++;
2635 Lpropvalue = *Fecode++;
2636 }
2637 else proptype = -1;
2638#endif
2639
2640 /* First, ensure the minimum number of matches are present. Use inline
2641 code for maximizing the speed, and do the type test once at the start
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002642 (i.e. keep it out of the loops). As there are no calls to RMATCH in the
2643 loops, we can use an ordinary variable for "notmatch". The code for UTF
2644 mode is separated out for tidiness, except for Unicode property tests. */
Elliott Hughes5b808042021-10-01 10:56:10 -07002645
2646 if (Lmin > 0)
2647 {
2648#ifdef SUPPORT_UNICODE
2649 if (proptype >= 0) /* Property tests in all modes */
2650 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002651 BOOL notmatch = Lctype == OP_NOTPROP;
Elliott Hughes5b808042021-10-01 10:56:10 -07002652 switch(proptype)
2653 {
2654 case PT_ANY:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002655 if (notmatch) RRETURN(MATCH_NOMATCH);
Elliott Hughes5b808042021-10-01 10:56:10 -07002656 for (i = 1; i <= Lmin; i++)
2657 {
2658 if (Feptr >= mb->end_subject)
2659 {
2660 SCHECK_PARTIAL();
2661 RRETURN(MATCH_NOMATCH);
2662 }
2663 GETCHARINCTEST(fc, Feptr);
2664 }
2665 break;
2666
2667 case PT_LAMP:
2668 for (i = 1; i <= Lmin; i++)
2669 {
2670 int chartype;
2671 if (Feptr >= mb->end_subject)
2672 {
2673 SCHECK_PARTIAL();
2674 RRETURN(MATCH_NOMATCH);
2675 }
2676 GETCHARINCTEST(fc, Feptr);
2677 chartype = UCD_CHARTYPE(fc);
2678 if ((chartype == ucp_Lu ||
2679 chartype == ucp_Ll ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002680 chartype == ucp_Lt) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002681 RRETURN(MATCH_NOMATCH);
2682 }
2683 break;
2684
2685 case PT_GC:
2686 for (i = 1; i <= Lmin; i++)
2687 {
2688 if (Feptr >= mb->end_subject)
2689 {
2690 SCHECK_PARTIAL();
2691 RRETURN(MATCH_NOMATCH);
2692 }
2693 GETCHARINCTEST(fc, Feptr);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002694 if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002695 RRETURN(MATCH_NOMATCH);
2696 }
2697 break;
2698
2699 case PT_PC:
2700 for (i = 1; i <= Lmin; i++)
2701 {
2702 if (Feptr >= mb->end_subject)
2703 {
2704 SCHECK_PARTIAL();
2705 RRETURN(MATCH_NOMATCH);
2706 }
2707 GETCHARINCTEST(fc, Feptr);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002708 if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002709 RRETURN(MATCH_NOMATCH);
2710 }
2711 break;
2712
2713 case PT_SC:
2714 for (i = 1; i <= Lmin; i++)
2715 {
2716 if (Feptr >= mb->end_subject)
2717 {
2718 SCHECK_PARTIAL();
2719 RRETURN(MATCH_NOMATCH);
2720 }
2721 GETCHARINCTEST(fc, Feptr);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002722 if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)
2723 RRETURN(MATCH_NOMATCH);
2724 }
2725 break;
2726
2727 case PT_SCX:
2728 for (i = 1; i <= Lmin; i++)
2729 {
2730 BOOL ok;
2731 const ucd_record *prop;
2732 if (Feptr >= mb->end_subject)
2733 {
2734 SCHECK_PARTIAL();
2735 RRETURN(MATCH_NOMATCH);
2736 }
2737 GETCHARINCTEST(fc, Feptr);
2738 prop = GET_UCD(fc);
2739 ok = (prop->script == Lpropvalue ||
2740 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
2741 if (ok == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002742 RRETURN(MATCH_NOMATCH);
2743 }
2744 break;
2745
2746 case PT_ALNUM:
2747 for (i = 1; i <= Lmin; i++)
2748 {
2749 int category;
2750 if (Feptr >= mb->end_subject)
2751 {
2752 SCHECK_PARTIAL();
2753 RRETURN(MATCH_NOMATCH);
2754 }
2755 GETCHARINCTEST(fc, Feptr);
2756 category = UCD_CATEGORY(fc);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002757 if ((category == ucp_L || category == ucp_N) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002758 RRETURN(MATCH_NOMATCH);
2759 }
2760 break;
2761
2762 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2763 which means that Perl space and POSIX space are now identical. PCRE
2764 was changed at release 8.34. */
2765
2766 case PT_SPACE: /* Perl space */
2767 case PT_PXSPACE: /* POSIX space */
2768 for (i = 1; i <= Lmin; i++)
2769 {
2770 if (Feptr >= mb->end_subject)
2771 {
2772 SCHECK_PARTIAL();
2773 RRETURN(MATCH_NOMATCH);
2774 }
2775 GETCHARINCTEST(fc, Feptr);
2776 switch(fc)
2777 {
2778 HSPACE_CASES:
2779 VSPACE_CASES:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002780 if (notmatch) RRETURN(MATCH_NOMATCH);
Elliott Hughes5b808042021-10-01 10:56:10 -07002781 break;
2782
2783 default:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002784 if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002785 RRETURN(MATCH_NOMATCH);
2786 break;
2787 }
2788 }
2789 break;
2790
2791 case PT_WORD:
2792 for (i = 1; i <= Lmin; i++)
2793 {
2794 int category;
2795 if (Feptr >= mb->end_subject)
2796 {
2797 SCHECK_PARTIAL();
2798 RRETURN(MATCH_NOMATCH);
2799 }
2800 GETCHARINCTEST(fc, Feptr);
2801 category = UCD_CATEGORY(fc);
2802 if ((category == ucp_L || category == ucp_N ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002803 fc == CHAR_UNDERSCORE) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002804 RRETURN(MATCH_NOMATCH);
2805 }
2806 break;
2807
2808 case PT_CLIST:
2809 for (i = 1; i <= Lmin; i++)
2810 {
2811 const uint32_t *cp;
2812 if (Feptr >= mb->end_subject)
2813 {
2814 SCHECK_PARTIAL();
2815 RRETURN(MATCH_NOMATCH);
2816 }
2817 GETCHARINCTEST(fc, Feptr);
2818 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
2819 for (;;)
2820 {
2821 if (fc < *cp)
2822 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002823 if (notmatch) break;
Elliott Hughes5b808042021-10-01 10:56:10 -07002824 RRETURN(MATCH_NOMATCH);
2825 }
2826 if (fc == *cp++)
2827 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002828 if (notmatch) RRETURN(MATCH_NOMATCH);
Elliott Hughes5b808042021-10-01 10:56:10 -07002829 break;
2830 }
2831 }
2832 }
2833 break;
2834
2835 case PT_UCNC:
2836 for (i = 1; i <= Lmin; i++)
2837 {
2838 if (Feptr >= mb->end_subject)
2839 {
2840 SCHECK_PARTIAL();
2841 RRETURN(MATCH_NOMATCH);
2842 }
2843 GETCHARINCTEST(fc, Feptr);
2844 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2845 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002846 fc >= 0xe000) == notmatch)
2847 RRETURN(MATCH_NOMATCH);
2848 }
2849 break;
2850
2851 case PT_BIDICL:
2852 for (i = 1; i <= Lmin; i++)
2853 {
2854 if (Feptr >= mb->end_subject)
2855 {
2856 SCHECK_PARTIAL();
2857 RRETURN(MATCH_NOMATCH);
2858 }
2859 GETCHARINCTEST(fc, Feptr);
2860 if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)
2861 RRETURN(MATCH_NOMATCH);
2862 }
2863 break;
2864
2865 case PT_BOOL:
2866 for (i = 1; i <= Lmin; i++)
2867 {
2868 BOOL ok;
2869 const ucd_record *prop;
2870 if (Feptr >= mb->end_subject)
2871 {
2872 SCHECK_PARTIAL();
2873 RRETURN(MATCH_NOMATCH);
2874 }
2875 GETCHARINCTEST(fc, Feptr);
2876 prop = GET_UCD(fc);
2877 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2878 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
2879 if (ok == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07002880 RRETURN(MATCH_NOMATCH);
2881 }
2882 break;
2883
2884 /* This should not occur */
2885
2886 default:
2887 return PCRE2_ERROR_INTERNAL;
2888 }
2889 }
2890
2891 /* Match extended Unicode sequences. We will get here only if the
2892 support is in the binary; otherwise a compile-time error occurs. */
2893
2894 else if (Lctype == OP_EXTUNI)
2895 {
2896 for (i = 1; i <= Lmin; i++)
2897 {
2898 if (Feptr >= mb->end_subject)
2899 {
2900 SCHECK_PARTIAL();
2901 RRETURN(MATCH_NOMATCH);
2902 }
2903 else
2904 {
2905 GETCHARINCTEST(fc, Feptr);
2906 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
2907 mb->end_subject, utf, NULL);
2908 }
2909 CHECK_PARTIAL();
2910 }
2911 }
2912 else
2913#endif /* SUPPORT_UNICODE */
2914
2915/* Handle all other cases in UTF mode */
2916
2917#ifdef SUPPORT_UNICODE
2918 if (utf) switch(Lctype)
2919 {
2920 case OP_ANY:
2921 for (i = 1; i <= Lmin; i++)
2922 {
2923 if (Feptr >= mb->end_subject)
2924 {
2925 SCHECK_PARTIAL();
2926 RRETURN(MATCH_NOMATCH);
2927 }
2928 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
2929 if (mb->partial != 0 &&
2930 Feptr + 1 >= mb->end_subject &&
2931 NLBLOCK->nltype == NLTYPE_FIXED &&
2932 NLBLOCK->nllen == 2 &&
2933 UCHAR21(Feptr) == NLBLOCK->nl[0])
2934 {
2935 mb->hitend = TRUE;
2936 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
2937 }
2938 Feptr++;
2939 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2940 }
2941 break;
2942
2943 case OP_ALLANY:
2944 for (i = 1; i <= Lmin; i++)
2945 {
2946 if (Feptr >= mb->end_subject)
2947 {
2948 SCHECK_PARTIAL();
2949 RRETURN(MATCH_NOMATCH);
2950 }
2951 Feptr++;
2952 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2953 }
2954 break;
2955
2956 case OP_ANYBYTE:
2957 if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
2958 Feptr += Lmin;
2959 break;
2960
2961 case OP_ANYNL:
2962 for (i = 1; i <= Lmin; i++)
2963 {
2964 if (Feptr >= mb->end_subject)
2965 {
2966 SCHECK_PARTIAL();
2967 RRETURN(MATCH_NOMATCH);
2968 }
2969 GETCHARINC(fc, Feptr);
2970 switch(fc)
2971 {
2972 default: RRETURN(MATCH_NOMATCH);
2973
2974 case CHAR_CR:
2975 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
2976 break;
2977
2978 case CHAR_LF:
2979 break;
2980
2981 case CHAR_VT:
2982 case CHAR_FF:
2983 case CHAR_NEL:
2984#ifndef EBCDIC
2985 case 0x2028:
2986 case 0x2029:
2987#endif /* Not EBCDIC */
2988 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2989 break;
2990 }
2991 }
2992 break;
2993
2994 case OP_NOT_HSPACE:
2995 for (i = 1; i <= Lmin; i++)
2996 {
2997 if (Feptr >= mb->end_subject)
2998 {
2999 SCHECK_PARTIAL();
3000 RRETURN(MATCH_NOMATCH);
3001 }
3002 GETCHARINC(fc, Feptr);
3003 switch(fc)
3004 {
3005 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3006 default: break;
3007 }
3008 }
3009 break;
3010
3011 case OP_HSPACE:
3012 for (i = 1; i <= Lmin; i++)
3013 {
3014 if (Feptr >= mb->end_subject)
3015 {
3016 SCHECK_PARTIAL();
3017 RRETURN(MATCH_NOMATCH);
3018 }
3019 GETCHARINC(fc, Feptr);
3020 switch(fc)
3021 {
3022 HSPACE_CASES: break;
3023 default: RRETURN(MATCH_NOMATCH);
3024 }
3025 }
3026 break;
3027
3028 case OP_NOT_VSPACE:
3029 for (i = 1; i <= Lmin; i++)
3030 {
3031 if (Feptr >= mb->end_subject)
3032 {
3033 SCHECK_PARTIAL();
3034 RRETURN(MATCH_NOMATCH);
3035 }
3036 GETCHARINC(fc, Feptr);
3037 switch(fc)
3038 {
3039 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3040 default: break;
3041 }
3042 }
3043 break;
3044
3045 case OP_VSPACE:
3046 for (i = 1; i <= Lmin; i++)
3047 {
3048 if (Feptr >= mb->end_subject)
3049 {
3050 SCHECK_PARTIAL();
3051 RRETURN(MATCH_NOMATCH);
3052 }
3053 GETCHARINC(fc, Feptr);
3054 switch(fc)
3055 {
3056 VSPACE_CASES: break;
3057 default: RRETURN(MATCH_NOMATCH);
3058 }
3059 }
3060 break;
3061
3062 case OP_NOT_DIGIT:
3063 for (i = 1; i <= Lmin; i++)
3064 {
3065 if (Feptr >= mb->end_subject)
3066 {
3067 SCHECK_PARTIAL();
3068 RRETURN(MATCH_NOMATCH);
3069 }
3070 GETCHARINC(fc, Feptr);
3071 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
3072 RRETURN(MATCH_NOMATCH);
3073 }
3074 break;
3075
3076 case OP_DIGIT:
3077 for (i = 1; i <= Lmin; i++)
3078 {
3079 uint32_t cc;
3080 if (Feptr >= mb->end_subject)
3081 {
3082 SCHECK_PARTIAL();
3083 RRETURN(MATCH_NOMATCH);
3084 }
3085 cc = UCHAR21(Feptr);
3086 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
3087 RRETURN(MATCH_NOMATCH);
3088 Feptr++;
3089 /* No need to skip more code units - we know it has only one. */
3090 }
3091 break;
3092
3093 case OP_NOT_WHITESPACE:
3094 for (i = 1; i <= Lmin; i++)
3095 {
3096 uint32_t cc;
3097 if (Feptr >= mb->end_subject)
3098 {
3099 SCHECK_PARTIAL();
3100 RRETURN(MATCH_NOMATCH);
3101 }
3102 cc = UCHAR21(Feptr);
3103 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
3104 RRETURN(MATCH_NOMATCH);
3105 Feptr++;
3106 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3107 }
3108 break;
3109
3110 case OP_WHITESPACE:
3111 for (i = 1; i <= Lmin; i++)
3112 {
3113 uint32_t cc;
3114 if (Feptr >= mb->end_subject)
3115 {
3116 SCHECK_PARTIAL();
3117 RRETURN(MATCH_NOMATCH);
3118 }
3119 cc = UCHAR21(Feptr);
3120 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
3121 RRETURN(MATCH_NOMATCH);
3122 Feptr++;
3123 /* No need to skip more code units - we know it has only one. */
3124 }
3125 break;
3126
3127 case OP_NOT_WORDCHAR:
3128 for (i = 1; i <= Lmin; i++)
3129 {
3130 uint32_t cc;
3131 if (Feptr >= mb->end_subject)
3132 {
3133 SCHECK_PARTIAL();
3134 RRETURN(MATCH_NOMATCH);
3135 }
3136 cc = UCHAR21(Feptr);
3137 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
3138 RRETURN(MATCH_NOMATCH);
3139 Feptr++;
3140 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3141 }
3142 break;
3143
3144 case OP_WORDCHAR:
3145 for (i = 1; i <= Lmin; i++)
3146 {
3147 uint32_t cc;
3148 if (Feptr >= mb->end_subject)
3149 {
3150 SCHECK_PARTIAL();
3151 RRETURN(MATCH_NOMATCH);
3152 }
3153 cc = UCHAR21(Feptr);
3154 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3155 RRETURN(MATCH_NOMATCH);
3156 Feptr++;
3157 /* No need to skip more code units - we know it has only one. */
3158 }
3159 break;
3160
3161 default:
3162 return PCRE2_ERROR_INTERNAL;
3163 } /* End switch(Lctype) */
3164
3165 else
3166#endif /* SUPPORT_UNICODE */
3167
3168 /* Code for the non-UTF case for minimum matching of operators other
3169 than OP_PROP and OP_NOTPROP. */
3170
3171 switch(Lctype)
3172 {
3173 case OP_ANY:
3174 for (i = 1; i <= Lmin; i++)
3175 {
3176 if (Feptr >= mb->end_subject)
3177 {
3178 SCHECK_PARTIAL();
3179 RRETURN(MATCH_NOMATCH);
3180 }
3181 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3182 if (mb->partial != 0 &&
3183 Feptr + 1 >= mb->end_subject &&
3184 NLBLOCK->nltype == NLTYPE_FIXED &&
3185 NLBLOCK->nllen == 2 &&
3186 *Feptr == NLBLOCK->nl[0])
3187 {
3188 mb->hitend = TRUE;
3189 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3190 }
3191 Feptr++;
3192 }
3193 break;
3194
3195 case OP_ALLANY:
3196 if (Feptr > mb->end_subject - Lmin)
3197 {
3198 SCHECK_PARTIAL();
3199 RRETURN(MATCH_NOMATCH);
3200 }
3201 Feptr += Lmin;
3202 break;
3203
3204 /* This OP_ANYBYTE case will never be reached because \C gets turned
3205 into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
3206 reports don't complain about its never being used. */
3207
3208/* case OP_ANYBYTE:
3209* if (Feptr > mb->end_subject - Lmin)
3210* {
3211* SCHECK_PARTIAL();
3212* RRETURN(MATCH_NOMATCH);
3213* }
3214* Feptr += Lmin;
3215* break;
3216*/
3217 case OP_ANYNL:
3218 for (i = 1; i <= Lmin; i++)
3219 {
3220 if (Feptr >= mb->end_subject)
3221 {
3222 SCHECK_PARTIAL();
3223 RRETURN(MATCH_NOMATCH);
3224 }
3225 switch(*Feptr++)
3226 {
3227 default: RRETURN(MATCH_NOMATCH);
3228
3229 case CHAR_CR:
3230 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3231 break;
3232
3233 case CHAR_LF:
3234 break;
3235
3236 case CHAR_VT:
3237 case CHAR_FF:
3238 case CHAR_NEL:
3239#if PCRE2_CODE_UNIT_WIDTH != 8
3240 case 0x2028:
3241 case 0x2029:
3242#endif
3243 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3244 break;
3245 }
3246 }
3247 break;
3248
3249 case OP_NOT_HSPACE:
3250 for (i = 1; i <= Lmin; i++)
3251 {
3252 if (Feptr >= mb->end_subject)
3253 {
3254 SCHECK_PARTIAL();
3255 RRETURN(MATCH_NOMATCH);
3256 }
3257 switch(*Feptr++)
3258 {
3259 default: break;
3260 HSPACE_BYTE_CASES:
3261#if PCRE2_CODE_UNIT_WIDTH != 8
3262 HSPACE_MULTIBYTE_CASES:
3263#endif
3264 RRETURN(MATCH_NOMATCH);
3265 }
3266 }
3267 break;
3268
3269 case OP_HSPACE:
3270 for (i = 1; i <= Lmin; i++)
3271 {
3272 if (Feptr >= mb->end_subject)
3273 {
3274 SCHECK_PARTIAL();
3275 RRETURN(MATCH_NOMATCH);
3276 }
3277 switch(*Feptr++)
3278 {
3279 default: RRETURN(MATCH_NOMATCH);
3280 HSPACE_BYTE_CASES:
3281#if PCRE2_CODE_UNIT_WIDTH != 8
3282 HSPACE_MULTIBYTE_CASES:
3283#endif
3284 break;
3285 }
3286 }
3287 break;
3288
3289 case OP_NOT_VSPACE:
3290 for (i = 1; i <= Lmin; i++)
3291 {
3292 if (Feptr >= mb->end_subject)
3293 {
3294 SCHECK_PARTIAL();
3295 RRETURN(MATCH_NOMATCH);
3296 }
3297 switch(*Feptr++)
3298 {
3299 VSPACE_BYTE_CASES:
3300#if PCRE2_CODE_UNIT_WIDTH != 8
3301 VSPACE_MULTIBYTE_CASES:
3302#endif
3303 RRETURN(MATCH_NOMATCH);
3304 default: break;
3305 }
3306 }
3307 break;
3308
3309 case OP_VSPACE:
3310 for (i = 1; i <= Lmin; i++)
3311 {
3312 if (Feptr >= mb->end_subject)
3313 {
3314 SCHECK_PARTIAL();
3315 RRETURN(MATCH_NOMATCH);
3316 }
3317 switch(*Feptr++)
3318 {
3319 default: RRETURN(MATCH_NOMATCH);
3320 VSPACE_BYTE_CASES:
3321#if PCRE2_CODE_UNIT_WIDTH != 8
3322 VSPACE_MULTIBYTE_CASES:
3323#endif
3324 break;
3325 }
3326 }
3327 break;
3328
3329 case OP_NOT_DIGIT:
3330 for (i = 1; i <= Lmin; i++)
3331 {
3332 if (Feptr >= mb->end_subject)
3333 {
3334 SCHECK_PARTIAL();
3335 RRETURN(MATCH_NOMATCH);
3336 }
3337 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3338 RRETURN(MATCH_NOMATCH);
3339 Feptr++;
3340 }
3341 break;
3342
3343 case OP_DIGIT:
3344 for (i = 1; i <= Lmin; i++)
3345 {
3346 if (Feptr >= mb->end_subject)
3347 {
3348 SCHECK_PARTIAL();
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3352 RRETURN(MATCH_NOMATCH);
3353 Feptr++;
3354 }
3355 break;
3356
3357 case OP_NOT_WHITESPACE:
3358 for (i = 1; i <= Lmin; i++)
3359 {
3360 if (Feptr >= mb->end_subject)
3361 {
3362 SCHECK_PARTIAL();
3363 RRETURN(MATCH_NOMATCH);
3364 }
3365 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3366 RRETURN(MATCH_NOMATCH);
3367 Feptr++;
3368 }
3369 break;
3370
3371 case OP_WHITESPACE:
3372 for (i = 1; i <= Lmin; i++)
3373 {
3374 if (Feptr >= mb->end_subject)
3375 {
3376 SCHECK_PARTIAL();
3377 RRETURN(MATCH_NOMATCH);
3378 }
3379 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3380 RRETURN(MATCH_NOMATCH);
3381 Feptr++;
3382 }
3383 break;
3384
3385 case OP_NOT_WORDCHAR:
3386 for (i = 1; i <= Lmin; i++)
3387 {
3388 if (Feptr >= mb->end_subject)
3389 {
3390 SCHECK_PARTIAL();
3391 RRETURN(MATCH_NOMATCH);
3392 }
3393 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3394 RRETURN(MATCH_NOMATCH);
3395 Feptr++;
3396 }
3397 break;
3398
3399 case OP_WORDCHAR:
3400 for (i = 1; i <= Lmin; i++)
3401 {
3402 if (Feptr >= mb->end_subject)
3403 {
3404 SCHECK_PARTIAL();
3405 RRETURN(MATCH_NOMATCH);
3406 }
3407 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3408 RRETURN(MATCH_NOMATCH);
3409 Feptr++;
3410 }
3411 break;
3412
3413 default:
3414 return PCRE2_ERROR_INTERNAL;
3415 }
3416 }
3417
3418 /* If Lmin = Lmax we are done. Continue with the main loop. */
3419
3420 if (Lmin == Lmax) continue;
3421
3422 /* If minimizing, we have to test the rest of the pattern before each
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07003423 subsequent match. This means we cannot use a local "notmatch" variable as
3424 in the other cases. As all 4 temporary 32-bit values in the frame are
3425 already in use, just test the type each time. */
Elliott Hughes5b808042021-10-01 10:56:10 -07003426
3427 if (reptype == REPTYPE_MIN)
3428 {
3429#ifdef SUPPORT_UNICODE
3430 if (proptype >= 0)
3431 {
3432 switch(proptype)
3433 {
3434 case PT_ANY:
3435 for (;;)
3436 {
3437 RMATCH(Fecode, RM208);
3438 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3439 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3440 if (Feptr >= mb->end_subject)
3441 {
3442 SCHECK_PARTIAL();
3443 RRETURN(MATCH_NOMATCH);
3444 }
3445 GETCHARINCTEST(fc, Feptr);
3446 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3447 }
3448 /* Control never gets here */
3449
3450 case PT_LAMP:
3451 for (;;)
3452 {
3453 int chartype;
3454 RMATCH(Fecode, RM209);
3455 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3456 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3457 if (Feptr >= mb->end_subject)
3458 {
3459 SCHECK_PARTIAL();
3460 RRETURN(MATCH_NOMATCH);
3461 }
3462 GETCHARINCTEST(fc, Feptr);
3463 chartype = UCD_CHARTYPE(fc);
3464 if ((chartype == ucp_Lu ||
3465 chartype == ucp_Ll ||
3466 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3467 RRETURN(MATCH_NOMATCH);
3468 }
3469 /* Control never gets here */
3470
3471 case PT_GC:
3472 for (;;)
3473 {
3474 RMATCH(Fecode, RM210);
3475 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3476 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3477 if (Feptr >= mb->end_subject)
3478 {
3479 SCHECK_PARTIAL();
3480 RRETURN(MATCH_NOMATCH);
3481 }
3482 GETCHARINCTEST(fc, Feptr);
3483 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3484 RRETURN(MATCH_NOMATCH);
3485 }
3486 /* Control never gets here */
3487
3488 case PT_PC:
3489 for (;;)
3490 {
3491 RMATCH(Fecode, RM211);
3492 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3493 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3494 if (Feptr >= mb->end_subject)
3495 {
3496 SCHECK_PARTIAL();
3497 RRETURN(MATCH_NOMATCH);
3498 }
3499 GETCHARINCTEST(fc, Feptr);
3500 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3501 RRETURN(MATCH_NOMATCH);
3502 }
3503 /* Control never gets here */
3504
3505 case PT_SC:
3506 for (;;)
3507 {
3508 RMATCH(Fecode, RM212);
3509 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3510 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3511 if (Feptr >= mb->end_subject)
3512 {
3513 SCHECK_PARTIAL();
3514 RRETURN(MATCH_NOMATCH);
3515 }
3516 GETCHARINCTEST(fc, Feptr);
3517 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3518 RRETURN(MATCH_NOMATCH);
3519 }
3520 /* Control never gets here */
3521
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07003522 case PT_SCX:
3523 for (;;)
3524 {
3525 BOOL ok;
3526 const ucd_record *prop;
3527 RMATCH(Fecode, RM225);
3528 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3529 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3530 if (Feptr >= mb->end_subject)
3531 {
3532 SCHECK_PARTIAL();
3533 RRETURN(MATCH_NOMATCH);
3534 }
3535 GETCHARINCTEST(fc, Feptr);
3536 prop = GET_UCD(fc);
3537 ok = (prop->script == Lpropvalue
3538 || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
3539 if (ok == (Lctype == OP_NOTPROP))
3540 RRETURN(MATCH_NOMATCH);
3541 }
3542 /* Control never gets here */
3543
Elliott Hughes5b808042021-10-01 10:56:10 -07003544 case PT_ALNUM:
3545 for (;;)
3546 {
3547 int category;
3548 RMATCH(Fecode, RM213);
3549 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3550 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3551 if (Feptr >= mb->end_subject)
3552 {
3553 SCHECK_PARTIAL();
3554 RRETURN(MATCH_NOMATCH);
3555 }
3556 GETCHARINCTEST(fc, Feptr);
3557 category = UCD_CATEGORY(fc);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07003558 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
Elliott Hughes5b808042021-10-01 10:56:10 -07003559 RRETURN(MATCH_NOMATCH);
3560 }
3561 /* Control never gets here */
3562
3563 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3564 which means that Perl space and POSIX space are now identical. PCRE
3565 was changed at release 8.34. */
3566
3567 case PT_SPACE: /* Perl space */
3568 case PT_PXSPACE: /* POSIX space */
3569 for (;;)
3570 {
3571 RMATCH(Fecode, RM214);
3572 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3573 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3574 if (Feptr >= mb->end_subject)
3575 {
3576 SCHECK_PARTIAL();
3577 RRETURN(MATCH_NOMATCH);
3578 }
3579 GETCHARINCTEST(fc, Feptr);
3580 switch(fc)
3581 {
3582 HSPACE_CASES:
3583 VSPACE_CASES:
3584 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3585 break;
3586
3587 default:
3588 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3589 RRETURN(MATCH_NOMATCH);
3590 break;
3591 }
3592 }
3593 /* Control never gets here */
3594
3595 case PT_WORD:
3596 for (;;)
3597 {
3598 int category;
3599 RMATCH(Fecode, RM215);
3600 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3601 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3602 if (Feptr >= mb->end_subject)
3603 {
3604 SCHECK_PARTIAL();
3605 RRETURN(MATCH_NOMATCH);
3606 }
3607 GETCHARINCTEST(fc, Feptr);
3608 category = UCD_CATEGORY(fc);
3609 if ((category == ucp_L ||
3610 category == ucp_N ||
3611 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3612 RRETURN(MATCH_NOMATCH);
3613 }
3614 /* Control never gets here */
3615
3616 case PT_CLIST:
3617 for (;;)
3618 {
3619 const uint32_t *cp;
3620 RMATCH(Fecode, RM216);
3621 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3622 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3623 if (Feptr >= mb->end_subject)
3624 {
3625 SCHECK_PARTIAL();
3626 RRETURN(MATCH_NOMATCH);
3627 }
3628 GETCHARINCTEST(fc, Feptr);
3629 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3630 for (;;)
3631 {
3632 if (fc < *cp)
3633 {
3634 if (Lctype == OP_NOTPROP) break;
3635 RRETURN(MATCH_NOMATCH);
3636 }
3637 if (fc == *cp++)
3638 {
3639 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3640 break;
3641 }
3642 }
3643 }
3644 /* Control never gets here */
3645
3646 case PT_UCNC:
3647 for (;;)
3648 {
3649 RMATCH(Fecode, RM217);
3650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3651 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3652 if (Feptr >= mb->end_subject)
3653 {
3654 SCHECK_PARTIAL();
3655 RRETURN(MATCH_NOMATCH);
3656 }
3657 GETCHARINCTEST(fc, Feptr);
3658 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3659 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3660 fc >= 0xe000) == (Lctype == OP_NOTPROP))
3661 RRETURN(MATCH_NOMATCH);
3662 }
3663 /* Control never gets here */
3664
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07003665 case PT_BIDICL:
3666 for (;;)
3667 {
3668 RMATCH(Fecode, RM224);
3669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3670 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3671 if (Feptr >= mb->end_subject)
3672 {
3673 SCHECK_PARTIAL();
3674 RRETURN(MATCH_NOMATCH);
3675 }
3676 GETCHARINCTEST(fc, Feptr);
3677 if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3678 RRETURN(MATCH_NOMATCH);
3679 }
3680 /* Control never gets here */
3681
3682 case PT_BOOL:
3683 for (;;)
3684 {
3685 BOOL ok;
3686 const ucd_record *prop;
3687 RMATCH(Fecode, RM223);
3688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3689 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3690 if (Feptr >= mb->end_subject)
3691 {
3692 SCHECK_PARTIAL();
3693 RRETURN(MATCH_NOMATCH);
3694 }
3695 GETCHARINCTEST(fc, Feptr);
3696 prop = GET_UCD(fc);
3697 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
3698 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
3699 if (ok == (Lctype == OP_NOTPROP))
3700 RRETURN(MATCH_NOMATCH);
3701 }
3702 /* Control never gets here */
3703
Elliott Hughes5b808042021-10-01 10:56:10 -07003704 /* This should never occur */
3705 default:
3706 return PCRE2_ERROR_INTERNAL;
3707 }
3708 }
3709
3710 /* Match extended Unicode sequences. We will get here only if the
3711 support is in the binary; otherwise a compile-time error occurs. */
3712
3713 else if (Lctype == OP_EXTUNI)
3714 {
3715 for (;;)
3716 {
3717 RMATCH(Fecode, RM218);
3718 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3719 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3720 if (Feptr >= mb->end_subject)
3721 {
3722 SCHECK_PARTIAL();
3723 RRETURN(MATCH_NOMATCH);
3724 }
3725 else
3726 {
3727 GETCHARINCTEST(fc, Feptr);
3728 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3729 utf, NULL);
3730 }
3731 CHECK_PARTIAL();
3732 }
3733 }
3734 else
3735#endif /* SUPPORT_UNICODE */
3736
3737 /* UTF mode for non-property testing character types. */
3738
3739#ifdef SUPPORT_UNICODE
3740 if (utf)
3741 {
3742 for (;;)
3743 {
3744 RMATCH(Fecode, RM219);
3745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3746 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3747 if (Feptr >= mb->end_subject)
3748 {
3749 SCHECK_PARTIAL();
3750 RRETURN(MATCH_NOMATCH);
3751 }
3752 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3753 GETCHARINC(fc, Feptr);
3754 switch(Lctype)
3755 {
3756 case OP_ANY: /* This is the non-NL case */
3757 if (mb->partial != 0 && /* Take care with CRLF partial */
3758 Feptr >= mb->end_subject &&
3759 NLBLOCK->nltype == NLTYPE_FIXED &&
3760 NLBLOCK->nllen == 2 &&
3761 fc == NLBLOCK->nl[0])
3762 {
3763 mb->hitend = TRUE;
3764 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3765 }
3766 break;
3767
3768 case OP_ALLANY:
3769 case OP_ANYBYTE:
3770 break;
3771
3772 case OP_ANYNL:
3773 switch(fc)
3774 {
3775 default: RRETURN(MATCH_NOMATCH);
3776
3777 case CHAR_CR:
3778 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3779 break;
3780
3781 case CHAR_LF:
3782 break;
3783
3784 case CHAR_VT:
3785 case CHAR_FF:
3786 case CHAR_NEL:
3787#ifndef EBCDIC
3788 case 0x2028:
3789 case 0x2029:
3790#endif /* Not EBCDIC */
3791 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3792 RRETURN(MATCH_NOMATCH);
3793 break;
3794 }
3795 break;
3796
3797 case OP_NOT_HSPACE:
3798 switch(fc)
3799 {
3800 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3801 default: break;
3802 }
3803 break;
3804
3805 case OP_HSPACE:
3806 switch(fc)
3807 {
3808 HSPACE_CASES: break;
3809 default: RRETURN(MATCH_NOMATCH);
3810 }
3811 break;
3812
3813 case OP_NOT_VSPACE:
3814 switch(fc)
3815 {
3816 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3817 default: break;
3818 }
3819 break;
3820
3821 case OP_VSPACE:
3822 switch(fc)
3823 {
3824 VSPACE_CASES: break;
3825 default: RRETURN(MATCH_NOMATCH);
3826 }
3827 break;
3828
3829 case OP_NOT_DIGIT:
3830 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
3831 RRETURN(MATCH_NOMATCH);
3832 break;
3833
3834 case OP_DIGIT:
3835 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
3836 RRETURN(MATCH_NOMATCH);
3837 break;
3838
3839 case OP_NOT_WHITESPACE:
3840 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
3841 RRETURN(MATCH_NOMATCH);
3842 break;
3843
3844 case OP_WHITESPACE:
3845 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
3846 RRETURN(MATCH_NOMATCH);
3847 break;
3848
3849 case OP_NOT_WORDCHAR:
3850 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
3851 RRETURN(MATCH_NOMATCH);
3852 break;
3853
3854 case OP_WORDCHAR:
3855 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
3856 RRETURN(MATCH_NOMATCH);
3857 break;
3858
3859 default:
3860 return PCRE2_ERROR_INTERNAL;
3861 }
3862 }
3863 }
3864 else
3865#endif /* SUPPORT_UNICODE */
3866
3867 /* Not UTF mode */
3868 {
3869 for (;;)
3870 {
3871 RMATCH(Fecode, RM33);
3872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3873 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3874 if (Feptr >= mb->end_subject)
3875 {
3876 SCHECK_PARTIAL();
3877 RRETURN(MATCH_NOMATCH);
3878 }
3879 if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
3880 RRETURN(MATCH_NOMATCH);
3881 fc = *Feptr++;
3882 switch(Lctype)
3883 {
3884 case OP_ANY: /* This is the non-NL case */
3885 if (mb->partial != 0 && /* Take care with CRLF partial */
3886 Feptr >= mb->end_subject &&
3887 NLBLOCK->nltype == NLTYPE_FIXED &&
3888 NLBLOCK->nllen == 2 &&
3889 fc == NLBLOCK->nl[0])
3890 {
3891 mb->hitend = TRUE;
3892 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3893 }
3894 break;
3895
3896 case OP_ALLANY:
3897 case OP_ANYBYTE:
3898 break;
3899
3900 case OP_ANYNL:
3901 switch(fc)
3902 {
3903 default: RRETURN(MATCH_NOMATCH);
3904
3905 case CHAR_CR:
3906 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3907 break;
3908
3909 case CHAR_LF:
3910 break;
3911
3912 case CHAR_VT:
3913 case CHAR_FF:
3914 case CHAR_NEL:
3915#if PCRE2_CODE_UNIT_WIDTH != 8
3916 case 0x2028:
3917 case 0x2029:
3918#endif
3919 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3920 RRETURN(MATCH_NOMATCH);
3921 break;
3922 }
3923 break;
3924
3925 case OP_NOT_HSPACE:
3926 switch(fc)
3927 {
3928 default: break;
3929 HSPACE_BYTE_CASES:
3930#if PCRE2_CODE_UNIT_WIDTH != 8
3931 HSPACE_MULTIBYTE_CASES:
3932#endif
3933 RRETURN(MATCH_NOMATCH);
3934 }
3935 break;
3936
3937 case OP_HSPACE:
3938 switch(fc)
3939 {
3940 default: RRETURN(MATCH_NOMATCH);
3941 HSPACE_BYTE_CASES:
3942#if PCRE2_CODE_UNIT_WIDTH != 8
3943 HSPACE_MULTIBYTE_CASES:
3944#endif
3945 break;
3946 }
3947 break;
3948
3949 case OP_NOT_VSPACE:
3950 switch(fc)
3951 {
3952 default: break;
3953 VSPACE_BYTE_CASES:
3954#if PCRE2_CODE_UNIT_WIDTH != 8
3955 VSPACE_MULTIBYTE_CASES:
3956#endif
3957 RRETURN(MATCH_NOMATCH);
3958 }
3959 break;
3960
3961 case OP_VSPACE:
3962 switch(fc)
3963 {
3964 default: RRETURN(MATCH_NOMATCH);
3965 VSPACE_BYTE_CASES:
3966#if PCRE2_CODE_UNIT_WIDTH != 8
3967 VSPACE_MULTIBYTE_CASES:
3968#endif
3969 break;
3970 }
3971 break;
3972
3973 case OP_NOT_DIGIT:
3974 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
3975 RRETURN(MATCH_NOMATCH);
3976 break;
3977
3978 case OP_DIGIT:
3979 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
3980 RRETURN(MATCH_NOMATCH);
3981 break;
3982
3983 case OP_NOT_WHITESPACE:
3984 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
3985 RRETURN(MATCH_NOMATCH);
3986 break;
3987
3988 case OP_WHITESPACE:
3989 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
3990 RRETURN(MATCH_NOMATCH);
3991 break;
3992
3993 case OP_NOT_WORDCHAR:
3994 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
3995 RRETURN(MATCH_NOMATCH);
3996 break;
3997
3998 case OP_WORDCHAR:
3999 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
4000 RRETURN(MATCH_NOMATCH);
4001 break;
4002
4003 default:
4004 return PCRE2_ERROR_INTERNAL;
4005 }
4006 }
4007 }
4008 /* Control never gets here */
4009 }
4010
4011 /* If maximizing, it is worth using inline code for speed, doing the type
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004012 test once at the start (i.e. keep it out of the loops). Once again,
4013 "notmatch" can be an ordinary local variable because the loops do not call
4014 RMATCH. */
Elliott Hughes5b808042021-10-01 10:56:10 -07004015
4016 else
4017 {
4018 Lstart_eptr = Feptr; /* Remember where we started */
4019
4020#ifdef SUPPORT_UNICODE
4021 if (proptype >= 0)
4022 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004023 BOOL notmatch = Lctype == OP_NOTPROP;
Elliott Hughes5b808042021-10-01 10:56:10 -07004024 switch(proptype)
4025 {
4026 case PT_ANY:
4027 for (i = Lmin; i < Lmax; i++)
4028 {
4029 int len = 1;
4030 if (Feptr >= mb->end_subject)
4031 {
4032 SCHECK_PARTIAL();
4033 break;
4034 }
4035 GETCHARLENTEST(fc, Feptr, len);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004036 if (notmatch) break;
Elliott Hughes5b808042021-10-01 10:56:10 -07004037 Feptr+= len;
4038 }
4039 break;
4040
4041 case PT_LAMP:
4042 for (i = Lmin; i < Lmax; i++)
4043 {
4044 int chartype;
4045 int len = 1;
4046 if (Feptr >= mb->end_subject)
4047 {
4048 SCHECK_PARTIAL();
4049 break;
4050 }
4051 GETCHARLENTEST(fc, Feptr, len);
4052 chartype = UCD_CHARTYPE(fc);
4053 if ((chartype == ucp_Lu ||
4054 chartype == ucp_Ll ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004055 chartype == ucp_Lt) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07004056 break;
4057 Feptr+= len;
4058 }
4059 break;
4060
4061 case PT_GC:
4062 for (i = Lmin; i < Lmax; i++)
4063 {
4064 int len = 1;
4065 if (Feptr >= mb->end_subject)
4066 {
4067 SCHECK_PARTIAL();
4068 break;
4069 }
4070 GETCHARLENTEST(fc, Feptr, len);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004071 if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;
Elliott Hughes5b808042021-10-01 10:56:10 -07004072 Feptr+= len;
4073 }
4074 break;
4075
4076 case PT_PC:
4077 for (i = Lmin; i < Lmax; i++)
4078 {
4079 int len = 1;
4080 if (Feptr >= mb->end_subject)
4081 {
4082 SCHECK_PARTIAL();
4083 break;
4084 }
4085 GETCHARLENTEST(fc, Feptr, len);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004086 if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;
Elliott Hughes5b808042021-10-01 10:56:10 -07004087 Feptr+= len;
4088 }
4089 break;
4090
4091 case PT_SC:
4092 for (i = Lmin; i < Lmax; i++)
4093 {
4094 int len = 1;
4095 if (Feptr >= mb->end_subject)
4096 {
4097 SCHECK_PARTIAL();
4098 break;
4099 }
4100 GETCHARLENTEST(fc, Feptr, len);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004101 if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;
4102 Feptr+= len;
4103 }
4104 break;
4105
4106 case PT_SCX:
4107 for (i = Lmin; i < Lmax; i++)
4108 {
4109 BOOL ok;
4110 const ucd_record *prop;
4111 int len = 1;
4112 if (Feptr >= mb->end_subject)
4113 {
4114 SCHECK_PARTIAL();
Elliott Hughes5b808042021-10-01 10:56:10 -07004115 break;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004116 }
4117 GETCHARLENTEST(fc, Feptr, len);
4118 prop = GET_UCD(fc);
4119 ok = (prop->script == Lpropvalue ||
4120 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
4121 if (ok == notmatch) break;
Elliott Hughes5b808042021-10-01 10:56:10 -07004122 Feptr+= len;
4123 }
4124 break;
4125
4126 case PT_ALNUM:
4127 for (i = Lmin; i < Lmax; i++)
4128 {
4129 int category;
4130 int len = 1;
4131 if (Feptr >= mb->end_subject)
4132 {
4133 SCHECK_PARTIAL();
4134 break;
4135 }
4136 GETCHARLENTEST(fc, Feptr, len);
4137 category = UCD_CATEGORY(fc);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004138 if ((category == ucp_L || category == ucp_N) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07004139 break;
4140 Feptr+= len;
4141 }
4142 break;
4143
4144 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4145 which means that Perl space and POSIX space are now identical. PCRE
4146 was changed at release 8.34. */
4147
4148 case PT_SPACE: /* Perl space */
4149 case PT_PXSPACE: /* POSIX space */
4150 for (i = Lmin; i < Lmax; i++)
4151 {
4152 int len = 1;
4153 if (Feptr >= mb->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 break;
4157 }
4158 GETCHARLENTEST(fc, Feptr, len);
4159 switch(fc)
4160 {
4161 HSPACE_CASES:
4162 VSPACE_CASES:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004163 if (notmatch) goto ENDLOOP99; /* Break the loop */
Elliott Hughes5b808042021-10-01 10:56:10 -07004164 break;
4165
4166 default:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004167 if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07004168 goto ENDLOOP99; /* Break the loop */
4169 break;
4170 }
4171 Feptr+= len;
4172 }
4173 ENDLOOP99:
4174 break;
4175
4176 case PT_WORD:
4177 for (i = Lmin; i < Lmax; i++)
4178 {
4179 int category;
4180 int len = 1;
4181 if (Feptr >= mb->end_subject)
4182 {
4183 SCHECK_PARTIAL();
4184 break;
4185 }
4186 GETCHARLENTEST(fc, Feptr, len);
4187 category = UCD_CATEGORY(fc);
4188 if ((category == ucp_L || category == ucp_N ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004189 fc == CHAR_UNDERSCORE) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07004190 break;
4191 Feptr+= len;
4192 }
4193 break;
4194
4195 case PT_CLIST:
4196 for (i = Lmin; i < Lmax; i++)
4197 {
4198 const uint32_t *cp;
4199 int len = 1;
4200 if (Feptr >= mb->end_subject)
4201 {
4202 SCHECK_PARTIAL();
4203 break;
4204 }
4205 GETCHARLENTEST(fc, Feptr, len);
4206 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
4207 for (;;)
4208 {
4209 if (fc < *cp)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004210 { if (notmatch) break; else goto GOT_MAX; }
Elliott Hughes5b808042021-10-01 10:56:10 -07004211 if (fc == *cp++)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004212 { if (notmatch) goto GOT_MAX; else break; }
Elliott Hughes5b808042021-10-01 10:56:10 -07004213 }
4214 Feptr += len;
4215 }
4216 GOT_MAX:
4217 break;
4218
4219 case PT_UCNC:
4220 for (i = Lmin; i < Lmax; i++)
4221 {
4222 int len = 1;
4223 if (Feptr >= mb->end_subject)
4224 {
4225 SCHECK_PARTIAL();
4226 break;
4227 }
4228 GETCHARLENTEST(fc, Feptr, len);
4229 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
4230 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004231 fc >= 0xe000) == notmatch)
Elliott Hughes5b808042021-10-01 10:56:10 -07004232 break;
4233 Feptr += len;
4234 }
4235 break;
4236
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07004237 case PT_BIDICL:
4238 for (i = Lmin; i < Lmax; i++)
4239 {
4240 int len = 1;
4241 if (Feptr >= mb->end_subject)
4242 {
4243 SCHECK_PARTIAL();
4244 break;
4245 }
4246 GETCHARLENTEST(fc, Feptr, len);
4247 if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;
4248 Feptr+= len;
4249 }
4250 break;
4251
4252 case PT_BOOL:
4253 for (i = Lmin; i < Lmax; i++)
4254 {
4255 BOOL ok;
4256 const ucd_record *prop;
4257 int len = 1;
4258 if (Feptr >= mb->end_subject)
4259 {
4260 SCHECK_PARTIAL();
4261 break;
4262 }
4263 GETCHARLENTEST(fc, Feptr, len);
4264 prop = GET_UCD(fc);
4265 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
4266 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
4267 if (ok == notmatch) break;
4268 Feptr+= len;
4269 }
4270 break;
4271
Elliott Hughes5b808042021-10-01 10:56:10 -07004272 default:
4273 return PCRE2_ERROR_INTERNAL;
4274 }
4275
4276 /* Feptr is now past the end of the maximum run */
4277
4278 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4279
4280 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4281 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4282 go too far. */
4283
4284 for(;;)
4285 {
4286 if (Feptr <= Lstart_eptr) break;
4287 RMATCH(Fecode, RM222);
4288 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4289 Feptr--;
4290 if (utf) BACKCHAR(Feptr);
4291 }
4292 }
4293
4294 /* Match extended Unicode grapheme clusters. We will get here only if the
4295 support is in the binary; otherwise a compile-time error occurs. */
4296
4297 else if (Lctype == OP_EXTUNI)
4298 {
4299 for (i = Lmin; i < Lmax; i++)
4300 {
4301 if (Feptr >= mb->end_subject)
4302 {
4303 SCHECK_PARTIAL();
4304 break;
4305 }
4306 else
4307 {
4308 GETCHARINCTEST(fc, Feptr);
4309 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4310 utf, NULL);
4311 }
4312 CHECK_PARTIAL();
4313 }
4314
4315 /* Feptr is now past the end of the maximum run */
4316
4317 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4318
4319 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4320 of the run while backtracking because the use of \C in UTF mode can
4321 cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4322 the use of \C in UTF mode is fraught with danger. */
4323
4324 for(;;)
4325 {
4326 int lgb, rgb;
4327 PCRE2_SPTR fptr;
4328
4329 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4330 RMATCH(Fecode, RM220);
4331 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4332
4333 /* Backtracking over an extended grapheme cluster involves inspecting
4334 the previous two characters (if present) to see if a break is
4335 permitted between them. */
4336
4337 Feptr--;
4338 if (!utf) fc = *Feptr; else
4339 {
4340 BACKCHAR(Feptr);
4341 GETCHAR(fc, Feptr);
4342 }
4343 rgb = UCD_GRAPHBREAK(fc);
4344
4345 for (;;)
4346 {
4347 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4348 fptr = Feptr - 1;
4349 if (!utf) fc = *fptr; else
4350 {
4351 BACKCHAR(fptr);
4352 GETCHAR(fc, fptr);
4353 }
4354 lgb = UCD_GRAPHBREAK(fc);
4355 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4356 Feptr = fptr;
4357 rgb = lgb;
4358 }
4359 }
4360 }
4361
4362 else
4363#endif /* SUPPORT_UNICODE */
4364
4365#ifdef SUPPORT_UNICODE
4366 if (utf)
4367 {
4368 switch(Lctype)
4369 {
4370 case OP_ANY:
4371 for (i = Lmin; i < Lmax; i++)
4372 {
4373 if (Feptr >= mb->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 break;
4377 }
4378 if (IS_NEWLINE(Feptr)) break;
4379 if (mb->partial != 0 && /* Take care with CRLF partial */
4380 Feptr + 1 >= mb->end_subject &&
4381 NLBLOCK->nltype == NLTYPE_FIXED &&
4382 NLBLOCK->nllen == 2 &&
4383 UCHAR21(Feptr) == NLBLOCK->nl[0])
4384 {
4385 mb->hitend = TRUE;
4386 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4387 }
4388 Feptr++;
4389 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4390 }
4391 break;
4392
4393 case OP_ALLANY:
4394 if (Lmax < UINT32_MAX)
4395 {
4396 for (i = Lmin; i < Lmax; i++)
4397 {
4398 if (Feptr >= mb->end_subject)
4399 {
4400 SCHECK_PARTIAL();
4401 break;
4402 }
4403 Feptr++;
4404 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4405 }
4406 }
4407 else
4408 {
4409 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4410 SCHECK_PARTIAL();
4411 }
4412 break;
4413
4414 /* The "byte" (i.e. "code unit") case is the same as non-UTF */
4415
4416 case OP_ANYBYTE:
4417 fc = Lmax - Lmin;
4418 if (fc > (uint32_t)(mb->end_subject - Feptr))
4419 {
4420 Feptr = mb->end_subject;
4421 SCHECK_PARTIAL();
4422 }
4423 else Feptr += fc;
4424 break;
4425
4426 case OP_ANYNL:
4427 for (i = Lmin; i < Lmax; i++)
4428 {
4429 int len = 1;
4430 if (Feptr >= mb->end_subject)
4431 {
4432 SCHECK_PARTIAL();
4433 break;
4434 }
4435 GETCHARLEN(fc, Feptr, len);
4436 if (fc == CHAR_CR)
4437 {
4438 if (++Feptr >= mb->end_subject) break;
4439 if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4440 }
4441 else
4442 {
4443 if (fc != CHAR_LF &&
4444 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4445 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4446#ifndef EBCDIC
4447 && fc != 0x2028 && fc != 0x2029
4448#endif /* Not EBCDIC */
4449 )))
4450 break;
4451 Feptr += len;
4452 }
4453 }
4454 break;
4455
4456 case OP_NOT_HSPACE:
4457 case OP_HSPACE:
4458 for (i = Lmin; i < Lmax; i++)
4459 {
4460 BOOL gotspace;
4461 int len = 1;
4462 if (Feptr >= mb->end_subject)
4463 {
4464 SCHECK_PARTIAL();
4465 break;
4466 }
4467 GETCHARLEN(fc, Feptr, len);
4468 switch(fc)
4469 {
4470 HSPACE_CASES: gotspace = TRUE; break;
4471 default: gotspace = FALSE; break;
4472 }
4473 if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4474 Feptr += len;
4475 }
4476 break;
4477
4478 case OP_NOT_VSPACE:
4479 case OP_VSPACE:
4480 for (i = Lmin; i < Lmax; i++)
4481 {
4482 BOOL gotspace;
4483 int len = 1;
4484 if (Feptr >= mb->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 break;
4488 }
4489 GETCHARLEN(fc, Feptr, len);
4490 switch(fc)
4491 {
4492 VSPACE_CASES: gotspace = TRUE; break;
4493 default: gotspace = FALSE; break;
4494 }
4495 if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4496 Feptr += len;
4497 }
4498 break;
4499
4500 case OP_NOT_DIGIT:
4501 for (i = Lmin; i < Lmax; i++)
4502 {
4503 int len = 1;
4504 if (Feptr >= mb->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 break;
4508 }
4509 GETCHARLEN(fc, Feptr, len);
4510 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4511 Feptr+= len;
4512 }
4513 break;
4514
4515 case OP_DIGIT:
4516 for (i = Lmin; i < Lmax; i++)
4517 {
4518 int len = 1;
4519 if (Feptr >= mb->end_subject)
4520 {
4521 SCHECK_PARTIAL();
4522 break;
4523 }
4524 GETCHARLEN(fc, Feptr, len);
4525 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4526 Feptr+= len;
4527 }
4528 break;
4529
4530 case OP_NOT_WHITESPACE:
4531 for (i = Lmin; i < Lmax; i++)
4532 {
4533 int len = 1;
4534 if (Feptr >= mb->end_subject)
4535 {
4536 SCHECK_PARTIAL();
4537 break;
4538 }
4539 GETCHARLEN(fc, Feptr, len);
4540 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4541 Feptr+= len;
4542 }
4543 break;
4544
4545 case OP_WHITESPACE:
4546 for (i = Lmin; i < Lmax; i++)
4547 {
4548 int len = 1;
4549 if (Feptr >= mb->end_subject)
4550 {
4551 SCHECK_PARTIAL();
4552 break;
4553 }
4554 GETCHARLEN(fc, Feptr, len);
4555 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4556 Feptr+= len;
4557 }
4558 break;
4559
4560 case OP_NOT_WORDCHAR:
4561 for (i = Lmin; i < Lmax; i++)
4562 {
4563 int len = 1;
4564 if (Feptr >= mb->end_subject)
4565 {
4566 SCHECK_PARTIAL();
4567 break;
4568 }
4569 GETCHARLEN(fc, Feptr, len);
4570 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4571 Feptr+= len;
4572 }
4573 break;
4574
4575 case OP_WORDCHAR:
4576 for (i = Lmin; i < Lmax; i++)
4577 {
4578 int len = 1;
4579 if (Feptr >= mb->end_subject)
4580 {
4581 SCHECK_PARTIAL();
4582 break;
4583 }
4584 GETCHARLEN(fc, Feptr, len);
4585 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4586 Feptr+= len;
4587 }
4588 break;
4589
4590 default:
4591 return PCRE2_ERROR_INTERNAL;
4592 }
4593
4594 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4595
4596 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4597 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4598 too far. */
4599
4600 for(;;)
4601 {
4602 if (Feptr <= Lstart_eptr) break;
4603 RMATCH(Fecode, RM221);
4604 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4605 Feptr--;
4606 BACKCHAR(Feptr);
4607 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4608 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4609 Feptr--;
4610 }
4611 }
4612 else
4613#endif /* SUPPORT_UNICODE */
4614
4615 /* Not UTF mode */
4616 {
4617 switch(Lctype)
4618 {
4619 case OP_ANY:
4620 for (i = Lmin; i < Lmax; i++)
4621 {
4622 if (Feptr >= mb->end_subject)
4623 {
4624 SCHECK_PARTIAL();
4625 break;
4626 }
4627 if (IS_NEWLINE(Feptr)) break;
4628 if (mb->partial != 0 && /* Take care with CRLF partial */
4629 Feptr + 1 >= mb->end_subject &&
4630 NLBLOCK->nltype == NLTYPE_FIXED &&
4631 NLBLOCK->nllen == 2 &&
4632 *Feptr == NLBLOCK->nl[0])
4633 {
4634 mb->hitend = TRUE;
4635 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4636 }
4637 Feptr++;
4638 }
4639 break;
4640
4641 case OP_ALLANY:
4642 case OP_ANYBYTE:
4643 fc = Lmax - Lmin;
4644 if (fc > (uint32_t)(mb->end_subject - Feptr))
4645 {
4646 Feptr = mb->end_subject;
4647 SCHECK_PARTIAL();
4648 }
4649 else Feptr += fc;
4650 break;
4651
4652 case OP_ANYNL:
4653 for (i = Lmin; i < Lmax; i++)
4654 {
4655 if (Feptr >= mb->end_subject)
4656 {
4657 SCHECK_PARTIAL();
4658 break;
4659 }
4660 fc = *Feptr;
4661 if (fc == CHAR_CR)
4662 {
4663 if (++Feptr >= mb->end_subject) break;
4664 if (*Feptr == CHAR_LF) Feptr++;
4665 }
4666 else
4667 {
4668 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4669 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4670#if PCRE2_CODE_UNIT_WIDTH != 8
4671 && fc != 0x2028 && fc != 0x2029
4672#endif
4673 ))) break;
4674 Feptr++;
4675 }
4676 }
4677 break;
4678
4679 case OP_NOT_HSPACE:
4680 for (i = Lmin; i < Lmax; i++)
4681 {
4682 if (Feptr >= mb->end_subject)
4683 {
4684 SCHECK_PARTIAL();
4685 break;
4686 }
4687 switch(*Feptr)
4688 {
4689 default: Feptr++; break;
4690 HSPACE_BYTE_CASES:
4691#if PCRE2_CODE_UNIT_WIDTH != 8
4692 HSPACE_MULTIBYTE_CASES:
4693#endif
4694 goto ENDLOOP00;
4695 }
4696 }
4697 ENDLOOP00:
4698 break;
4699
4700 case OP_HSPACE:
4701 for (i = Lmin; i < Lmax; i++)
4702 {
4703 if (Feptr >= mb->end_subject)
4704 {
4705 SCHECK_PARTIAL();
4706 break;
4707 }
4708 switch(*Feptr)
4709 {
4710 default: goto ENDLOOP01;
4711 HSPACE_BYTE_CASES:
4712#if PCRE2_CODE_UNIT_WIDTH != 8
4713 HSPACE_MULTIBYTE_CASES:
4714#endif
4715 Feptr++; break;
4716 }
4717 }
4718 ENDLOOP01:
4719 break;
4720
4721 case OP_NOT_VSPACE:
4722 for (i = Lmin; i < Lmax; i++)
4723 {
4724 if (Feptr >= mb->end_subject)
4725 {
4726 SCHECK_PARTIAL();
4727 break;
4728 }
4729 switch(*Feptr)
4730 {
4731 default: Feptr++; break;
4732 VSPACE_BYTE_CASES:
4733#if PCRE2_CODE_UNIT_WIDTH != 8
4734 VSPACE_MULTIBYTE_CASES:
4735#endif
4736 goto ENDLOOP02;
4737 }
4738 }
4739 ENDLOOP02:
4740 break;
4741
4742 case OP_VSPACE:
4743 for (i = Lmin; i < Lmax; i++)
4744 {
4745 if (Feptr >= mb->end_subject)
4746 {
4747 SCHECK_PARTIAL();
4748 break;
4749 }
4750 switch(*Feptr)
4751 {
4752 default: goto ENDLOOP03;
4753 VSPACE_BYTE_CASES:
4754#if PCRE2_CODE_UNIT_WIDTH != 8
4755 VSPACE_MULTIBYTE_CASES:
4756#endif
4757 Feptr++; break;
4758 }
4759 }
4760 ENDLOOP03:
4761 break;
4762
4763 case OP_NOT_DIGIT:
4764 for (i = Lmin; i < Lmax; i++)
4765 {
4766 if (Feptr >= mb->end_subject)
4767 {
4768 SCHECK_PARTIAL();
4769 break;
4770 }
4771 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
4772 break;
4773 Feptr++;
4774 }
4775 break;
4776
4777 case OP_DIGIT:
4778 for (i = Lmin; i < Lmax; i++)
4779 {
4780 if (Feptr >= mb->end_subject)
4781 {
4782 SCHECK_PARTIAL();
4783 break;
4784 }
4785 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
4786 break;
4787 Feptr++;
4788 }
4789 break;
4790
4791 case OP_NOT_WHITESPACE:
4792 for (i = Lmin; i < Lmax; i++)
4793 {
4794 if (Feptr >= mb->end_subject)
4795 {
4796 SCHECK_PARTIAL();
4797 break;
4798 }
4799 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
4800 break;
4801 Feptr++;
4802 }
4803 break;
4804
4805 case OP_WHITESPACE:
4806 for (i = Lmin; i < Lmax; i++)
4807 {
4808 if (Feptr >= mb->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 break;
4812 }
4813 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
4814 break;
4815 Feptr++;
4816 }
4817 break;
4818
4819 case OP_NOT_WORDCHAR:
4820 for (i = Lmin; i < Lmax; i++)
4821 {
4822 if (Feptr >= mb->end_subject)
4823 {
4824 SCHECK_PARTIAL();
4825 break;
4826 }
4827 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
4828 break;
4829 Feptr++;
4830 }
4831 break;
4832
4833 case OP_WORDCHAR:
4834 for (i = Lmin; i < Lmax; i++)
4835 {
4836 if (Feptr >= mb->end_subject)
4837 {
4838 SCHECK_PARTIAL();
4839 break;
4840 }
4841 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
4842 break;
4843 Feptr++;
4844 }
4845 break;
4846
4847 default:
4848 return PCRE2_ERROR_INTERNAL;
4849 }
4850
4851 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4852
4853 for (;;)
4854 {
4855 if (Feptr == Lstart_eptr) break;
4856 RMATCH(Fecode, RM34);
4857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4858 Feptr--;
4859 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
4860 Feptr[-1] == CHAR_CR) Feptr--;
4861 }
4862 }
4863 }
4864 break; /* End of repeat character type processing */
4865
4866#undef Lstart_eptr
4867#undef Lmin
4868#undef Lmax
4869#undef Lctype
4870#undef Lpropvalue
4871
4872
4873 /* ===================================================================== */
4874 /* Match a back reference, possibly repeatedly. Look past the end of the
4875 item to see if there is repeat information following. The OP_REF and
4876 OP_REFI opcodes are used for a reference to a numbered group or to a
4877 non-duplicated named group. For a duplicated named group, OP_DNREF and
4878 OP_DNREFI are used. In this case we must scan the list of groups to which
4879 the name refers, and use the first one that is set. */
4880
4881#define Lmin F->temp_32[0]
4882#define Lmax F->temp_32[1]
4883#define Lcaseless F->temp_32[2]
4884#define Lstart F->temp_sptr[0]
4885#define Loffset F->temp_size
4886
4887 case OP_DNREF:
4888 case OP_DNREFI:
4889 Lcaseless = (Fop == OP_DNREFI);
4890 {
4891 int count = GET2(Fecode, 1+IMM2_SIZE);
4892 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
4893 Fecode += 1 + 2*IMM2_SIZE;
4894
4895 while (count-- > 0)
4896 {
4897 Loffset = (GET2(slot, 0) << 1) - 2;
4898 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
4899 slot += mb->name_entry_size;
4900 }
4901 }
4902 goto REF_REPEAT;
4903
4904 case OP_REF:
4905 case OP_REFI:
4906 Lcaseless = (Fop == OP_REFI);
4907 Loffset = (GET2(Fecode, 1) << 1) - 2;
4908 Fecode += 1 + IMM2_SIZE;
4909
4910 /* Set up for repetition, or handle the non-repeated case. The maximum and
4911 minimum must be in the heap frame, but as they are short-term values, we
4912 use temporary fields. */
4913
4914 REF_REPEAT:
4915 switch (*Fecode)
4916 {
4917 case OP_CRSTAR:
4918 case OP_CRMINSTAR:
4919 case OP_CRPLUS:
4920 case OP_CRMINPLUS:
4921 case OP_CRQUERY:
4922 case OP_CRMINQUERY:
4923 fc = *Fecode++ - OP_CRSTAR;
4924 Lmin = rep_min[fc];
4925 Lmax = rep_max[fc];
4926 reptype = rep_typ[fc];
4927 break;
4928
4929 case OP_CRRANGE:
4930 case OP_CRMINRANGE:
4931 Lmin = GET2(Fecode, 1);
4932 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
4933 reptype = rep_typ[*Fecode - OP_CRSTAR];
4934 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
4935 Fecode += 1 + 2 * IMM2_SIZE;
4936 break;
4937
4938 default: /* No repeat follows */
4939 {
4940 rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
4941 if (rrc != 0)
4942 {
4943 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4944 CHECK_PARTIAL();
4945 RRETURN(MATCH_NOMATCH);
4946 }
4947 }
4948 Feptr += length;
4949 continue; /* With the main loop */
4950 }
4951
4952 /* Handle repeated back references. If a set group has length zero, just
4953 continue with the main loop, because it matches however many times. For an
4954 unset reference, if the minimum is zero, we can also just continue. We can
4955 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset
4956 group behave as a zero-length group. For any other unset case, carrying
4957 on will result in NOMATCH. */
4958
4959 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
4960 {
4961 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
4962 }
4963 else /* Group is not set */
4964 {
4965 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
4966 continue;
4967 }
4968
4969 /* First, ensure the minimum number of matches are present. */
4970
4971 for (i = 1; i <= Lmin; i++)
4972 {
4973 PCRE2_SIZE slength;
4974 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4975 if (rrc != 0)
4976 {
4977 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4978 CHECK_PARTIAL();
4979 RRETURN(MATCH_NOMATCH);
4980 }
4981 Feptr += slength;
4982 }
4983
4984 /* If min = max, we are done. They are not both allowed to be zero. */
4985
4986 if (Lmin == Lmax) continue;
4987
4988 /* If minimizing, keep trying and advancing the pointer. */
4989
4990 if (reptype == REPTYPE_MIN)
4991 {
4992 for (;;)
4993 {
4994 PCRE2_SIZE slength;
4995 RMATCH(Fecode, RM20);
4996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4997 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4998 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4999 if (rrc != 0)
5000 {
5001 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5002 CHECK_PARTIAL();
5003 RRETURN(MATCH_NOMATCH);
5004 }
5005 Feptr += slength;
5006 }
5007 /* Control never gets here */
5008 }
5009
5010 /* If maximizing, find the longest string and work backwards, as long as
5011 the matched lengths for each iteration are the same. */
5012
5013 else
5014 {
5015 BOOL samelengths = TRUE;
5016 Lstart = Feptr; /* Starting position */
5017 Flength = Fovector[Loffset+1] - Fovector[Loffset];
5018
5019 for (i = Lmin; i < Lmax; i++)
5020 {
5021 PCRE2_SIZE slength;
5022 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5023 if (rrc != 0)
5024 {
5025 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
5026 the soft partial matching case. */
5027
5028 if (rrc > 0 && mb->partial != 0 &&
5029 mb->end_subject > mb->start_used_ptr)
5030 {
5031 mb->hitend = TRUE;
5032 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5033 }
5034 break;
5035 }
5036
5037 if (slength != Flength) samelengths = FALSE;
5038 Feptr += slength;
5039 }
5040
5041 /* If the length matched for each repetition is the same as the length of
5042 the captured group, we can easily work backwards. This is the normal
5043 case. In caseless UTF-8 mode, however, there are pairs of case-equivalent
5044 characters whose lengths (in terms of code units) differ. Because this
5045 is very rare, we handle it by re-matching fewer and fewer times. */
5046
5047 if (samelengths)
5048 {
5049 while (Feptr >= Lstart)
5050 {
5051 RMATCH(Fecode, RM21);
5052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5053 Feptr -= Flength;
5054 }
5055 }
5056
5057 /* The rare case of non-matching lengths. Re-scan the repetition for each
5058 iteration. We know that match_ref() will succeed every time. */
5059
5060 else
5061 {
5062 Lmax = i;
5063 for (;;)
5064 {
5065 RMATCH(Fecode, RM22);
5066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5067 if (Feptr == Lstart) break; /* Failed after minimal repetition */
5068 Feptr = Lstart;
5069 Lmax--;
5070 for (i = Lmin; i < Lmax; i++)
5071 {
5072 PCRE2_SIZE slength;
5073 (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
5074 Feptr += slength;
5075 }
5076 }
5077 }
5078
5079 RRETURN(MATCH_NOMATCH);
5080 }
5081 /* Control never gets here */
5082
5083#undef Lcaseless
5084#undef Lmin
5085#undef Lmax
5086#undef Lstart
5087#undef Loffset
5088
5089
5090
5091/* ========================================================================= */
5092/* Opcodes for the start of various parenthesized items */
5093/* ========================================================================= */
5094
5095 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
5096 (*THEN) is within the current branch by comparing the address of OP_THEN
5097 that is passed back with the end of the branch. If (*THEN) is within the
5098 current branch, and the branch is one of two or more alternatives (it
5099 either starts or ends with OP_ALT), we have reached the limit of THEN's
5100 action, so convert the return code to NOMATCH, which will cause normal
5101 backtracking to happen from now on. Otherwise, THEN is passed back to an
5102 outer alternative. This implements Perl's treatment of parenthesized
5103 groups, where a group not containing | does not affect the current
5104 alternative, that is, (X) is NOT the same as (X|(*F)). */
5105
5106
5107 /* ===================================================================== */
5108 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
5109 bracket group, indicating that it may occur zero times. It may repeat
5110 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
5111 the pattern. Brackets with fixed upper repeat limits are compiled as a
5112 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
5113 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
5114
5115#define Lnext_ecode F->temp_sptr[0]
5116
5117 case OP_BRAZERO:
5118 Lnext_ecode = Fecode + 1;
5119 RMATCH(Lnext_ecode, RM9);
5120 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5121 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5122 Fecode = Lnext_ecode + 1 + LINK_SIZE;
5123 break;
5124
5125 case OP_BRAMINZERO:
5126 Lnext_ecode = Fecode + 1;
5127 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5128 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
5129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5130 Fecode++;
5131 break;
5132
5133#undef Lnext_ecode
5134
5135 case OP_SKIPZERO:
5136 Fecode++;
5137 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5138 Fecode += 1 + LINK_SIZE;
5139 break;
5140
5141
5142 /* ===================================================================== */
5143 /* Handle possessive brackets with an unlimited repeat. The end of these
5144 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
5145 going further in the pattern. */
5146
5147#define Lframe_type F->temp_32[0]
5148#define Lmatched_once F->temp_32[1]
5149#define Lzero_allowed F->temp_32[2]
5150#define Lstart_eptr F->temp_sptr[0]
5151#define Lstart_group F->temp_sptr[1]
5152
5153 case OP_BRAPOSZERO:
5154 Lzero_allowed = TRUE; /* Zero repeat is allowed */
5155 Fecode += 1;
5156 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
5157 goto POSSESSIVE_CAPTURE;
5158 goto POSSESSIVE_NON_CAPTURE;
5159
5160 case OP_BRAPOS:
5161 case OP_SBRAPOS:
5162 Lzero_allowed = FALSE; /* Zero repeat not allowed */
5163
5164 POSSESSIVE_NON_CAPTURE:
5165 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
5166 goto POSSESSIVE_GROUP;
5167
5168 case OP_CBRAPOS:
5169 case OP_SCBRAPOS:
5170 Lzero_allowed = FALSE; /* Zero repeat not allowed */
5171
5172 POSSESSIVE_CAPTURE:
5173 number = GET2(Fecode, 1+LINK_SIZE);
5174 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
5175
5176 POSSESSIVE_GROUP:
5177 Lmatched_once = FALSE; /* Never matched */
5178 Lstart_group = Fecode; /* Start of this group */
5179
5180 for (;;)
5181 {
5182 Lstart_eptr = Feptr; /* Position at group start */
5183 group_frame_type = Lframe_type;
5184 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
5185 if (rrc == MATCH_KETRPOS)
5186 {
5187 Lmatched_once = TRUE; /* Matched at least once */
5188 if (Feptr == Lstart_eptr) /* Empty match; skip to end */
5189 {
5190 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5191 break;
5192 }
5193
5194 Fecode = Lstart_group;
5195 continue;
5196 }
5197
5198 /* See comment above about handling THEN. */
5199
5200 if (rrc == MATCH_THEN)
5201 {
5202 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5203 if (mb->verb_ecode_ptr < next_ecode &&
5204 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5205 rrc = MATCH_NOMATCH;
5206 }
5207
5208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5209 Fecode += GET(Fecode, 1);
5210 if (*Fecode != OP_ALT) break;
5211 }
5212
5213 /* Success if matched something or zero repeat allowed */
5214
5215 if (Lmatched_once || Lzero_allowed)
5216 {
5217 Fecode += 1 + LINK_SIZE;
5218 break;
5219 }
5220
5221 RRETURN(MATCH_NOMATCH);
5222
5223#undef Lmatched_once
5224#undef Lzero_allowed
5225#undef Lframe_type
5226#undef Lstart_eptr
5227#undef Lstart_group
5228
5229
5230 /* ===================================================================== */
5231 /* Handle non-capturing brackets that cannot match an empty string. When we
5232 get to the final alternative within the brackets, as long as there are no
5233 THEN's in the pattern, we can optimize by not recording a new backtracking
5234 point. (Ideally we should test for a THEN within this group, but we don't
5235 have that information.) Don't do this if we are at the very top level,
5236 however, because that would make handling assertions and once-only brackets
5237 messier when there is nothing to go back to. */
5238
5239#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
5240#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
5241
5242 case OP_BRA:
5243 if (mb->hasthen || Frdepth == 0)
5244 {
5245 Lframe_type = 0;
5246 goto GROUPLOOP;
5247 }
5248
5249 for (;;)
5250 {
5251 Lnext_branch = Fecode + GET(Fecode, 1);
5252 if (*Lnext_branch != OP_ALT) break;
5253
5254 /* This is never the final branch. We do not need to test for MATCH_THEN
5255 here because this code is not used when there is a THEN in the pattern. */
5256
5257 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
5258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5259 Fecode = Lnext_branch;
5260 }
5261
5262 /* Hit the start of the final branch. Continue at this level. */
5263
5264 Fecode += PRIV(OP_lengths)[*Fecode];
5265 break;
5266
5267#undef Lnext_branch
5268
5269
5270 /* ===================================================================== */
5271 /* Handle a capturing bracket, other than those that are possessive with an
5272 unlimited repeat. */
5273
5274 case OP_CBRA:
5275 case OP_SCBRA:
5276 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5277 goto GROUPLOOP;
5278
5279
5280 /* ===================================================================== */
5281 /* Atomic groups and non-capturing brackets that can match an empty string
5282 must record a backtracking point and also set up a chained frame. */
5283
5284 case OP_ONCE:
5285 case OP_SCRIPT_RUN:
5286 case OP_SBRA:
5287 Lframe_type = GF_NOCAPTURE | Fop;
5288
5289 GROUPLOOP:
5290 for (;;)
5291 {
5292 group_frame_type = Lframe_type;
5293 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5294 if (rrc == MATCH_THEN)
5295 {
5296 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5297 if (mb->verb_ecode_ptr < next_ecode &&
5298 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5299 rrc = MATCH_NOMATCH;
5300 }
5301 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5302 Fecode += GET(Fecode, 1);
5303 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5304 }
5305 /* Control never reaches here. */
5306
5307#undef Lframe_type
5308
5309
5310 /* ===================================================================== */
5311 /* Recursion either matches the current regex, or some subexpression. The
5312 offset data is the offset to the starting bracket from the start of the
5313 whole pattern. (This is so that it works from duplicated subpatterns.) */
5314
5315#define Lframe_type F->temp_32[0]
5316#define Lstart_branch F->temp_sptr[0]
5317
5318 case OP_RECURSE:
5319 bracode = mb->start_code + GET(Fecode, 1);
5320 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5321
5322 /* If we are already in a recursion, check for repeating the same one
5323 without advancing the subject pointer. This should catch convoluted mutual
5324 recursions. (Some simple cases are caught at compile time.) */
5325
5326 if (Fcurrent_recurse != RECURSE_UNSET)
5327 {
5328 offset = Flast_group_offset;
5329 while (offset != PCRE2_UNSET)
5330 {
5331 N = (heapframe *)((char *)mb->match_frames + offset);
5332 P = (heapframe *)((char *)N - frame_size);
5333 if (N->group_frame_type == (GF_RECURSE | number))
5334 {
5335 if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
5336 break;
5337 }
5338 offset = P->last_group_offset;
5339 }
5340 }
5341
5342 /* Now run the recursion, branch by branch. */
5343
5344 Lstart_branch = bracode;
5345 Lframe_type = GF_RECURSE | number;
5346
5347 for (;;)
5348 {
5349 PCRE2_SPTR next_ecode;
5350
5351 group_frame_type = Lframe_type;
5352 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5353 next_ecode = Lstart_branch + GET(Lstart_branch,1);
5354
5355 /* Handle backtracking verbs, which are defined in a range that can
5356 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5357 escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5358
5359 When one of these verbs triggers, the current recursion group number is
5360 recorded. If it matches the recursion we are processing, the verb
5361 happened within the recursion and we must deal with it. Otherwise it must
5362 have happened after the recursion completed, and so has to be passed
5363 back. See comment above about handling THEN. */
5364
5365 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5366 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5367 {
5368 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5369 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5370 rrc = MATCH_NOMATCH;
5371 else RRETURN(MATCH_NOMATCH);
5372 }
5373
5374 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5375 OP_ACCEPT code. Nothing needs to be done here. */
5376
5377 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5378 Lstart_branch = next_ecode;
5379 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5380 }
5381 /* Control never reaches here. */
5382
5383#undef Lframe_type
5384#undef Lstart_branch
5385
5386
5387 /* ===================================================================== */
5388 /* Positive assertions are like other groups except that PCRE doesn't allow
5389 the effect of (*THEN) to escape beyond an assertion; it is therefore
5390 treated as NOMATCH. (*ACCEPT) is treated as a successful assertion, with its
5391 captures and mark retained. Any other return is an error. */
5392
5393#define Lframe_type F->temp_32[0]
5394
5395 case OP_ASSERT:
5396 case OP_ASSERTBACK:
5397 case OP_ASSERT_NA:
5398 case OP_ASSERTBACK_NA:
5399 Lframe_type = GF_NOCAPTURE | Fop;
5400 for (;;)
5401 {
5402 group_frame_type = Lframe_type;
5403 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
5404 if (rrc == MATCH_ACCEPT)
5405 {
5406 memcpy(Fovector,
5407 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5408 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5409 Foffset_top = assert_accept_frame->offset_top;
5410 Fmark = assert_accept_frame->mark;
5411 break;
5412 }
5413 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5414 Fecode += GET(Fecode, 1);
5415 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5416 }
5417
5418 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5419 Fecode += 1 + LINK_SIZE;
5420 break;
5421
5422#undef Lframe_type
5423
5424
5425 /* ===================================================================== */
5426 /* Handle negative assertions. Loop for each non-matching branch as for
5427 positive assertions. */
5428
5429#define Lframe_type F->temp_32[0]
5430
5431 case OP_ASSERT_NOT:
5432 case OP_ASSERTBACK_NOT:
5433 Lframe_type = GF_NOCAPTURE | Fop;
5434
5435 for (;;)
5436 {
5437 group_frame_type = Lframe_type;
5438 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5439 switch(rrc)
5440 {
5441 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5442 case MATCH_MATCH:
5443 RRETURN (MATCH_NOMATCH);
5444
5445 case MATCH_NOMATCH: /* Branch failed, try next if present. */
5446 case MATCH_THEN:
5447 Fecode += GET(Fecode, 1);
5448 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5449 break;
5450
5451 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5452 case MATCH_SKIP:
5453 case MATCH_PRUNE:
5454 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5455 goto ASSERT_NOT_FAILED;
5456
5457 default: /* Pass back any other return */
5458 RRETURN(rrc);
5459 }
5460 }
5461
5462 /* None of the branches have matched or there was a backtrack to (*COMMIT),
5463 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5464 negative assertion, so carry on. */
5465
5466 ASSERT_NOT_FAILED:
5467 Fecode += 1 + LINK_SIZE;
5468 break;
5469
5470#undef Lframe_type
5471
5472
5473 /* ===================================================================== */
5474 /* The callout item calls an external function, if one is provided, passing
5475 details of the match so far. This is mainly for debugging, though the
5476 function is able to force a failure. */
5477
5478 case OP_CALLOUT:
5479 case OP_CALLOUT_STR:
5480 rrc = do_callout(F, mb, &length);
5481 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5482 if (rrc < 0) RRETURN(rrc);
5483 Fecode += length;
5484 break;
5485
5486
5487 /* ===================================================================== */
5488 /* Conditional group: compilation checked that there are no more than two
5489 branches. If the condition is false, skipping the first branch takes us
5490 past the end of the item if there is only one branch, but that's exactly
5491 what we want. */
5492
5493 case OP_COND:
5494 case OP_SCOND:
5495
5496 /* The variable Flength will be added to Fecode when the condition is
5497 false, to get to the second branch. Setting it to the offset to the ALT or
5498 KET, then incrementing Fecode achieves this effect. However, if the second
5499 branch is non-existent, we must point to the KET so that the end of the
5500 group is correctly processed. We now have Fecode pointing to the condition
5501 or callout. */
5502
5503 Flength = GET(Fecode, 1); /* Offset to the second branch */
5504 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5505 Fecode += 1 + LINK_SIZE; /* From this opcode */
5506
5507 /* Because of the way auto-callout works during compile, a callout item is
5508 inserted between OP_COND and an assertion condition. Such a callout can
5509 also be inserted manually. */
5510
5511 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5512 {
5513 rrc = do_callout(F, mb, &length);
5514 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5515 if (rrc < 0) RRETURN(rrc);
5516
5517 /* Advance Fecode past the callout, so it now points to the condition. We
5518 must adjust Flength so that the value of Fecode+Flength is unchanged. */
5519
5520 Fecode += length;
5521 Flength -= length;
5522 }
5523
5524 /* Test the various possible conditions */
5525
5526 condition = FALSE;
5527 switch(*Fecode)
5528 {
5529 case OP_RREF: /* Group recursion test */
5530 if (Fcurrent_recurse != RECURSE_UNSET)
5531 {
5532 number = GET2(Fecode, 1);
5533 condition = (number == RREF_ANY || number == Fcurrent_recurse);
5534 }
5535 break;
5536
5537 case OP_DNRREF: /* Duplicate named group recursion test */
5538 if (Fcurrent_recurse != RECURSE_UNSET)
5539 {
5540 int count = GET2(Fecode, 1 + IMM2_SIZE);
5541 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5542 while (count-- > 0)
5543 {
5544 number = GET2(slot, 0);
5545 condition = number == Fcurrent_recurse;
5546 if (condition) break;
5547 slot += mb->name_entry_size;
5548 }
5549 }
5550 break;
5551
5552 case OP_CREF: /* Numbered group used test */
5553 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5554 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5555 break;
5556
5557 case OP_DNCREF: /* Duplicate named group used test */
5558 {
5559 int count = GET2(Fecode, 1 + IMM2_SIZE);
5560 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5561 while (count-- > 0)
5562 {
5563 offset = (GET2(slot, 0) << 1) - 2;
5564 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5565 if (condition) break;
5566 slot += mb->name_entry_size;
5567 }
5568 }
5569 break;
5570
5571 case OP_FALSE:
5572 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5573 break;
5574
5575 case OP_TRUE:
5576 condition = TRUE;
5577 break;
5578
5579 /* The condition is an assertion. Run code similar to the assertion code
5580 above. */
5581
5582#define Lpositive F->temp_32[0]
5583#define Lstart_branch F->temp_sptr[0]
5584
5585 default:
5586 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5587 Lstart_branch = Fecode;
5588
5589 for (;;)
5590 {
5591 group_frame_type = GF_CONDASSERT | *Fecode;
5592 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5593
5594 switch(rrc)
5595 {
5596 case MATCH_ACCEPT: /* Save captures */
5597 memcpy(Fovector,
5598 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5599 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5600 Foffset_top = assert_accept_frame->offset_top;
5601
5602 /* Fall through */
5603 /* In the case of a match, the captures have already been put into
5604 the current frame. */
5605
5606 case MATCH_MATCH:
5607 condition = Lpositive; /* TRUE for positive assertion */
5608 break;
5609
5610 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5611 assertion; it is therefore always treated as NOMATCH. */
5612
5613 case MATCH_NOMATCH:
5614 case MATCH_THEN:
5615 Lstart_branch += GET(Lstart_branch, 1);
5616 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
5617 condition = !Lpositive; /* TRUE for negative assertion */
5618 break;
5619
5620 /* These force no match without checking other branches. */
5621
5622 case MATCH_COMMIT:
5623 case MATCH_SKIP:
5624 case MATCH_PRUNE:
5625 condition = !Lpositive;
5626 break;
5627
5628 default:
5629 RRETURN(rrc);
5630 }
5631 break; /* Out of the branch loop */
5632 }
5633
5634 /* If the condition is true, find the end of the assertion so that
5635 advancing past it gets us to the start of the first branch. */
5636
5637 if (condition)
5638 {
5639 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5640 }
5641 break; /* End of assertion condition */
5642 }
5643
5644#undef Lpositive
5645#undef Lstart_branch
5646
5647 /* Choose branch according to the condition. */
5648
5649 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
5650
5651 /* If the opcode is OP_SCOND it means we are at a repeated conditional
5652 group that might match an empty string. We must therefore descend a level
5653 so that the start is remembered for checking. For OP_COND we can just
5654 continue at this level. */
5655
5656 if (Fop == OP_SCOND)
5657 {
5658 group_frame_type = GF_NOCAPTURE | Fop;
5659 RMATCH(Fecode, RM35);
5660 RRETURN(rrc);
5661 }
5662 break;
5663
5664
5665
5666/* ========================================================================= */
5667/* End of start of parenthesis opcodes */
5668/* ========================================================================= */
5669
5670
5671 /* ===================================================================== */
5672 /* Move the subject pointer back. This occurs only at the start of each
5673 branch of a lookbehind assertion. If we are too close to the start to move
5674 back, fail. When working with UTF-8 we move back a number of characters,
5675 not bytes. */
5676
5677 case OP_REVERSE:
5678 number = GET(Fecode, 1);
5679#ifdef SUPPORT_UNICODE
5680 if (utf)
5681 {
5682 while (number-- > 0)
5683 {
5684 if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
5685 Feptr--;
5686 BACKCHAR(Feptr);
5687 }
5688 }
5689 else
5690#endif
5691
5692 /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */
5693
5694 {
5695 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
5696 Feptr -= number;
5697 }
5698
5699 /* Save the earliest consulted character, then skip to next opcode */
5700
5701 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
5702 Fecode += 1 + LINK_SIZE;
5703 break;
5704
5705
5706 /* ===================================================================== */
5707 /* An alternation is the end of a branch; scan along to find the end of the
5708 bracketed group. */
5709
5710 case OP_ALT:
5711 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5712 break;
5713
5714
5715 /* ===================================================================== */
5716 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
5717 starting frame was added to the chained frames in order to remember the
5718 starting subject position for the group. */
5719
5720 case OP_KET:
5721 case OP_KETRMIN:
5722 case OP_KETRMAX:
5723 case OP_KETRPOS:
5724
5725 bracode = Fecode - GET(Fecode, 1);
5726
5727 /* Point N to the frame at the start of the most recent group.
5728 Remember the subject pointer at the start of the group. */
5729
5730 if (*bracode != OP_BRA && *bracode != OP_COND)
5731 {
5732 N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
5733 P = (heapframe *)((char *)N - frame_size);
5734 Flast_group_offset = P->last_group_offset;
5735
5736#ifdef DEBUG_SHOW_RMATCH
5737 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
5738 N->rdepth, N->group_frame_type,
5739 (char *)P->eptr - (char *)mb->start_subject);
5740#endif
5741
5742 /* If we are at the end of an assertion that is a condition, return a
5743 match, discarding any intermediate backtracking points. Copy back the
5744 mark setting and the captures into the frame before N so that they are
5745 set on return. Doing this for all assertions, both positive and negative,
5746 seems to match what Perl does. */
5747
5748 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
5749 {
5750 memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
5751 Foffset_top * sizeof(PCRE2_SIZE));
5752 P->offset_top = Foffset_top;
5753 P->mark = Fmark;
5754 Fback_frame = (char *)F - (char *)P;
5755 RRETURN(MATCH_MATCH);
5756 }
5757 }
5758 else P = NULL; /* Indicates starting frame not recorded */
5759
5760 /* The group was not a conditional assertion. */
5761
5762 switch (*bracode)
5763 {
5764 case OP_BRA: /* No need to do anything for these */
5765 case OP_COND:
5766 case OP_SCOND:
5767 break;
5768
5769 /* Non-atomic positive assertions are like OP_BRA, except that the
5770 subject pointer must be put back to where it was at the start of the
5771 assertion. */
5772
5773 case OP_ASSERT_NA:
5774 case OP_ASSERTBACK_NA:
5775 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5776 Feptr = P->eptr;
5777 break;
5778
5779 /* Atomic positive assertions are like OP_ONCE, except that in addition
5780 the subject pointer must be put back to where it was at the start of the
5781 assertion. */
5782
5783 case OP_ASSERT:
5784 case OP_ASSERTBACK:
5785 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5786 Feptr = P->eptr;
5787 /* Fall through */
5788
5789 /* For an atomic group, discard internal backtracking points. We must
5790 also ensure that any remaining branches within the top-level of the group
5791 are not tried. Do this by adjusting the code pointer within the backtrack
5792 frame so that it points to the final branch. */
5793
5794 case OP_ONCE:
5795 Fback_frame = ((char *)F - (char *)P);
5796 for (;;)
5797 {
5798 uint32_t y = GET(P->ecode,1);
5799 if ((P->ecode)[y] != OP_ALT) break;
5800 P->ecode += y;
5801 }
5802 break;
5803
5804 /* A matching negative assertion returns MATCH, which is turned into
5805 NOMATCH at the assertion level. */
5806
5807 case OP_ASSERT_NOT:
5808 case OP_ASSERTBACK_NOT:
5809 RRETURN(MATCH_MATCH);
5810
5811 /* At the end of a script run, apply the script-checking rules. This code
5812 will never be exercised if Unicode support is not compiled, because in
5813 that environment script runs cause an error at compile time. */
5814
5815 case OP_SCRIPT_RUN:
5816 if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
5817 break;
5818
5819 /* Whole-pattern recursion is coded as a recurse into group 0, so it
5820 won't be picked up here. Instead, we catch it when the OP_END is reached.
5821 Other recursion is handled here. */
5822
5823 case OP_CBRA:
5824 case OP_CBRAPOS:
5825 case OP_SCBRA:
5826 case OP_SCBRAPOS:
5827 number = GET2(bracode, 1+LINK_SIZE);
5828
5829 /* Handle a recursively called group. We reinstate the previous set of
5830 captures and then carry on after the recursion call. */
5831
5832 if (Fcurrent_recurse == number)
5833 {
5834 P = (heapframe *)((char *)N - frame_size);
5835 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5836 P->offset_top * sizeof(PCRE2_SIZE));
5837 Foffset_top = P->offset_top;
5838 Fcapture_last = P->capture_last;
5839 Fcurrent_recurse = P->current_recurse;
5840 Fecode = P->ecode + 1 + LINK_SIZE;
5841 continue; /* With next opcode */
5842 }
5843
5844 /* Deal with actual capturing. */
5845
5846 offset = (number << 1) - 2;
5847 Fcapture_last = number;
5848 Fovector[offset] = P->eptr - mb->start_subject;
5849 Fovector[offset+1] = Feptr - mb->start_subject;
5850 if (offset >= Foffset_top) Foffset_top = offset + 2;
5851 break;
5852 } /* End actions relating to the starting opcode */
5853
5854 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
5855 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
5856 at a time from the outer level. This must precede the empty string test -
5857 in this case that test is done at the outer level. */
5858
5859 if (*Fecode == OP_KETRPOS)
5860 {
5861 memcpy((char *)P + offsetof(heapframe, eptr),
5862 (char *)F + offsetof(heapframe, eptr),
5863 frame_copy_size);
5864 RRETURN(MATCH_KETRPOS);
5865 }
5866
5867 /* Handle the different kinds of closing brackets. A non-repeating ket
5868 needs no special action, just continuing at this level. This also happens
5869 for the repeating kets if the group matched no characters, in order to
5870 forcibly break infinite loops. Otherwise, the repeating kets try the rest
5871 of the pattern or restart from the preceding bracket, in the appropriate
5872 order. */
5873
5874 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
5875 {
5876 if (Fop == OP_KETRMIN)
5877 {
5878 RMATCH(Fecode + 1 + LINK_SIZE, RM6);
5879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5880 Fecode -= GET(Fecode, 1);
5881 break; /* End of ket processing */
5882 }
5883
5884 /* Repeat the maximum number of times (KETRMAX) */
5885
5886 RMATCH(bracode, RM7);
5887 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5888 }
5889
5890 /* Carry on at this level for a non-repeating ket, or after matching an
5891 empty string, or after repeating for a maximum number of times. */
5892
5893 Fecode += 1 + LINK_SIZE;
5894 break;
5895
5896
5897 /* ===================================================================== */
5898 /* Start and end of line assertions, not multiline mode. */
5899
5900 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
5901 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
5902 RRETURN(MATCH_NOMATCH);
5903 Fecode++;
5904 break;
5905
5906 case OP_SOD: /* Unconditional start of subject */
5907 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
5908 Fecode++;
5909 break;
5910
5911 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
5912 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
5913
5914 case OP_DOLL:
5915 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5916 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
5917
5918 /* Fall through */
5919 /* Unconditional end of subject assertion (\z) */
5920
5921 case OP_EOD:
5922 if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
5923 if (mb->partial != 0)
5924 {
5925 mb->hitend = TRUE;
5926 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5927 }
5928 Fecode++;
5929 break;
5930
5931 /* End of subject or ending \n assertion (\Z) */
5932
5933 case OP_EODN:
5934 ASSERT_NL_OR_EOS:
5935 if (Feptr < mb->end_subject &&
5936 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
5937 {
5938 if (mb->partial != 0 &&
5939 Feptr + 1 >= mb->end_subject &&
5940 NLBLOCK->nltype == NLTYPE_FIXED &&
5941 NLBLOCK->nllen == 2 &&
5942 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5943 {
5944 mb->hitend = TRUE;
5945 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5946 }
5947 RRETURN(MATCH_NOMATCH);
5948 }
5949
5950 /* Either at end of string or \n before end. */
5951
5952 if (mb->partial != 0)
5953 {
5954 mb->hitend = TRUE;
5955 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5956 }
5957 Fecode++;
5958 break;
5959
5960
5961 /* ===================================================================== */
5962 /* Start and end of line assertions, multiline mode. */
5963
5964 /* Start of subject unless notbol, or after any newline except for one at
5965 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
5966
5967 case OP_CIRCM:
5968 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
5969 RRETURN(MATCH_NOMATCH);
5970 if (Feptr != mb->start_subject &&
5971 ((Feptr == mb->end_subject &&
5972 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
5973 !WAS_NEWLINE(Feptr)))
5974 RRETURN(MATCH_NOMATCH);
5975 Fecode++;
5976 break;
5977
5978 /* Assert before any newline, or before end of subject unless noteol is
5979 set. */
5980
5981 case OP_DOLLM:
5982 if (Feptr < mb->end_subject)
5983 {
5984 if (!IS_NEWLINE(Feptr))
5985 {
5986 if (mb->partial != 0 &&
5987 Feptr + 1 >= mb->end_subject &&
5988 NLBLOCK->nltype == NLTYPE_FIXED &&
5989 NLBLOCK->nllen == 2 &&
5990 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5991 {
5992 mb->hitend = TRUE;
5993 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5994 }
5995 RRETURN(MATCH_NOMATCH);
5996 }
5997 }
5998 else
5999 {
6000 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6001 SCHECK_PARTIAL();
6002 }
6003 Fecode++;
6004 break;
6005
6006
6007 /* ===================================================================== */
6008 /* Start of match assertion */
6009
6010 case OP_SOM:
6011 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
6012 Fecode++;
6013 break;
6014
6015
6016 /* ===================================================================== */
6017 /* Reset the start of match point */
6018
6019 case OP_SET_SOM:
6020 Fstart_match = Feptr;
6021 Fecode++;
6022 break;
6023
6024
6025 /* ===================================================================== */
6026 /* Word boundary assertions. Find out if the previous and current
6027 characters are "word" characters. It takes a bit more work in UTF mode.
6028 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
6029 not set. When it is set, use Unicode properties if available, even when not
6030 in UTF mode. Remember the earliest and latest consulted characters. */
6031
6032 case OP_NOT_WORD_BOUNDARY:
6033 case OP_WORD_BOUNDARY:
6034 if (Feptr == mb->check_subject) prev_is_word = FALSE; else
6035 {
6036 PCRE2_SPTR lastptr = Feptr - 1;
6037#ifdef SUPPORT_UNICODE
6038 if (utf)
6039 {
6040 BACKCHAR(lastptr);
6041 GETCHAR(fc, lastptr);
6042 }
6043 else
6044#endif /* SUPPORT_UNICODE */
6045 fc = *lastptr;
6046 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
6047#ifdef SUPPORT_UNICODE
6048 if ((mb->poptions & PCRE2_UCP) != 0)
6049 {
6050 if (fc == '_') prev_is_word = TRUE; else
6051 {
6052 int cat = UCD_CATEGORY(fc);
6053 prev_is_word = (cat == ucp_L || cat == ucp_N);
6054 }
6055 }
6056 else
6057#endif /* SUPPORT_UNICODE */
6058 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6059 }
6060
6061 /* Get status of next character */
6062
6063 if (Feptr >= mb->end_subject)
6064 {
6065 SCHECK_PARTIAL();
6066 cur_is_word = FALSE;
6067 }
6068 else
6069 {
6070 PCRE2_SPTR nextptr = Feptr + 1;
6071#ifdef SUPPORT_UNICODE
6072 if (utf)
6073 {
6074 FORWARDCHARTEST(nextptr, mb->end_subject);
6075 GETCHAR(fc, Feptr);
6076 }
6077 else
6078#endif /* SUPPORT_UNICODE */
6079 fc = *Feptr;
6080 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
6081#ifdef SUPPORT_UNICODE
6082 if ((mb->poptions & PCRE2_UCP) != 0)
6083 {
6084 if (fc == '_') cur_is_word = TRUE; else
6085 {
6086 int cat = UCD_CATEGORY(fc);
6087 cur_is_word = (cat == ucp_L || cat == ucp_N);
6088 }
6089 }
6090 else
6091#endif /* SUPPORT_UNICODE */
6092 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6093 }
6094
6095 /* Now see if the situation is what we want */
6096
6097 if ((*Fecode++ == OP_WORD_BOUNDARY)?
6098 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6099 RRETURN(MATCH_NOMATCH);
6100 break;
6101
6102
6103 /* ===================================================================== */
6104 /* Backtracking (*VERB)s, with and without arguments. Note that if the
6105 pattern is successfully matched, we do not come back from RMATCH. */
6106
6107 case OP_MARK:
6108 Fmark = mb->nomatch_mark = Fecode + 2;
6109 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
6110
6111 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
6112 argument, and we must check whether that argument matches this MARK's
6113 argument. It is passed back in mb->verb_skip_ptr. If it does match, we
6114 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
6115 position that corresponds to this mark. Otherwise, pass back the return
6116 code unaltered. */
6117
6118 if (rrc == MATCH_SKIP_ARG &&
6119 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
6120 {
6121 mb->verb_skip_ptr = Feptr; /* Pass back current position */
6122 RRETURN(MATCH_SKIP);
6123 }
6124 RRETURN(rrc);
6125
6126 case OP_FAIL:
6127 RRETURN(MATCH_NOMATCH);
6128
6129 /* Record the current recursing group number in mb->verb_current_recurse
6130 when a backtracking return such as MATCH_COMMIT is given. This enables the
6131 recurse processing to catch verbs from within the recursion. */
6132
6133 case OP_COMMIT:
6134 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
6135 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6136 mb->verb_current_recurse = Fcurrent_recurse;
6137 RRETURN(MATCH_COMMIT);
6138
6139 case OP_COMMIT_ARG:
6140 Fmark = mb->nomatch_mark = Fecode + 2;
6141 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
6142 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6143 mb->verb_current_recurse = Fcurrent_recurse;
6144 RRETURN(MATCH_COMMIT);
6145
6146 case OP_PRUNE:
6147 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
6148 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6149 mb->verb_current_recurse = Fcurrent_recurse;
6150 RRETURN(MATCH_PRUNE);
6151
6152 case OP_PRUNE_ARG:
6153 Fmark = mb->nomatch_mark = Fecode + 2;
6154 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
6155 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6156 mb->verb_current_recurse = Fcurrent_recurse;
6157 RRETURN(MATCH_PRUNE);
6158
6159 case OP_SKIP:
6160 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
6161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6162 mb->verb_skip_ptr = Feptr; /* Pass back current position */
6163 mb->verb_current_recurse = Fcurrent_recurse;
6164 RRETURN(MATCH_SKIP);
6165
6166 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
6167 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
6168 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
6169 that failed and any that precede it (either they also failed, or were not
6170 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
6171 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
6172 set to the count of the one that failed. */
6173
6174 case OP_SKIP_ARG:
6175 mb->skip_arg_count++;
6176 if (mb->skip_arg_count <= mb->ignore_skip_arg)
6177 {
6178 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
6179 break;
6180 }
6181 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
6182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6183
6184 /* Pass back the current skip name and return the special MATCH_SKIP_ARG
6185 return code. This will either be caught by a matching MARK, or get to the
6186 top, where it causes a rematch with mb->ignore_skip_arg set to the value of
6187 mb->skip_arg_count. */
6188
6189 mb->verb_skip_ptr = Fecode + 2;
6190 mb->verb_current_recurse = Fcurrent_recurse;
6191 RRETURN(MATCH_SKIP_ARG);
6192
6193 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
6194 the branch in which it occurs can be determined. */
6195
6196 case OP_THEN:
6197 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
6198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6199 mb->verb_ecode_ptr = Fecode;
6200 mb->verb_current_recurse = Fcurrent_recurse;
6201 RRETURN(MATCH_THEN);
6202
6203 case OP_THEN_ARG:
6204 Fmark = mb->nomatch_mark = Fecode + 2;
6205 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
6206 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6207 mb->verb_ecode_ptr = Fecode;
6208 mb->verb_current_recurse = Fcurrent_recurse;
6209 RRETURN(MATCH_THEN);
6210
6211
6212 /* ===================================================================== */
6213 /* There's been some horrible disaster. Arrival here can only mean there is
6214 something seriously wrong in the code above or the OP_xxx definitions. */
6215
6216 default:
6217 return PCRE2_ERROR_INTERNAL;
6218 }
6219
6220 /* Do not insert any code in here without much thought; it is assumed
6221 that "continue" in the code above comes out to here to repeat the main
6222 loop. */
6223
6224 } /* End of main loop */
6225/* Control never reaches here */
6226
6227
6228/* ========================================================================= */
6229/* The RRETURN() macro jumps here. The number that is saved in Freturn_id
6230indicates which label we actually want to return to. The value in Frdepth is
6231the index number of the frame in the vector. The return value has been placed
6232in rrc. */
6233
6234#define LBL(val) case val: goto L_RM##val;
6235
6236RETURN_SWITCH:
6237if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6238if (Frdepth == 0) return rrc; /* Exit from the top level */
6239F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
6240mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
6241
6242#ifdef DEBUG_SHOW_RMATCH
6243fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
6244#endif
6245
6246switch (Freturn_id)
6247 {
6248 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6249 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
6250 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
6251 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
6252 LBL(33) LBL(34) LBL(35) LBL(36)
6253
6254#ifdef SUPPORT_WIDE_CHARS
6255 LBL(100) LBL(101)
6256#endif
6257
6258#ifdef SUPPORT_UNICODE
6259 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
6260 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
6261 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006262 LBL(221) LBL(222) LBL(223) LBL(224) LBL(225)
Elliott Hughes5b808042021-10-01 10:56:10 -07006263#endif
6264
6265 default:
6266 return PCRE2_ERROR_INTERNAL;
6267 }
6268#undef LBL
6269}
6270
6271
6272/*************************************************
6273* Match a Regular Expression *
6274*************************************************/
6275
6276/* This function applies a compiled pattern to a subject string and picks out
6277portions of the string if it matches. Two elements in the vector are set for
6278each substring: the offsets to the start and end of the substring.
6279
6280Arguments:
6281 code points to the compiled expression
6282 subject points to the subject string
6283 length length of subject string (may contain binary zeros)
6284 start_offset where to start in the subject string
6285 options option bits
6286 match_data points to a match_data block
6287 mcontext points to a PCRE2 match context
6288
6289Returns: > 0 => success; value is the number of ovector pairs filled
6290 = 0 => success, but ovector is not big enough
6291 = -1 => failed to match (PCRE2_ERROR_NOMATCH)
6292 = -2 => partial match (PCRE2_ERROR_PARTIAL)
6293 < -2 => some kind of unexpected problem
6294*/
6295
6296PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
6297pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6298 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6299 pcre2_match_context *mcontext)
6300{
6301int rc;
6302int was_zero_terminated = 0;
6303const uint8_t *start_bits = NULL;
6304const pcre2_real_code *re = (const pcre2_real_code *)code;
6305
6306BOOL anchored;
6307BOOL firstline;
6308BOOL has_first_cu = FALSE;
6309BOOL has_req_cu = FALSE;
6310BOOL startline;
6311
6312#if PCRE2_CODE_UNIT_WIDTH == 8
6313PCRE2_SPTR memchr_found_first_cu;
6314PCRE2_SPTR memchr_found_first_cu2;
6315#endif
6316
6317PCRE2_UCHAR first_cu = 0;
6318PCRE2_UCHAR first_cu2 = 0;
6319PCRE2_UCHAR req_cu = 0;
6320PCRE2_UCHAR req_cu2 = 0;
6321
6322PCRE2_SPTR bumpalong_limit;
6323PCRE2_SPTR end_subject;
6324PCRE2_SPTR true_end_subject;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006325PCRE2_SPTR start_match;
6326PCRE2_SPTR req_cu_ptr;
Elliott Hughes5b808042021-10-01 10:56:10 -07006327PCRE2_SPTR start_partial;
6328PCRE2_SPTR match_partial;
6329
6330#ifdef SUPPORT_JIT
6331BOOL use_jit;
6332#endif
6333
6334/* This flag is needed even when Unicode is not supported for convenience
6335(it is used by the IS_NEWLINE macro). */
6336
6337BOOL utf = FALSE;
6338
6339#ifdef SUPPORT_UNICODE
6340BOOL ucp = FALSE;
6341BOOL allow_invalid;
6342uint32_t fragment_options = 0;
6343#ifdef SUPPORT_JIT
6344BOOL jit_checked_utf = FALSE;
6345#endif
6346#endif /* SUPPORT_UNICODE */
6347
6348PCRE2_SIZE frame_size;
6349
6350/* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6351macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6352
6353pcre2_callout_block cb;
6354match_block actual_match_block;
6355match_block *mb = &actual_match_block;
6356
6357/* Allocate an initial vector of backtracking frames on the stack. If this
6358proves to be too small, it is replaced by a larger one on the heap. To get a
6359vector of the size required that is aligned for pointers, allocate it as a
6360vector of pointers. */
6361
6362PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
6363 PCRE2_KEEP_UNINITIALIZED;
6364mb->stack_frames = (heapframe *)stack_frames_vector;
6365
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006366/* Recognize NULL, length 0 as an empty string. */
Elliott Hughes5b808042021-10-01 10:56:10 -07006367
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006368if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
6369
6370/* Plausibility checks */
6371
6372if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6373if (code == NULL || subject == NULL || match_data == NULL)
6374 return PCRE2_ERROR_NULL;
6375
6376start_match = subject + start_offset;
6377req_cu_ptr = start_match - 1;
Elliott Hughes5b808042021-10-01 10:56:10 -07006378if (length == PCRE2_ZERO_TERMINATED)
6379 {
6380 length = PRIV(strlen)(subject);
6381 was_zero_terminated = 1;
6382 }
6383true_end_subject = end_subject = subject + length;
6384
Elliott Hughes5b808042021-10-01 10:56:10 -07006385if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6386
6387/* Check that the first field in the block is the magic number. */
6388
6389if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6390
6391/* Check the code unit width. */
6392
6393if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6394 return PCRE2_ERROR_BADMODE;
6395
6396/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6397options variable for this function. Users of PCRE2 who are not calling the
6398function directly would like to have a way of setting these flags, in the same
6399way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6400constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6401(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which we now
6402transfer to the options for this function. The bits are guaranteed to be
6403adjacent, but do not have the same values. This bit of Boolean trickery assumes
6404that the match-time bits are not more significant than the flag bits. If by
6405accident this is not the case, a compile-time division by zero error will
6406occur. */
6407
6408#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6409#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6410options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6411#undef FF
6412#undef OO
6413
6414/* If the pattern was successfully studied with JIT support, we will run the
6415JIT executable instead of the rest of this function. Most options must be set
6416at compile time for the JIT code to be usable. */
6417
6418#ifdef SUPPORT_JIT
6419use_jit = (re->executable_jit != NULL &&
6420 (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
6421#endif
6422
6423/* Initialize UTF/UCP parameters. */
6424
6425#ifdef SUPPORT_UNICODE
6426utf = (re->overall_options & PCRE2_UTF) != 0;
6427allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
6428ucp = (re->overall_options & PCRE2_UCP) != 0;
6429#endif /* SUPPORT_UNICODE */
6430
6431/* Convert the partial matching flags into an integer. */
6432
6433mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6434 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6435
6436/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6437time. */
6438
6439if (mb->partial != 0 &&
6440 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6441 return PCRE2_ERROR_BADOPTION;
6442
6443/* It is an error to set an offset limit without setting the flag at compile
6444time. */
6445
6446if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6447 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6448 return PCRE2_ERROR_BADOFFSETLIMIT;
6449
6450/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
6451free the memory that was obtained. Set the field to NULL for no match cases. */
6452
6453if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
6454 {
6455 match_data->memctl.free((void *)match_data->subject,
6456 match_data->memctl.memory_data);
6457 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
6458 }
6459match_data->subject = NULL;
6460
6461/* Zero the error offset in case the first code unit is invalid UTF. */
6462
6463match_data->startchar = 0;
6464
6465
6466/* ============================= JIT matching ============================== */
6467
6468/* Prepare for JIT matching. Check a UTF string for validity unless no check is
6469requested or invalid UTF can be handled. We check only the portion of the
6470 subject that might be inspected during matching - from the offset minus the
6471maximum lookbehind to the given length. This saves time when a small part of a
6472large subject is being matched by the use of a starting offset. Note that the
6473maximum lookbehind is a number of characters, not code units. */
6474
6475#ifdef SUPPORT_JIT
6476if (use_jit)
6477 {
6478#ifdef SUPPORT_UNICODE
6479 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
6480 {
6481#if PCRE2_CODE_UNIT_WIDTH != 32
6482 unsigned int i;
6483#endif
6484
6485 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6486 character start. */
6487
6488#if PCRE2_CODE_UNIT_WIDTH != 32
6489 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6490 {
6491 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6492#if PCRE2_CODE_UNIT_WIDTH == 8
6493 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6494#else
6495 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6496#endif
6497 }
6498#endif /* WIDTH != 32 */
6499
6500 /* Move back by the maximum lookbehind, just in case it happens at the very
6501 start of matching. */
6502
6503#if PCRE2_CODE_UNIT_WIDTH != 32
6504 for (i = re->max_lookbehind; i > 0 && start_match > subject; i--)
6505 {
6506 start_match--;
6507 while (start_match > subject &&
6508#if PCRE2_CODE_UNIT_WIDTH == 8
6509 (*start_match & 0xc0) == 0x80)
6510#else /* 16-bit */
6511 (*start_match & 0xfc00) == 0xdc00)
6512#endif
6513 start_match--;
6514 }
6515#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6516
6517 /* In the 32-bit library, one code unit equals one character. However,
6518 we cannot just subtract the lookbehind and then compare pointers, because
6519 a very large lookbehind could create an invalid pointer. */
6520
6521 if (start_offset >= re->max_lookbehind)
6522 start_match -= re->max_lookbehind;
6523 else
6524 start_match = subject;
6525#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6526
6527 /* Validate the relevant portion of the subject. Adjust the offset of an
6528 invalid code point to be an absolute offset in the whole string. */
6529
6530 match_data->rc = PRIV(valid_utf)(start_match,
6531 length - (start_match - subject), &(match_data->startchar));
6532 if (match_data->rc != 0)
6533 {
6534 match_data->startchar += start_match - subject;
6535 return match_data->rc;
6536 }
6537 jit_checked_utf = TRUE;
6538 }
6539#endif /* SUPPORT_UNICODE */
6540
6541 /* If JIT returns BADOPTION, which means that the selected complete or
6542 partial matching mode was not compiled, fall through to the interpreter. */
6543
6544 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6545 match_data, mcontext);
6546 if (rc != PCRE2_ERROR_JIT_BADOPTION)
6547 {
6548 if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
6549 {
6550 length = CU2BYTES(length + was_zero_terminated);
6551 match_data->subject = match_data->memctl.malloc(length,
6552 match_data->memctl.memory_data);
6553 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
6554 memcpy((void *)match_data->subject, subject, length);
6555 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
6556 }
6557 return rc;
6558 }
6559 }
6560#endif /* SUPPORT_JIT */
6561
6562/* ========================= End of JIT matching ========================== */
6563
6564
6565/* Proceed with non-JIT matching. The default is to allow lookbehinds to the
6566start of the subject. A UTF check when there is a non-zero offset may change
6567this. */
6568
6569mb->check_subject = subject;
6570
6571/* If a UTF subject string was not checked for validity in the JIT code above,
6572check it here, and handle support for invalid UTF strings. The check above
6573 happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
6574If we get here in those circumstances, it means the subject string is valid,
6575but for some reason JIT matching was not successful. There is no need to check
6576the subject again.
6577
6578 We check only the portion of the subject that might be inspected during
6579matching - from the offset minus the maximum lookbehind to the given length.
6580This saves time when a small part of a large subject is being matched by the
6581use of a starting offset. Note that the maximum lookbehind is a number of
6582characters, not code units.
6583
6584Note also that support for invalid UTF forces a check, overriding the setting
6585 of PCRE2_NO_UTF_CHECK. */
6586
6587#ifdef SUPPORT_UNICODE
6588if (utf &&
6589#ifdef SUPPORT_JIT
6590 !jit_checked_utf &&
6591#endif
6592 ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
6593 {
6594#if PCRE2_CODE_UNIT_WIDTH != 32
6595 BOOL skipped_bad_start = FALSE;
6596#endif
6597
6598 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6599 character start. If we are handling invalid UTF, just skip over such code
6600 units. Otherwise, give an appropriate error. */
6601
6602#if PCRE2_CODE_UNIT_WIDTH != 32
6603 if (allow_invalid)
6604 {
6605 while (start_match < end_subject && NOT_FIRSTCU(*start_match))
6606 {
6607 start_match++;
6608 skipped_bad_start = TRUE;
6609 }
6610 }
6611 else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6612 {
6613 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6614#if PCRE2_CODE_UNIT_WIDTH == 8
6615 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6616#else
6617 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6618#endif
6619 }
6620#endif /* WIDTH != 32 */
6621
6622 /* The mb->check_subject field points to the start of UTF checking;
6623 lookbehinds can go back no further than this. */
6624
6625 mb->check_subject = start_match;
6626
6627 /* Move back by the maximum lookbehind, just in case it happens at the very
6628 start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
6629 units above. */
6630
6631#if PCRE2_CODE_UNIT_WIDTH != 32
6632 if (!skipped_bad_start)
6633 {
6634 unsigned int i;
6635 for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
6636 {
6637 mb->check_subject--;
6638 while (mb->check_subject > subject &&
6639#if PCRE2_CODE_UNIT_WIDTH == 8
6640 (*mb->check_subject & 0xc0) == 0x80)
6641#else /* 16-bit */
6642 (*mb->check_subject & 0xfc00) == 0xdc00)
6643#endif
6644 mb->check_subject--;
6645 }
6646 }
6647#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6648
6649 /* In the 32-bit library, one code unit equals one character. However,
6650 we cannot just subtract the lookbehind and then compare pointers, because
6651 a very large lookbehind could create an invalid pointer. */
6652
6653 if (start_offset >= re->max_lookbehind)
6654 mb->check_subject -= re->max_lookbehind;
6655 else
6656 mb->check_subject = subject;
6657#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6658
6659 /* Validate the relevant portion of the subject. There's a loop in case we
6660 encounter bad UTF in the characters preceding start_match which we are
6661 scanning because of a lookbehind. */
6662
6663 for (;;)
6664 {
6665 match_data->rc = PRIV(valid_utf)(mb->check_subject,
6666 length - (mb->check_subject - subject), &(match_data->startchar));
6667
6668 if (match_data->rc == 0) break; /* Valid UTF string */
6669
6670 /* Invalid UTF string. Adjust the offset to be an absolute offset in the
6671 whole string. If we are handling invalid UTF strings, set end_subject to
6672 stop before the bad code unit, and set the options to "not end of line".
6673 Otherwise return the error. */
6674
6675 match_data->startchar += mb->check_subject - subject;
6676 if (!allow_invalid || match_data->rc > 0) return match_data->rc;
6677 end_subject = subject + match_data->startchar;
6678
6679 /* If the end precedes start_match, it means there is invalid UTF in the
6680 extra code units we reversed over because of a lookbehind. Advance past the
6681 first bad code unit, and then skip invalid character starting code units in
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006682 8-bit and 16-bit modes, and try again with the original end point. */
Elliott Hughes5b808042021-10-01 10:56:10 -07006683
6684 if (end_subject < start_match)
6685 {
6686 mb->check_subject = end_subject + 1;
6687#if PCRE2_CODE_UNIT_WIDTH != 32
6688 while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
6689 mb->check_subject++;
6690#endif
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006691 end_subject = true_end_subject;
Elliott Hughes5b808042021-10-01 10:56:10 -07006692 }
6693
6694 /* Otherwise, set the not end of line option, and do the match. */
6695
6696 else
6697 {
6698 fragment_options = PCRE2_NOTEOL;
6699 break;
6700 }
6701 }
6702 }
6703#endif /* SUPPORT_UNICODE */
6704
6705/* A NULL match context means "use a default context", but we take the memory
6706control functions from the pattern. */
6707
6708if (mcontext == NULL)
6709 {
6710 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6711 mb->memctl = re->memctl;
6712 }
6713else mb->memctl = mcontext->memctl;
6714
6715anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6716firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
6717startline = (re->flags & PCRE2_STARTLINE) != 0;
6718bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6719 true_end_subject : subject + mcontext->offset_limit;
6720
6721/* Initialize and set up the fixed fields in the callout block, with a pointer
6722in the match block. */
6723
6724mb->cb = &cb;
6725cb.version = 2;
6726cb.subject = subject;
6727cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
6728cb.callout_flags = 0;
6729
6730/* Fill in the remaining fields in the match block, except for moptions, which
6731gets set later. */
6732
6733mb->callout = mcontext->callout;
6734mb->callout_data = mcontext->callout_data;
6735
6736mb->start_subject = subject;
6737mb->start_offset = start_offset;
6738mb->end_subject = end_subject;
6739mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6740mb->allowemptypartial = (re->max_lookbehind > 0) ||
6741 (re->flags & PCRE2_MATCH_EMPTY) != 0;
6742mb->poptions = re->overall_options; /* Pattern options */
6743mb->ignore_skip_arg = 0;
6744mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6745
6746/* The name table is needed for finding all the numbers associated with a
6747given name, for condition testing. The code follows the name table. */
6748
6749mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6750mb->name_count = re->name_count;
6751mb->name_entry_size = re->name_entry_size;
6752mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6753
6754/* Process the \R and newline settings. */
6755
6756mb->bsr_convention = re->bsr_convention;
6757mb->nltype = NLTYPE_FIXED;
6758switch(re->newline_convention)
6759 {
6760 case PCRE2_NEWLINE_CR:
6761 mb->nllen = 1;
6762 mb->nl[0] = CHAR_CR;
6763 break;
6764
6765 case PCRE2_NEWLINE_LF:
6766 mb->nllen = 1;
6767 mb->nl[0] = CHAR_NL;
6768 break;
6769
6770 case PCRE2_NEWLINE_NUL:
6771 mb->nllen = 1;
6772 mb->nl[0] = CHAR_NUL;
6773 break;
6774
6775 case PCRE2_NEWLINE_CRLF:
6776 mb->nllen = 2;
6777 mb->nl[0] = CHAR_CR;
6778 mb->nl[1] = CHAR_NL;
6779 break;
6780
6781 case PCRE2_NEWLINE_ANY:
6782 mb->nltype = NLTYPE_ANY;
6783 break;
6784
6785 case PCRE2_NEWLINE_ANYCRLF:
6786 mb->nltype = NLTYPE_ANYCRLF;
6787 break;
6788
6789 default: return PCRE2_ERROR_INTERNAL;
6790 }
6791
6792/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
6793vector at the end, whose size depends on the number of capturing parentheses in
6794the pattern. It is not used at all if there are no capturing parentheses.
6795
6796 frame_size is the total size of each frame
6797 mb->frame_vector_size is the total usable size of the vector (rounded down
6798 to a whole number of frames)
6799
6800The last of these is changed within the match() function if the frame vector
6801has to be expanded. We therefore put it into the match block so that it is
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006802correct when calling match() more than once for non-anchored patterns.
Elliott Hughes5b808042021-10-01 10:56:10 -07006803
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006804We must also pad frame_size for alignment to ensure subsequent frames are as
6805aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
6806array, that does not guarantee it is suitably aligned for pointers, as some
6807architectures have pointers that are larger than a size_t. */
6808
6809frame_size = (offsetof(heapframe, ovector) +
6810 re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &
6811 ~(HEAPFRAME_ALIGNMENT - 1);
Elliott Hughes5b808042021-10-01 10:56:10 -07006812
6813/* Limits set in the pattern override the match context only if they are
6814smaller. */
6815
6816mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
6817 mcontext->heap_limit : re->limit_heap;
6818
6819mb->match_limit = (mcontext->match_limit < re->limit_match)?
6820 mcontext->match_limit : re->limit_match;
6821
6822mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
6823 mcontext->depth_limit : re->limit_depth;
6824
6825/* If a pattern has very many capturing parentheses, the frame size may be very
6826large. Ensure that there are at least 10 available frames by getting an initial
6827vector on the heap if necessary, except when the heap limit prevents this. Get
6828fewer if possible. (The heap limit is in kibibytes.) */
6829
6830if (frame_size <= START_FRAMES_SIZE/10)
6831 {
6832 mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */
6833 mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
6834 }
6835else
6836 {
6837 mb->frame_vector_size = frame_size * 10;
6838 if ((mb->frame_vector_size / 1024) > mb->heap_limit)
6839 {
6840 if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
6841 mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
6842 }
6843 mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
6844 mb->memctl.memory_data);
6845 if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
6846 }
6847
6848mb->match_frames_top =
6849 (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
6850
6851/* Write to the ovector within the first frame to mark every capture unset and
6852to avoid uninitialized memory read errors when it is copied to a new frame. */
6853
6854memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006855 frame_size - offsetof(heapframe, ovector));
Elliott Hughes5b808042021-10-01 10:56:10 -07006856
6857/* Pointers to the individual character tables */
6858
6859mb->lcc = re->tables + lcc_offset;
6860mb->fcc = re->tables + fcc_offset;
6861mb->ctypes = re->tables + ctypes_offset;
6862
6863/* Set up the first code unit to match, if available. If there's no first code
6864unit there may be a bitmap of possible first characters. */
6865
6866if ((re->flags & PCRE2_FIRSTSET) != 0)
6867 {
6868 has_first_cu = TRUE;
6869 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
6870 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
6871 {
6872 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
6873#ifdef SUPPORT_UNICODE
6874#if PCRE2_CODE_UNIT_WIDTH == 8
6875 if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
6876#else
6877 if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
6878#endif
6879#endif /* SUPPORT_UNICODE */
6880 }
6881 }
6882else
6883 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
6884 start_bits = re->start_bitmap;
6885
6886/* There may also be a "last known required character" set. */
6887
6888if ((re->flags & PCRE2_LASTSET) != 0)
6889 {
6890 has_req_cu = TRUE;
6891 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
6892 if ((re->flags & PCRE2_LASTCASELESS) != 0)
6893 {
6894 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
6895#ifdef SUPPORT_UNICODE
6896#if PCRE2_CODE_UNIT_WIDTH == 8
6897 if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
6898#else
6899 if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
6900#endif
6901#endif /* SUPPORT_UNICODE */
6902 }
6903 }
6904
6905
6906/* ==========================================================================*/
6907
6908/* Loop for handling unanchored repeated matching attempts; for anchored regexes
6909the loop runs just once. */
6910
6911#ifdef SUPPORT_UNICODE
6912FRAGMENT_RESTART:
6913#endif
6914
6915start_partial = match_partial = NULL;
6916mb->hitend = FALSE;
6917
6918#if PCRE2_CODE_UNIT_WIDTH == 8
6919memchr_found_first_cu = NULL;
6920memchr_found_first_cu2 = NULL;
6921#endif
6922
6923for(;;)
6924 {
6925 PCRE2_SPTR new_start_match;
6926
6927 /* ----------------- Start of match optimizations ---------------- */
6928
6929 /* There are some optimizations that avoid running the match if a known
6930 starting point is not found, or if a known later code unit is not present.
6931 However, there is an option (settable at compile time) that disables these,
6932 for testing and for ensuring that all callouts do actually occur. */
6933
6934 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
6935 {
6936 /* If firstline is TRUE, the start of the match is constrained to the first
6937 line of a multiline string. That is, the match must be before or at the
6938 first newline following the start of matching. Temporarily adjust
6939 end_subject so that we stop the scans for a first code unit at a newline.
6940 If the match fails at the newline, later code breaks the loop. */
6941
6942 if (firstline)
6943 {
6944 PCRE2_SPTR t = start_match;
6945#ifdef SUPPORT_UNICODE
6946 if (utf)
6947 {
6948 while (t < end_subject && !IS_NEWLINE(t))
6949 {
6950 t++;
6951 ACROSSCHAR(t < end_subject, t, t++);
6952 }
6953 }
6954 else
6955#endif
6956 while (t < end_subject && !IS_NEWLINE(t)) t++;
6957 end_subject = t;
6958 }
6959
6960 /* Anchored: check the first code unit if one is recorded. This may seem
6961 pointless but it can help in detecting a no match case without scanning for
6962 the required code unit. */
6963
6964 if (anchored)
6965 {
6966 if (has_first_cu || start_bits != NULL)
6967 {
6968 BOOL ok = start_match < end_subject;
6969 if (ok)
6970 {
6971 PCRE2_UCHAR c = UCHAR21TEST(start_match);
6972 ok = has_first_cu && (c == first_cu || c == first_cu2);
6973 if (!ok && start_bits != NULL)
6974 {
6975#if PCRE2_CODE_UNIT_WIDTH != 8
6976 if (c > 255) c = 255;
6977#endif
6978 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
6979 }
6980 }
6981 if (!ok)
6982 {
6983 rc = MATCH_NOMATCH;
6984 break;
6985 }
6986 }
6987 }
6988
6989 /* Not anchored. Advance to a unique first code unit if there is one. */
6990
6991 else
6992 {
6993 if (has_first_cu)
6994 {
6995 if (first_cu != first_cu2) /* Caseless */
6996 {
6997 /* In 16-bit and 32-bit modes we have to do our own search, so can
6998 look for both cases at once. */
6999
7000#if PCRE2_CODE_UNIT_WIDTH != 8
7001 PCRE2_UCHAR smc;
7002 while (start_match < end_subject &&
7003 (smc = UCHAR21TEST(start_match)) != first_cu &&
7004 smc != first_cu2)
7005 start_match++;
7006#else
7007 /* In 8-bit mode, the use of memchr() gives a big speed up, even
7008 though we have to call it twice in order to find the earliest
7009 occurrence of the code unit in either of its cases. Caching is used
7010 to remember the positions of previously found code units. This can
7011 make a huge difference when the strings are very long and only one
7012 case is actually present. */
7013
7014 PCRE2_SPTR pp1 = NULL;
7015 PCRE2_SPTR pp2 = NULL;
7016 PCRE2_SIZE searchlength = end_subject - start_match;
7017
7018 /* If we haven't got a previously found position for first_cu, or if
7019 the current starting position is later, we need to do a search. If
7020 the code unit is not found, set it to the end. */
7021
7022 if (memchr_found_first_cu == NULL ||
7023 start_match > memchr_found_first_cu)
7024 {
7025 pp1 = memchr(start_match, first_cu, searchlength);
7026 memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
7027 }
7028
7029 /* If the start is before a previously found position, use the
7030 previous position, or NULL if a previous search failed. */
7031
7032 else pp1 = (memchr_found_first_cu == end_subject)? NULL :
7033 memchr_found_first_cu;
7034
7035 /* Do the same thing for the other case. */
7036
7037 if (memchr_found_first_cu2 == NULL ||
7038 start_match > memchr_found_first_cu2)
7039 {
7040 pp2 = memchr(start_match, first_cu2, searchlength);
7041 memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
7042 }
7043
7044 else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
7045 memchr_found_first_cu2;
7046
7047 /* Set the start to the end of the subject if neither case was found.
7048 Otherwise, use the earlier found point. */
7049
7050 if (pp1 == NULL)
7051 start_match = (pp2 == NULL)? end_subject : pp2;
7052 else
7053 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
7054
7055#endif /* 8-bit handling */
7056 }
7057
7058 /* The caseful case is much simpler. */
7059
7060 else
7061 {
7062#if PCRE2_CODE_UNIT_WIDTH != 8
7063 while (start_match < end_subject && UCHAR21TEST(start_match) !=
7064 first_cu)
7065 start_match++;
7066#else
7067 start_match = memchr(start_match, first_cu, end_subject - start_match);
7068 if (start_match == NULL) start_match = end_subject;
7069#endif
7070 }
7071
7072 /* If we can't find the required first code unit, having reached the
7073 true end of the subject, break the bumpalong loop, to force a match
7074 failure, except when doing partial matching, when we let the next cycle
7075 run at the end of the subject. To see why, consider the pattern
7076 /(?<=abc)def/, which partially matches "abc", even though the string
7077 does not contain the starting character "d". If we have not reached the
7078 true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
7079 temporarily modified) we also let the cycle run, because the matching
7080 string is legitimately allowed to start with the first code unit of a
7081 newline. */
7082
7083 if (mb->partial == 0 && start_match >= mb->end_subject)
7084 {
7085 rc = MATCH_NOMATCH;
7086 break;
7087 }
7088 }
7089
7090 /* If there's no first code unit, advance to just after a linebreak for a
7091 multiline match if required. */
7092
7093 else if (startline)
7094 {
7095 if (start_match > mb->start_subject + start_offset)
7096 {
7097#ifdef SUPPORT_UNICODE
7098 if (utf)
7099 {
7100 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7101 {
7102 start_match++;
7103 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
7104 }
7105 }
7106 else
7107#endif
7108 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7109 start_match++;
7110
7111 /* If we have just passed a CR and the newline option is ANY or
7112 ANYCRLF, and we are now at a LF, advance the match position by one
7113 more code unit. */
7114
7115 if (start_match[-1] == CHAR_CR &&
7116 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
7117 start_match < end_subject &&
7118 UCHAR21TEST(start_match) == CHAR_NL)
7119 start_match++;
7120 }
7121 }
7122
7123 /* If there's no first code unit or a requirement for a multiline line
7124 start, advance to a non-unique first code unit if any have been
7125 identified. The bitmap contains only 256 bits. When code units are 16 or
7126 32 bits wide, all code units greater than 254 set the 255 bit. */
7127
7128 else if (start_bits != NULL)
7129 {
7130 while (start_match < end_subject)
7131 {
7132 uint32_t c = UCHAR21TEST(start_match);
7133#if PCRE2_CODE_UNIT_WIDTH != 8
7134 if (c > 255) c = 255;
7135#endif
7136 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
7137 start_match++;
7138 }
7139
7140 /* See comment above in first_cu checking about the next few lines. */
7141
7142 if (mb->partial == 0 && start_match >= mb->end_subject)
7143 {
7144 rc = MATCH_NOMATCH;
7145 break;
7146 }
7147 }
7148 } /* End first code unit handling */
7149
7150 /* Restore fudged end_subject */
7151
7152 end_subject = mb->end_subject;
7153
7154 /* The following two optimizations must be disabled for partial matching. */
7155
7156 if (mb->partial == 0)
7157 {
7158 PCRE2_SPTR p;
7159
7160 /* The minimum matching length is a lower bound; no string of that length
7161 may actually match the pattern. Although the value is, strictly, in
7162 characters, we treat it as code units to avoid spending too much time in
7163 this optimization. */
7164
7165 if (end_subject - start_match < re->minlength)
7166 {
7167 rc = MATCH_NOMATCH;
7168 break;
7169 }
7170
7171 /* If req_cu is set, we know that that code unit must appear in the
7172 subject for the (non-partial) match to succeed. If the first code unit is
7173 set, req_cu must be later in the subject; otherwise the test starts at
7174 the match point. This optimization can save a huge amount of backtracking
7175 in patterns with nested unlimited repeats that aren't going to match.
7176 Writing separate code for caseful/caseless versions makes it go faster,
7177 as does using an autoincrement and backing off on a match. As in the case
7178 of the first code unit, using memchr() in the 8-bit library gives a big
7179 speed up. Unlike the first_cu check above, we do not need to call
7180 memchr() twice in the caseless case because we only need to check for the
7181 presence of the character in either case, not find the first occurrence.
7182
7183 The search can be skipped if the code unit was found later than the
7184 current starting point in a previous iteration of the bumpalong loop.
7185
7186 HOWEVER: when the subject string is very, very long, searching to its end
7187 can take a long time, and give bad performance on quite ordinary
7188 anchored patterns. This showed up when somebody was matching something
7189 like /^\d+C/ on a 32-megabyte string... so we don't do this when the
7190 string is sufficiently long, but it's worth searching a lot more for
7191 unanchored patterns. */
7192
7193 p = start_match + (has_first_cu? 1:0);
7194 if (has_req_cu && p > req_cu_ptr)
7195 {
7196 PCRE2_SIZE check_length = end_subject - start_match;
7197
7198 if (check_length < REQ_CU_MAX ||
7199 (!anchored && check_length < REQ_CU_MAX * 1000))
7200 {
7201 if (req_cu != req_cu2) /* Caseless */
7202 {
7203#if PCRE2_CODE_UNIT_WIDTH != 8
7204 while (p < end_subject)
7205 {
7206 uint32_t pp = UCHAR21INCTEST(p);
7207 if (pp == req_cu || pp == req_cu2) { p--; break; }
7208 }
7209#else /* 8-bit code units */
7210 PCRE2_SPTR pp = p;
7211 p = memchr(pp, req_cu, end_subject - pp);
7212 if (p == NULL)
7213 {
7214 p = memchr(pp, req_cu2, end_subject - pp);
7215 if (p == NULL) p = end_subject;
7216 }
7217#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
7218 }
7219
7220 /* The caseful case */
7221
7222 else
7223 {
7224#if PCRE2_CODE_UNIT_WIDTH != 8
7225 while (p < end_subject)
7226 {
7227 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
7228 }
7229
7230#else /* 8-bit code units */
7231 p = memchr(p, req_cu, end_subject - p);
7232 if (p == NULL) p = end_subject;
7233#endif
7234 }
7235
7236 /* If we can't find the required code unit, break the bumpalong loop,
7237 forcing a match failure. */
7238
7239 if (p >= end_subject)
7240 {
7241 rc = MATCH_NOMATCH;
7242 break;
7243 }
7244
7245 /* If we have found the required code unit, save the point where we
7246 found it, so that we don't search again next time round the bumpalong
7247 loop if the start hasn't yet passed this code unit. */
7248
7249 req_cu_ptr = p;
7250 }
7251 }
7252 }
7253 }
7254
7255 /* ------------ End of start of match optimizations ------------ */
7256
7257 /* Give no match if we have passed the bumpalong limit. */
7258
7259 if (start_match > bumpalong_limit)
7260 {
7261 rc = MATCH_NOMATCH;
7262 break;
7263 }
7264
7265 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
7266 first starting point for which a partial match was found. */
7267
7268 cb.start_match = (PCRE2_SIZE)(start_match - subject);
7269 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
7270
7271 mb->start_used_ptr = start_match;
7272 mb->last_used_ptr = start_match;
7273#ifdef SUPPORT_UNICODE
7274 mb->moptions = options | fragment_options;
7275#else
7276 mb->moptions = options;
7277#endif
7278 mb->match_call_count = 0;
7279 mb->end_offset_top = 0;
7280 mb->skip_arg_count = 0;
7281
7282 rc = match(start_match, mb->start_code, match_data->ovector,
7283 match_data->oveccount, re->top_bracket, frame_size, mb);
7284
7285 if (mb->hitend && start_partial == NULL)
7286 {
7287 start_partial = mb->start_used_ptr;
7288 match_partial = start_match;
7289 }
7290
7291 switch(rc)
7292 {
7293 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7294 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7295 entirely. The only way we can do that is to re-do the match at the same
7296 point, with a flag to force SKIP with an argument to be ignored. Just
7297 treating this case as NOMATCH does not work because it does not check other
7298 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7299
7300 case MATCH_SKIP_ARG:
7301 new_start_match = start_match;
7302 mb->ignore_skip_arg = mb->skip_arg_count;
7303 break;
7304
7305 /* SKIP passes back the next starting point explicitly, but if it is no
7306 greater than the match we have just done, treat it as NOMATCH. */
7307
7308 case MATCH_SKIP:
7309 if (mb->verb_skip_ptr > start_match)
7310 {
7311 new_start_match = mb->verb_skip_ptr;
7312 break;
7313 }
7314 /* Fall through */
7315
7316 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
7317 exactly like PRUNE. Unset ignore SKIP-with-argument. */
7318
7319 case MATCH_NOMATCH:
7320 case MATCH_PRUNE:
7321 case MATCH_THEN:
7322 mb->ignore_skip_arg = 0;
7323 new_start_match = start_match + 1;
7324#ifdef SUPPORT_UNICODE
7325 if (utf)
7326 ACROSSCHAR(new_start_match < end_subject, new_start_match,
7327 new_start_match++);
7328#endif
7329 break;
7330
7331 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7332
7333 case MATCH_COMMIT:
7334 rc = MATCH_NOMATCH;
7335 goto ENDLOOP;
7336
7337 /* Any other return is either a match, or some kind of error. */
7338
7339 default:
7340 goto ENDLOOP;
7341 }
7342
7343 /* Control reaches here for the various types of "no match at this point"
7344 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7345
7346 rc = MATCH_NOMATCH;
7347
7348 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
7349 newline in the subject (though it may continue over the newline). Therefore,
7350 if we have just failed to match, starting at a newline, do not continue. */
7351
7352 if (firstline && IS_NEWLINE(start_match)) break;
7353
7354 /* Advance to new matching position */
7355
7356 start_match = new_start_match;
7357
7358 /* Break the loop if the pattern is anchored or if we have passed the end of
7359 the subject. */
7360
7361 if (anchored || start_match > end_subject) break;
7362
7363 /* If we have just passed a CR and we are now at a LF, and the pattern does
7364 not contain any explicit matches for \r or \n, and the newline option is CRLF
7365 or ANY or ANYCRLF, advance the match position by one more code unit. In
7366 normal matching start_match will always be greater than the first position at
7367 this stage, but a failed *SKIP can cause a return at the same point, which is
7368 why the first test exists. */
7369
7370 if (start_match > subject + start_offset &&
7371 start_match[-1] == CHAR_CR &&
7372 start_match < end_subject &&
7373 *start_match == CHAR_NL &&
7374 (re->flags & PCRE2_HASCRORLF) == 0 &&
7375 (mb->nltype == NLTYPE_ANY ||
7376 mb->nltype == NLTYPE_ANYCRLF ||
7377 mb->nllen == 2))
7378 start_match++;
7379
7380 mb->mark = NULL; /* Reset for start of next match attempt */
7381 } /* End of for(;;) "bumpalong" loop */
7382
7383/* ==========================================================================*/
7384
7385/* When we reach here, one of the following stopping conditions is true:
7386
7387(1) The match succeeded, either completely, or partially;
7388
7389(2) The pattern is anchored or the match was failed after (*COMMIT);
7390
7391(3) We are past the end of the subject or the bumpalong limit;
7392
7393(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
7394 this option requests that a match occur at or before the first newline in
7395 the subject.
7396
7397(5) Some kind of error occurred.
7398
7399*/
7400
7401ENDLOOP:
7402
7403/* If end_subject != true_end_subject, it means we are handling invalid UTF,
7404and have just processed a non-terminal fragment. If this resulted in no match
7405or a partial match we must carry on to the next fragment (a partial match is
7406returned to the caller only at the very end of the subject). A loop is used to
7407avoid trying to match against empty fragments; if the pattern can match an
7408empty string it would have done so already. */
7409
7410#ifdef SUPPORT_UNICODE
7411if (utf && end_subject != true_end_subject &&
7412 (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
7413 {
7414 for (;;)
7415 {
7416 /* Advance past the first bad code unit, and then skip invalid character
7417 starting code units in 8-bit and 16-bit modes. */
7418
7419 start_match = end_subject + 1;
7420
7421#if PCRE2_CODE_UNIT_WIDTH != 32
7422 while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
7423 start_match++;
7424#endif
7425
7426 /* If we have hit the end of the subject, there isn't another non-empty
7427 fragment, so give up. */
7428
7429 if (start_match >= true_end_subject)
7430 {
7431 rc = MATCH_NOMATCH; /* In case it was partial */
7432 break;
7433 }
7434
7435 /* Check the rest of the subject */
7436
7437 mb->check_subject = start_match;
7438 rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
7439 &(match_data->startchar));
7440
7441 /* The rest of the subject is valid UTF. */
7442
7443 if (rc == 0)
7444 {
7445 mb->end_subject = end_subject = true_end_subject;
7446 fragment_options = PCRE2_NOTBOL;
7447 goto FRAGMENT_RESTART;
7448 }
7449
7450 /* A subsequent UTF error has been found; if the next fragment is
7451 non-empty, set up to process it. Otherwise, let the loop advance. */
7452
7453 else if (rc < 0)
7454 {
7455 mb->end_subject = end_subject = start_match + match_data->startchar;
7456 if (end_subject > start_match)
7457 {
7458 fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
7459 goto FRAGMENT_RESTART;
7460 }
7461 }
7462 }
7463 }
7464#endif /* SUPPORT_UNICODE */
7465
7466/* Release an enlarged frame vector that is on the heap. */
7467
7468if (mb->match_frames != mb->stack_frames)
7469 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
7470
7471/* Fill in fields that are always returned in the match data. */
7472
7473match_data->code = re;
7474match_data->mark = mb->mark;
7475match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
7476
7477/* Handle a fully successful match. Set the return code to the number of
7478captured strings, or 0 if there were too many to fit into the ovector, and then
7479set the remaining returned values before returning. Make a copy of the subject
7480string if requested. */
7481
7482if (rc == MATCH_MATCH)
7483 {
7484 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
7485 0 : (int)mb->end_offset_top/2 + 1;
7486 match_data->startchar = start_match - subject;
7487 match_data->leftchar = mb->start_used_ptr - subject;
7488 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
7489 mb->last_used_ptr : mb->end_match_ptr) - subject;
7490 if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
7491 {
7492 length = CU2BYTES(length + was_zero_terminated);
7493 match_data->subject = match_data->memctl.malloc(length,
7494 match_data->memctl.memory_data);
7495 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
7496 memcpy((void *)match_data->subject, subject, length);
7497 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
7498 }
7499 else match_data->subject = subject;
7500 return match_data->rc;
7501 }
7502
7503/* Control gets here if there has been a partial match, an error, or if the
7504overall match attempt has failed at all permitted starting positions. Any mark
7505data is in the nomatch_mark field. */
7506
7507match_data->mark = mb->nomatch_mark;
7508
7509/* For anything other than nomatch or partial match, just return the code. */
7510
7511if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
7512
7513/* Handle a partial match. If a "soft" partial match was requested, searching
7514for a complete match will have continued, and the value of rc at this point
7515will be MATCH_NOMATCH. For a "hard" partial match, it will already be
7516PCRE2_ERROR_PARTIAL. */
7517
7518else if (match_partial != NULL)
7519 {
7520 match_data->subject = subject;
7521 match_data->ovector[0] = match_partial - subject;
7522 match_data->ovector[1] = end_subject - subject;
7523 match_data->startchar = match_partial - subject;
7524 match_data->leftchar = start_partial - subject;
7525 match_data->rightchar = end_subject - subject;
7526 match_data->rc = PCRE2_ERROR_PARTIAL;
7527 }
7528
7529/* Else this is the classic nomatch case. */
7530
7531else match_data->rc = PCRE2_ERROR_NOMATCH;
7532
7533return match_data->rc;
7534}
7535
7536/* End of pcre2_match.c */