blob: bb141a0c501ea571f3a429e36d3d0563b8ea8c13 [file] [log] [blame]
Elliott Hughes5b808042021-10-01 10:56:10 -07001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41#ifdef HAVE_CONFIG_H
42#include "config.h"
43#endif
44
#include <stdint.h>
#include <stdio.h>
#include <string.h>
47
48#define PCRE2_CODE_UNIT_WIDTH 0
49#include "pcre2.h"
50
51/*
52 Letter characters:
53 \xe6\x92\xad = 0x64ad = 25773 (kanji)
54 Non-letter characters:
55 \xc2\xa1 = 0xa1 = (Inverted Exclamation Mark)
56 \xf3\xa9\xb7\x80 = 0xe9dc0 = 957888
57 \xed\xa0\x80 = 55296 = 0xd800 (Invalid UTF character)
58 \xed\xb0\x80 = 56320 = 0xdc00 (Invalid UTF character)
59 Newlines:
60 \xc2\x85 = 0x85 = 133 (NExt Line = NEL)
61 \xe2\x80\xa8 = 0x2028 = 8232 (Line Separator)
62 Othercase pairs:
63 \xc3\xa9 = 0xe9 = 233 (e')
64 \xc3\x89 = 0xc9 = 201 (E')
65 \xc3\xa1 = 0xe1 = 225 (a')
66 \xc3\x81 = 0xc1 = 193 (A')
67 \x53 = 0x53 = S
68 \x73 = 0x73 = s
69 \xc5\xbf = 0x17f = 383 (long S)
70 \xc8\xba = 0x23a = 570
71 \xe2\xb1\xa5 = 0x2c65 = 11365
72 \xe1\xbd\xb8 = 0x1f78 = 8056
73 \xe1\xbf\xb8 = 0x1ff8 = 8184
74 \xf0\x90\x90\x80 = 0x10400 = 66560
75 \xf0\x90\x90\xa8 = 0x10428 = 66600
76 \xc7\x84 = 0x1c4 = 452
77 \xc7\x85 = 0x1c5 = 453
78 \xc7\x86 = 0x1c6 = 454
79 Caseless sets:
80 ucp_Armenian - \x{531}-\x{556} -> \x{561}-\x{586}
81 ucp_Coptic - \x{2c80}-\x{2ce3} -> caseless: XOR 0x1
82 ucp_Latin - \x{ff21}-\x{ff3a} -> \x{ff41}-\x{ff5a}
83
84 Mark property:
85 \xcc\x8d = 0x30d = 781
86 Special:
87 \xc2\x80 = 0x80 = 128 (lowest 2 byte character)
88 \xdf\xbf = 0x7ff = 2047 (highest 2 byte character)
89 \xe0\xa0\x80 = 0x800 = 2048 (lowest 3 byte character)
90 \xef\xbf\xbf = 0xffff = 65535 (highest 3 byte character)
91 \xf0\x90\x80\x80 = 0x10000 = 65536 (lowest 4 byte character)
92 \xf4\x8f\xbf\xbf = 0x10ffff = 1114111 (highest allowed utf character)
93*/
94
static int regression_tests(void);
static int invalid_utf8_regression_tests(void);
static int invalid_utf16_regression_tests(void);
static int invalid_utf32_regression_tests(void);

/*
 * Program entry point.
 *
 * Queries the library for JIT support through the first code unit width
 * that was built (8, then 16, then 32) and refuses to run without it.
 * Otherwise runs the four regression suites in a fixed order.
 *
 * Returns 1 when JIT is unavailable, otherwise the bitwise OR of the
 * four suite results.
 */
int main(void)
{
	/* PCRE2_CONFIG_JIT is documented to store a uint32_t through the pointer. */
	uint32_t jit = 0;
	int failed;

#if defined SUPPORT_PCRE2_8
	pcre2_config_8(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_16
	pcre2_config_16(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_32
	pcre2_config_32(PCRE2_CONFIG_JIT, &jit);
#endif
	if (!jit) {
		printf("JIT must be enabled to run pcre2_jit_test\n");
		return 1;
	}

	/*
	 * Call the suites as separate statements: the operands of a single
	 * `a() | b() | ...` expression may be evaluated in any order, which
	 * would leave the order of the suites' output up to the compiler.
	 */
	failed = regression_tests();
	failed |= invalid_utf8_regression_tests();
	failed |= invalid_utf16_regression_tests();
	failed |= invalid_utf32_regression_tests();
	return failed;
}
119
120/* --------------------------------------------------------------------------------------- */
121
122#if !(defined SUPPORT_PCRE2_8) && !(defined SUPPORT_PCRE2_16) && !(defined SUPPORT_PCRE2_32)
123#error SUPPORT_PCRE2_8 or SUPPORT_PCRE2_16 or SUPPORT_PCRE2_32 must be defined
124#endif
125
/*
 * Shorthand names for the compile-option combinations used in the
 * regression table. Letter key: C = PCRE2_CASELESS, M = PCRE2_MULTILINE,
 * U = PCRE2_UTF, P = PCRE2_UCP.
 */
#define M	(PCRE2_MULTILINE)
#define U	(PCRE2_UTF)
#define MU	(PCRE2_UTF | PCRE2_MULTILINE)
#define MP	(PCRE2_UCP | PCRE2_MULTILINE)
#define MUP	(PCRE2_UCP | PCRE2_UTF | PCRE2_MULTILINE)
#define CM	(PCRE2_MULTILINE | PCRE2_CASELESS)
#define CMU	(PCRE2_UTF | PCRE2_MULTILINE | PCRE2_CASELESS)
#define CMUP	(PCRE2_UCP | PCRE2_UTF | PCRE2_MULTILINE | PCRE2_CASELESS)

/* Newline convention used by most rows of the regression table. */
#define A	PCRE2_NEWLINE_ANYCRLF

/*
 * The "newline" member of a test case packs two values into one int:
 * the newline convention in the low 16 bits and a \R (BSR) convention
 * shifted above them. BSR() builds the upper part; GET_BSR() and
 * GET_NEWLINE() take the word apart again.
 */
#define BSR(x)		((x) << 16)
#define GET_BSR(x)	((x) >> 16)
#define GET_NEWLINE(x)	((x) & 0xffff)

/*
 * The "start_offset" member of a test case keeps the offset itself in
 * the low 16 bits (OFFSET_MASK); the F_* flag bits are OR-ed in above it.
 * The flag meanings are not spelled out here; judging by the names and
 * the table rows, F_NOMATCH presumably marks a case that must not match
 * and F_PROPERTY a case that needs Unicode property support.
 */
#define OFFSET_MASK	((1 << 16) - 1)
#define F_NO8		(1 << 16)
#define F_NO16		(1 << 17)
/* NOTE(review): F_NO32 uses the same bit as F_NO16, so the two cannot be
   told apart. Confirm against the code that tests these flags whether
   that is intended before giving F_NO32 a bit of its own. */
#define F_NO32		(1 << 17)
#define F_NOMATCH	(1 << 18)
#define F_DIFF		(1 << 19)
#define F_FORCECONV	(1 << 20)
#define F_PROPERTY	(1 << 21)
149
/*
 * One row of the regression table: a pattern, the text it is run
 * against, and the option words that go with it. Rows are written as
 * positional initializers, so the member order must not change.
 */
struct regression_test_case {
	/* PCRE2 compile option bits (for example the MU / CMU shorthands). */
	int compile_options;
	/* Newline convention in the low 16 bits (GET_NEWLINE), with an
	   optional BSR() value shifted above it (GET_BSR). */
	int newline;
	/* Match-time option bits such as PCRE2_NOTBOL; 0 when none are set. */
	int match_options;
	/* Offset in the low 16 bits (OFFSET_MASK), OR-ed with F_* flags. */
	int start_offset;
	/* Regular expression source text. */
	const char *pattern;
	/* Text paired with the pattern; presumably the subject it is matched against. */
	const char *input;
};
158
159static struct regression_test_case regression_test_cases[] = {
160 /* Constant strings. */
161 { MU, A, 0, 0, "AbC", "AbAbC" },
162 { MU, A, 0, 0, "ACCEPT", "AACACCACCEACCEPACCEPTACCEPTT" },
163 { CMU, A, 0, 0, "aA#\xc3\xa9\xc3\x81", "aA#Aa#\xc3\x89\xc3\xa1" },
164 { M, A, 0, 0, "[^a]", "aAbB" },
165 { CM, A, 0, 0, "[^m]", "mMnN" },
166 { M, A, 0, 0, "a[^b][^#]", "abacd" },
167 { CM, A, 0, 0, "A[^B][^E]", "abacd" },
168 { CMU, A, 0, 0, "[^x][^#]", "XxBll" },
169 { MU, A, 0, 0, "[^a]", "aaa\xc3\xa1#Ab" },
170 { CMU, A, 0, 0, "[^A]", "aA\xe6\x92\xad" },
171 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\n+bc" },
172 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\r+bc" },
173 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\r+bc" },
174 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\n+bc" },
175 { MU, A, 0, 0, "[axd]", "sAXd" },
176 { CMU, A, 0, 0, "[axd]", "sAXd" },
177 { CMU, A, 0, 0 | F_NOMATCH, "[^axd]", "DxA" },
178 { MU, A, 0, 0, "[a-dA-C]", "\xe6\x92\xad\xc3\xa9.B" },
179 { MU, A, 0, 0, "[^a-dA-C]", "\xe6\x92\xad\xc3\xa9" },
180 { CMU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
181 { MU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
182 { MU, A, 0, 0, "[^a]", "\xc2\x80[]" },
183 { CMU, A, 0, 0, "\xf0\x90\x90\xa7", "\xf0\x90\x91\x8f" },
184 { CM, A, 0, 0, "1a2b3c4", "1a2B3c51A2B3C4" },
185 { PCRE2_CASELESS, 0, 0, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
186 { PCRE2_CASELESS, 0, 0, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
187 { PCRE2_CASELESS, 0, 0, 0, "a1", "Aa1" },
188#ifndef NEVER_BACKSLASH_C
189 { M, A, 0, 0, "\\Ca", "cda" },
190 { CM, A, 0, 0, "\\Ca", "CDA" },
191 { M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" },
192 { CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" },
193#endif /* !NEVER_BACKSLASH_C */
194 { CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
195 { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
196 { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
197 { CMUP, A, 0, 0, "\xe1\xbd\xb8\xe1\xbf\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
198 { M, A, 0, 0, "[3-57-9]", "5" },
199 { PCRE2_AUTO_CALLOUT, A, 0, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890",
200 "12345678901234567890123456789012345678901234567890123456789012345678901234567890" },
201
202 /* Assertions. */
203 { MU, A, 0, 0, "\\b[^A]", "A_B#" },
204 { M, A, 0, 0 | F_NOMATCH, "\\b\\W", "\n*" },
205 { MU, A, 0, 0, "\\B[^,]\\b[^s]\\b", "#X" },
206 { MP, A, 0, 0, "\\B", "_\xa1" },
207 { MP, A, 0, 0 | F_PROPERTY, "\\b_\\b[,A]\\B", "_," },
208 { MUP, A, 0, 0, "\\b", "\xe6\x92\xad!" },
209 { MUP, A, 0, 0, "\\B", "_\xc2\xa1\xc3\xa1\xc2\x85" },
210 { MUP, A, 0, 0, "\\b[^A]\\B[^c]\\b[^_]\\B", "_\xc3\xa1\xe2\x80\xa8" },
211 { MUP, A, 0, 0, "\\b\\w+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
212 { MU, A, 0, 0 | F_NOMATCH, "\\b.", "\xcd\xbe" },
213 { CMUP, A, 0, 0, "\\By", "\xf0\x90\x90\xa8y" },
214 { M, A, 0, 0 | F_NOMATCH, "\\R^", "\n" },
215 { M, A, 0, 1 | F_NOMATCH, "^", "\n" },
216 { 0, 0, 0, 0, "^ab", "ab" },
217 { 0, 0, 0, 0 | F_NOMATCH, "^ab", "aab" },
218 { M, PCRE2_NEWLINE_CRLF, 0, 0, "^a", "\r\raa\n\naa\r\naa" },
219 { MU, A, 0, 0, "^-", "\xe2\x80\xa8--\xc2\x85-\r\n-" },
220 { M, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--b--\x85--" },
221 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xe2\x80\xa8--" },
222 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xc2\x85--" },
223 { 0, 0, 0, 0, "ab$", "ab" },
224 { 0, 0, 0, 0 | F_NOMATCH, "ab$", "abab\n\n" },
225 { PCRE2_DOLLAR_ENDONLY, 0, 0, 0 | F_NOMATCH, "ab$", "abab\r\n" },
226 { M, PCRE2_NEWLINE_CRLF, 0, 0, "a$", "\r\raa\n\naa\r\naa" },
227 { M, PCRE2_NEWLINE_ANY, 0, 0, "a$", "aaa" },
228 { MU, PCRE2_NEWLINE_ANYCRLF, 0, 0, "#$", "#\xc2\x85###\r#" },
229 { MU, PCRE2_NEWLINE_ANY, 0, 0, "#$", "#\xe2\x80\xa9" },
230 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0 | F_NOMATCH, "^a", "aa\naa" },
231 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0, "^a", "aa\naa" },
232 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\naa" },
233 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\r\n" },
234 { U | PCRE2_DOLLAR_ENDONLY, PCRE2_NEWLINE_ANY, 0, 0 | F_PROPERTY, "\\p{Any}{2,}$", "aa\r\n" },
235 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0, "a$", "aa\naa" },
236 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa" },
237 { U, PCRE2_NEWLINE_CR, 0, 0, "a\\Z", "aaa\r" },
238 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa\n" },
239 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r" },
240 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\n" },
241 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r\n" },
242 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
243 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
244 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
245 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
246 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
247 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
248 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
249 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
250 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
251 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xc2\x85" },
252 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
253 { M, A, 0, 0, "\\Aa", "aaa" },
254 { M, A, 0, 1 | F_NOMATCH, "\\Aa", "aaa" },
255 { M, A, 0, 1, "\\Ga", "aaa" },
256 { M, A, 0, 1 | F_NOMATCH, "\\Ga", "aba" },
257 { M, A, 0, 0, "a\\z", "aaa" },
258 { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" },
259
260 /* Brackets and alternatives. */
261 { MU, A, 0, 0, "(ab|bb|cd)", "bacde" },
262 { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" },
263 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" },
264 { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" },
265 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" },
266 { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" },
267 { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
268 { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" },
269 { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
270 { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },
271 { U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" },
272 { U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" },
273 { CM, A, 0, 0, "ab|cd", "CD" },
274 { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" },
275 { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" },
276
277 /* Greedy and non-greedy ? operators. */
278 { MU, A, 0, 0, "(?:a)?a", "laab" },
279 { CMU, A, 0, 0, "(A)?A", "llaab" },
280 { MU, A, 0, 0, "(a)?\?a", "aab" }, /* ?? is the prefix of trygraphs in GCC. */
281 { MU, A, 0, 0, "(a)?a", "manm" },
282 { CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
283 { MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
284 { MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
285
286 /* Greedy and non-greedy + operators */
287 { MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
288 { MU, A, 0, 0, "(aa)+?aa", "aaaaaaa" },
289 { MU, A, 0, 0, "(?:aba|ab|a)+l", "ababamababal" },
290 { MU, A, 0, 0, "(?:aba|ab|a)+?l", "ababamababal" },
291 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+?|ss)+e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
292 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+|ss)+?e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
293 { MU, A, 0, 0, "(?:(b(c)+?)+)?\?(?:(bc)+|(cb)+)+(?:m)+", "bccbcccbcbccbcbPbccbcccbcbccbcbmmn" },
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700294 { MU, A, 0, 0, "(aa|bb){8,1000}", "abaabbaabbaabbaab_aabbaabbaabbaabbaabbaabb_" },
Elliott Hughes5b808042021-10-01 10:56:10 -0700295
296 /* Greedy and non-greedy * operators */
297 { CMU, A, 0, 0, "(?:AA)*AB", "aaaaaaamaaaaaaab" },
298 { MU, A, 0, 0, "(?:aa)*?ab", "aaaaaaamaaaaaaab" },
299 { MU, A, 0, 0, "(aa|ab)*ab", "aaabaaab" },
300 { CMU, A, 0, 0, "(aa|Ab)*?aB", "aaabaaab" },
301 { MU, A, 0, 0, "(a|b)*(?:a)*(?:b)*m", "abbbaaababanabbbaaababamm" },
302 { MU, A, 0, 0, "(a|b)*?(?:a)*?(?:b)*?m", "abbbaaababanabbbaaababamm" },
303 { M, A, 0, 0, "a(a(\\1*)a|(b)b+){0}a", "aa" },
304 { M, A, 0, 0, "((?:a|)*){0}a", "a" },
305
306 /* Combining ? + * operators */
307 { MU, A, 0, 0, "((bm)+)?\?(?:a)*(bm)+n|((am)+?)?(?:a)+(am)*n", "bmbmabmamaaamambmaman" },
308 { MU, A, 0, 0, "(((ab)?cd)*ef)+g", "abcdcdefcdefefmabcdcdefcdefefgg" },
309 { MU, A, 0, 0, "(((ab)?\?cd)*?ef)+?g", "abcdcdefcdefefmabcdcdefcdefefgg" },
310 { MU, A, 0, 0, "(?:(ab)?c|(?:ab)+?d)*g", "ababcdccababddg" },
311 { MU, A, 0, 0, "(?:(?:ab)?\?c|(ab)+d)*?g", "ababcdccababddg" },
312
313 /* Single character iterators. */
314 { MU, A, 0, 0, "(a+aab)+aaaab", "aaaabcaaaabaabcaabcaaabaaaab" },
315 { MU, A, 0, 0, "(a*a*aab)+x", "aaaaabaabaaabmaabx" },
316 { MU, A, 0, 0, "(a*?(b|ab)a*?)+x", "aaaabcxbbaabaacbaaabaabax" },
317 { MU, A, 0, 0, "(a+(ab|ad)a+)+x", "aaabaaaadaabaaabaaaadaaax" },
318 { MU, A, 0, 0, "(a?(a)a?)+(aaa)", "abaaabaaaaaaaa" },
319 { MU, A, 0, 0, "(a?\?(a)a?\?)+(b)", "aaaacaaacaacacbaaab" },
320 { MU, A, 0, 0, "(a{0,4}(b))+d", "aaaaaabaabcaaaaabaaaaabd" },
321 { MU, A, 0, 0, "(a{0,4}?[^b])+d+(a{0,4}[^b])d+", "aaaaadaaaacaadddaaddd" },
322 { MU, A, 0, 0, "(ba{2})+c", "baabaaabacbaabaac" },
323 { MU, A, 0, 0, "(a*+bc++)+", "aaabbcaaabcccab" },
324 { MU, A, 0, 0, "(a?+[^b])+", "babaacacb" },
325 { MU, A, 0, 0, "(a{0,3}+b)(a{0,3}+b)(a{0,3}+)[^c]", "abaabaaacbaabaaaac" },
326 { CMU, A, 0, 0, "([a-c]+[d-f]+?)+?g", "aBdacdehAbDaFgA" },
327 { CMU, A, 0, 0, "[c-f]+k", "DemmFke" },
328 { MU, A, 0, 0, "([DGH]{0,4}M)+", "GGDGHDGMMHMDHHGHM" },
329 { MU, A, 0, 0, "([a-c]{4,}s)+", "abasabbasbbaabsbba" },
330 { CMU, A, 0, 0, "[ace]{3,7}", "AcbDAcEEcEd" },
331 { CMU, A, 0, 0, "[ace]{3,7}?", "AcbDAcEEcEd" },
332 { CMU, A, 0, 0, "[ace]{3,}", "AcbDAcEEcEd" },
333 { CMU, A, 0, 0, "[ace]{3,}?", "AcbDAcEEcEd" },
334 { MU, A, 0, 0, "[ckl]{2,}?g", "cdkkmlglglkcg" },
335 { CMU, A, 0, 0, "[ace]{5}?", "AcCebDAcEEcEd" },
336 { MU, A, 0, 0, "([AbC]{3,5}?d)+", "BACaAbbAEAACCbdCCbdCCAAbb" },
337 { MU, A, 0, 0, "([^ab]{0,}s){2}", "abaabcdsABamsDDs" },
338 { MU, A, 0, 0, "\\b\\w+\\B", "x,a_cd" },
339 { MUP, A, 0, 0, "\\b[^\xc2\xa1]+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
340 { CMU, A, 0, 0, "[^b]+(a*)([^c]?d{3})", "aaaaddd" },
341 { CMUP, A, 0, 0, "\xe1\xbd\xb8{2}", "\xe1\xbf\xb8#\xe1\xbf\xb8\xe1\xbd\xb8" },
342 { CMU, A, 0, 0, "[^\xf0\x90\x90\x80]{2,4}@", "\xf0\x90\x90\xa8\xf0\x90\x90\x80###\xf0\x90\x90\x80@@@" },
343 { CMU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
344 { MU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
345 { MU, A, 0, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" },
346 { MU, A, 0, 0, "\\d+123", "987654321,01234" },
347 { MU, A, 0, 0, "abcd*|\\w+xy", "aaaaa,abxyz" },
348 { MU, A, 0, 0, "(?:abc|((?:amc|\\b\\w*xy)))", "aaaaa,abxyz" },
349 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.abcd#."},
350 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.mbcd#."},
351 { MU, A, 0, 0, ".[ab]*.", "xx" },
352 { MU, A, 0, 0, ".[ab]*a", "xxa" },
353 { MU, A, 0, 0, ".[ab]?.", "xx" },
354 { MU, A, 0, 0, "_[ab]+_*a", "_aa" },
355 { MU, A, 0, 0, "#(A+)#\\d+", "#A#A#0" },
Elliott Hughes16619d62021-10-29 12:10:38 -0700356 { MU, A, 0, 0, "(?P<size>\\d+)m|M", "4M" },
Elliott Hughes5b808042021-10-01 10:56:10 -0700357
358 /* Bracket repeats with limit. */
359 { MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
360 { MU, A, 0, 0, "(?:ab|abab){1,5}M", "abababababababababababM" },
361 { MU, A, 0, 0, "(?>ab|abab){1,5}M", "abababababababababababM" },
362 { MU, A, 0, 0, "(?:ab|abab){1,5}?M", "abababababababababababM" },
363 { MU, A, 0, 0, "(?>ab|abab){1,5}?M", "abababababababababababM" },
364 { MU, A, 0, 0, "(?:(ab){1,4}?){1,3}?M", "abababababababababababababM" },
365 { MU, A, 0, 0, "(?:(ab){1,4}){1,3}abababababababababababM", "ababababababababababababM" },
366 { MU, A, 0, 0 | F_NOMATCH, "(?:(ab){1,4}){1,3}abababababababababababM", "abababababababababababM" },
367 { MU, A, 0, 0, "(ab){4,6}?M", "abababababababM" },
368
369 /* Basic character sets. */
370 { MU, A, 0, 0, "(?:\\s)+(?:\\S)+", "ab \t\xc3\xa9\xe6\x92\xad " },
371 { MU, A, 0, 0, "(\\w)*(k)(\\W)?\?", "abcdef abck11" },
372 { MU, A, 0, 0, "\\((\\d)+\\)\\D", "a() (83 (8)2 (9)ab" },
373 { MU, A, 0, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
374 { MU, A, 0, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
375 { MU, A, 0, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
376 { MU, A, 0, 0, "x[bcef]+", "xaxdxecbfg" },
377 { MU, A, 0, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
378 { MU, A, 0, 0, "x[^befg]+", "xbxexacdhg" },
379 { MU, A, 0, 0, "x[^bcdl]+", "xlxbxaekmd" },
380 { MU, A, 0, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
381 { MU, A, 0, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
382 { CMU, A, 0, 0, "\\x{e9}+", "#\xf0\x90\x90\xa8\xc3\xa8\xc3\xa9\xc3\x89\xc3\x88" },
383 { CMU, A, 0, 0, "[^\\x{e9}]+", "\xc3\xa9#\xf0\x90\x90\xa8\xc3\xa8\xc3\x88\xc3\x89" },
384 { MU, A, 0, 0, "[\\x02\\x7e]+", "\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x02\x7e\x7f" },
385 { MU, A, 0, 0, "[^\\x02\\x7e]+", "\x02\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x7f\x7e" },
386 { MU, A, 0, 0, "[\\x{81}-\\x{7fe}]+", "#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xc2\x81\xdf\xbe\xdf\xbf" },
387 { MU, A, 0, 0, "[^\\x{81}-\\x{7fe}]+", "\xc2\x81#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xdf\xbf\xdf\xbe" },
388 { MU, A, 0, 0, "[\\x{801}-\\x{fffe}]+", "#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xe0\xa0\x81\xef\xbf\xbe\xef\xbf\xbf" },
389 { MU, A, 0, 0, "[^\\x{801}-\\x{fffe}]+", "\xe0\xa0\x81#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xef\xbf\xbf\xef\xbf\xbe" },
390 { MU, A, 0, 0, "[\\x{10001}-\\x{10fffe}]+", "#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf0\x90\x80\x81\xf4\x8f\xbf\xbe\xf4\x8f\xbf\xbf" },
391 { MU, A, 0, 0, "[^\\x{10001}-\\x{10fffe}]+", "\xf0\x90\x80\x81#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbe" },
392 { CMU, A, 0, 0 | F_NOMATCH, "^[\\x{0100}-\\x{017f}]", " " },
393
394 /* Unicode properties. */
395 { MUP, A, 0, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
396 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
397 { MUP, A, 0, 0, "[\\Wd-h_x-z]+", "a\xc2\xa1#_yhzdxi" },
398 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}]", "abc" },
399 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}]", "abc" },
400 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
401 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
402 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
403 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
404 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
405 { MUP, A, 0, 0 | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
406 { MUP, A, 0, 0, "[b-\xc3\xa9\\s]", "a\xc\xe6\x92\xad" },
407 { CMUP, A, 0, 0, "[\xc2\x85-\xc2\x89\xc3\x89]", "\xc2\x84\xc3\xa9" },
408 { MUP, A, 0, 0, "[^b-d^&\\s]{3,}", "db^ !a\xe2\x80\xa8_ae" },
409 { MUP, A, 0, 0 | F_PROPERTY, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
410 { MU, A, 0, 0 | F_PROPERTY, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
411 { CMUP, A, 0, 0, "[\xc3\xa1-\xc3\xa9_\xe2\x80\xa0-\xe2\x80\xaf]{1,5}[^\xe2\x80\xa0-\xe2\x80\xaf]", "\xc2\xa1\xc3\x89\xc3\x89\xe2\x80\xaf_\xe2\x80\xa0" },
412 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
413 { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
414 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
415 { MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700416 { MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
417 { MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
418 { CMUP, 0, 0, 0, "[^S]\\B", "\xe2\x80\x8a" },
Elliott Hughes5b808042021-10-01 10:56:10 -0700419
420 /* Possible empty brackets. */
421 { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
422 { MU, A, 0, 0, "(|ab||bc|a)+d", "abcxabcabd" },
423 { MU, A, 0, 0, "(?:|ab||bc|a)*d", "abcxabcabd" },
424 { MU, A, 0, 0, "(|ab||bc|a)*d", "abcxabcabd" },
425 { MU, A, 0, 0, "(?:|ab||bc|a)+?d", "abcxabcabd" },
426 { MU, A, 0, 0, "(|ab||bc|a)+?d", "abcxabcabd" },
427 { MU, A, 0, 0, "(?:|ab||bc|a)*?d", "abcxabcabd" },
428 { MU, A, 0, 0, "(|ab||bc|a)*?d", "abcxabcabd" },
429 { MU, A, 0, 0, "(((a)*?|(?:ba)+)+?|(?:|c|ca)*)*m", "abaacaccabacabalabaacaccabacabamm" },
430 { MU, A, 0, 0, "(?:((?:a)*|(ba)+?)+|(|c|ca)*?)*?m", "abaacaccabacabalabaacaccabacabamm" },
431
432 /* Start offset. */
433 { MU, A, 0, 3, "(\\d|(?:\\w)*\\w)+", "0ac01Hb" },
434 { MU, A, 0, 4 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
435 { MU, A, 0, 2 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
436 { MU, A, 0, 1, "(\\w\\W\\w)+", "ab#d" },
437
438 /* Newline. */
439 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
440 { M, PCRE2_NEWLINE_CR, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
441 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{1,3}[^#]", "\r\n##...." },
442 { MU, A, PCRE2_NO_UTF_CHECK, 1, "^.a", "\n\x80\nxa" },
443 { MU, A, 0, 1, "^", "\r\n" },
444 { M, PCRE2_NEWLINE_CRLF, 0, 1 | F_NOMATCH, "^", "\r\n" },
445 { M, PCRE2_NEWLINE_CRLF, 0, 1, "^", "\r\na" },
446
447 /* Any character except newline or any newline. */
448 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
449 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".(.).", "a\xc3\xa1\r\n\n\r\r" },
450 { 0, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
451 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
452 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.).", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa9$de" },
453 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0 | F_NOMATCH, ".(.).", "\xe2\x80\xa8\nb\r" },
454 { 0, PCRE2_NEWLINE_ANY, 0, 0, "(.)(.)", "#\x85#\r#\n#\r\n#\x84" },
455 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.+)#", "#\rMn\xc2\x85#\n###" },
456 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\r" },
457 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\x85#\r\n#" },
458 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\xe2\x80\xa8#c" },
459 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\r\nc" },
460 { U, PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE), 0, 0, "(\\R.)+", "\xc2\x85\r\n#\xe2\x80\xa8\n\r\n\r" },
461 { MU, A, 0, 0 | F_NOMATCH, "\\R+", "ab" },
462 { MU, A, 0, 0, "\\R+", "ab\r\n\r" },
463 { MU, A, 0, 0, "\\R*", "ab\r\n\r" },
464 { MU, A, 0, 0, "\\R*", "\r\n\r" },
465 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\r\r" },
466 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\n\n\n\r\r\r" },
467 { MU, A, 0, 0, "\\R{2,}", "\r\nab\n\n\n\r\r\r" },
468 { MU, A, 0, 0, "\\R{0,3}", "\r\n\r\n\r\n\r\n\r\n" },
469 { MU, A, 0, 0 | F_NOMATCH, "\\R+\\R\\R", "\r\n\r\n" },
470 { MU, A, 0, 0, "\\R+\\R\\R", "\r\r\r" },
471 { MU, A, 0, 0, "\\R*\\R\\R", "\n\r" },
472 { MU, A, 0, 0 | F_NOMATCH, "\\R{2,4}\\R\\R", "\r\r\r" },
473 { MU, A, 0, 0, "\\R{2,4}\\R\\R", "\r\r\r\r" },
474
475 /* Atomic groups (no fallback from "next" direction). */
476 { MU, A, 0, 0 | F_NOMATCH, "(?>ab)ab", "bab" },
477 { MU, A, 0, 0 | F_NOMATCH, "(?>(ab))ab", "bab" },
478 { MU, A, 0, 0, "(?>ab)+abc(?>de)*def(?>gh)?ghe(?>ij)+?k(?>lm)*?n(?>op)?\?op",
479 "bababcdedefgheijijklmlmnop" },
480 { MU, A, 0, 0, "(?>a(b)+a|(ab)?\?(b))an", "abban" },
481 { MU, A, 0, 0, "(?>ab+a|(?:ab)?\?b)an", "abban" },
482 { MU, A, 0, 0, "((?>ab|ad|)*?)(?>|c)*abad", "abababcababad" },
483 { MU, A, 0, 0, "(?>(aa|b|)*+(?>(##)|###)*d|(aa)(?>(baa)?)m)", "aabaa#####da" },
484 { MU, A, 0, 0, "((?>a|)+?)b", "aaacaaab" },
485 { MU, A, 0, 0, "(?>x|)*$", "aaa" },
486 { MU, A, 0, 0, "(?>(x)|)*$", "aaa" },
487 { MU, A, 0, 0, "(?>x|())*$", "aaa" },
488 { MU, A, 0, 0, "((?>[cxy]a|[a-d])*?)b", "aaa+ aaab" },
489 { MU, A, 0, 0, "((?>[cxy](a)|[a-d])*?)b", "aaa+ aaab" },
490 { MU, A, 0, 0, "(?>((?>(a+))))bab|(?>((?>(a+))))bb", "aaaabaaabaabab" },
491 { MU, A, 0, 0, "(?>(?>a+))bab|(?>(?>a+))bb", "aaaabaaabaabab" },
492 { MU, A, 0, 0, "(?>(a)c|(?>(c)|(a))a)b*?bab", "aaaabaaabaabab" },
493 { MU, A, 0, 0, "(?>ac|(?>c|a)a)b*?bab", "aaaabaaabaabab" },
494 { MU, A, 0, 0, "(?>(b)b|(a))*b(?>(c)|d)?x", "ababcaaabdbx" },
495 { MU, A, 0, 0, "(?>bb|a)*b(?>c|d)?x", "ababcaaabdbx" },
496 { MU, A, 0, 0, "(?>(bb)|a)*b(?>c|(d))?x", "ababcaaabdbx" },
497 { MU, A, 0, 0, "(?>(a))*?(?>(a))+?(?>(a))??x", "aaaaaacccaaaaabax" },
498 { MU, A, 0, 0, "(?>a)*?(?>a)+?(?>a)??x", "aaaaaacccaaaaabax" },
499 { MU, A, 0, 0, "(?>(a)|)*?(?>(a)|)+?(?>(a)|)??x", "aaaaaacccaaaaabax" },
500 { MU, A, 0, 0, "(?>a|)*?(?>a|)+?(?>a|)??x", "aaaaaacccaaaaabax" },
501 { MU, A, 0, 0, "(?>a(?>(a{0,2}))*?b|aac)+b", "aaaaaaacaaaabaaaaacaaaabaacaaabb" },
502 { CM, A, 0, 0, "(?>((?>a{32}|b+|(a*))?(?>c+|d*)?\?)+e)+?f", "aaccebbdde bbdaaaccebbdee bbdaaaccebbdeef" },
503 { MU, A, 0, 0, "(?>(?:(?>aa|a||x)+?b|(?>aa|a||(x))+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
504 { MU, A, 0, 0, "(?>(?:(?>aa|a||(x))+?b|(?>aa|a||x)+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
505 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d" },
506 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
507 { MU, A, 0, 0 | F_PROPERTY, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
508 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}", "abcdef" },
509 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}?", "abcdef" },
510 { MU, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d##" },
511 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
512 { MU, A, 0, 0, "(c(ab)?+ab)+", "cabcababcab" },
513 { MU, A, 0, 0, "(?>(a+)b)+aabab", "aaaabaaabaabab" },
514
515 /* Possessive quantifiers. */
516 { MU, A, 0, 0, "(?:a|b)++m", "mababbaaxababbaam" },
517 { MU, A, 0, 0, "(?:a|b)*+m", "mababbaaxababbaam" },
518 { MU, A, 0, 0, "(?:a|b)*+m", "ababbaaxababbaam" },
519 { MU, A, 0, 0, "(a|b)++m", "mababbaaxababbaam" },
520 { MU, A, 0, 0, "(a|b)*+m", "mababbaaxababbaam" },
521 { MU, A, 0, 0, "(a|b)*+m", "ababbaaxababbaam" },
522 { MU, A, 0, 0, "(a|b(*ACCEPT))++m", "maaxab" },
523 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxm" },
524 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxbbm" },
525 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxm" },
526 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxbbm" },
527 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxm" },
528 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxbbm" },
529 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxm" },
530 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxbbm" },
531 { MU, A, 0, 0, "(?:a|(b))++m", "mababbaaxababbaam" },
532 { MU, A, 0, 0, "(?:(a)|b)*+m", "mababbaaxababbaam" },
533 { MU, A, 0, 0, "(?:(a)|(b))*+m", "ababbaaxababbaam" },
534 { MU, A, 0, 0, "(a|(b))++m", "mababbaaxababbaam" },
535 { MU, A, 0, 0, "((a)|b)*+m", "mababbaaxababbaam" },
536 { MU, A, 0, 0, "((a)|(b))*+m", "ababbaaxababbaam" },
537 { MU, A, 0, 0, "(a|(b)(*ACCEPT))++m", "maaxab" },
538 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxm" },
539 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxbbm" },
540 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxm" },
541 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxbbm" },
542 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxm" },
543 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxbbm" },
544 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxm" },
545 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxbbm" },
546 { MU, A, 0, 0 | F_NOMATCH, "(?>(b{2,4}))(?:(?:(aa|c))++m|(?:(aa|c))+n)", "bbaacaaccaaaacxbbbmbn" },
547 { MU, A, 0, 0, "((?:b)++a)+(cd)*+m", "bbababbacdcdnbbababbacdcdm" },
548 { MU, A, 0, 0, "((?:(b))++a)+((c)d)*+m", "bbababbacdcdnbbababbacdcdm" },
549 { MU, A, 0, 0, "(?:(?:(?:ab)*+k)++(?:n(?:cd)++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
550 { MU, A, 0, 0, "(?:((ab)*+(k))++(n(?:c(d))++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
551
552 /* Back references. */
553 { MU, A, 0, 0, "(aa|bb)(\\1*)(ll|)(\\3*)bbbbbbc", "aaaaaabbbbbbbbc" },
554 { CMU, A, 0, 0, "(aa|bb)(\\1+)(ll|)(\\3+)bbbbbbc", "bBbbBbCbBbbbBbbcbbBbbbBBbbC" },
555 { CM, A, 0, 0, "(a{2,4})\\1", "AaAaaAaA" },
556 { MU, A, 0, 0, "(aa|bb)(\\1?)aa(\\1?)(ll|)(\\4+)bbc", "aaaaaaaabbaabbbbaabbbbc" },
557 { MU, A, 0, 0, "(aa|bb)(\\1{0,5})(ll|)(\\3{0,5})cc", "bbxxbbbbxxaaaaaaaaaaaaaaaacc" },
558 { MU, A, 0, 0, "(aa|bb)(\\1{3,5})(ll|)(\\3{3,5})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
559 { MU, A, 0, 0, "(aa|bb)(\\1{3,})(ll|)(\\3{3,})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
560 { MU, A, 0, 0, "(\\w+)b(\\1+)c", "GabGaGaDbGaDGaDc" },
561 { MU, A, 0, 0, "(?:(aa)|b)\\1?b", "bb" },
562 { CMU, A, 0, 0, "(aa|bb)(\\1*?)aa(\\1+?)", "bBBbaaAAaaAAaa" },
563 { MU, A, 0, 0, "(aa|bb)(\\1*?)(dd|)cc(\\3+?)", "aaaaaccdd" },
564 { CMU, A, 0, 0, "(?:(aa|bb)(\\1?\?)cc){2}(\\1?\?)", "aAaABBbbAAaAcCaAcCaA" },
565 { MU, A, 0, 0, "(?:(aa|bb)(\\1{3,5}?)){2}(dd|)(\\3{3,5}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
566 { CM, A, 0, 0, "(?:(aa|bb)(\\1{3,}?)){2}(dd|)(\\3{3,}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
567 { MU, A, 0, 0, "(?:(aa|bb)(\\1{0,3}?)){2}(dd|)(\\3{0,3}?)b(\\1{0,3}?)(\\1{0,3})", "aaaaaaaaaaaaaaabaaaaa" },
568 { MU, A, 0, 0, "(a(?:\\1|)a){3}b", "aaaaaaaaaaab" },
569 { M, A, 0, 0, "(a?)b(\\1\\1*\\1+\\1?\\1*?\\1+?\\1??\\1*+\\1++\\1?+\\1{4}\\1{3,5}\\1{4,}\\1{0,5}\\1{3,5}?\\1{4,}?\\1{0,5}?\\1{3,5}+\\1{4,}+\\1{0,5}+#){2}d", "bb#b##d" },
570 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
571 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{0,2}", "wwwww." },
572 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwww" },
573 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwwww" },
574 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
575 { CMUP, A, 0, 0, "(\xf0\x90\x90\x80)\\1", "\xf0\x90\x90\xa8\xf0\x90\x90\xa8" },
576 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
577 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
578 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>*(?<A>aa)(?<A>bb)", "aabb" },
579 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{0,3}aaaaaa", "aabbaaaaaa" },
580 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{2,5}bb", "aabbaaaabb" },
581 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}m", "aaaaaaaabbbbaabbbbm" },
582 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
583 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
584 { MU | PCRE2_DUPNAMES, A, 0, 0, "\\k<A>*?(?<A>aa)(?<A>bb)", "aabb" },
585 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
586 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>*?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
587 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
588 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}M", "aaaaaaaabbbbaabbbbm" },
589 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{1,3}M", "aaaaaaaabbbbaabbbbm" },
590 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}?M", "aaaaaabbbbbbaabbbbbbbbbbm" },
591 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
592
593 /* Assertions. */
594 { MU, A, 0, 0, "(?=xx|yy|zz)\\w{4}", "abczzdefg" },
595 { MU, A, 0, 0, "(?=((\\w+)b){3}|ab)", "dbbbb ab" },
596 { MU, A, 0, 0, "(?!ab|bc|cd)[a-z]{2}", "Xabcdef" },
597 { MU, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
598 { MU, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
599 { M, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
600 { M, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
601 { MU, A, 0, 0, "(\\d{2})(?!\\w+c|(((\\w?)m){2}n)+|\\1)", "x5656" },
602 { MU, A, 0, 0, "((?=((\\d{2,6}\\w){2,}))\\w{5,20}K){2,}", "567v09708K12l00M00 567v09708K12l00M00K45K" },
603 { MU, A, 0, 0, "(?=(?:(?=\\S+a)\\w*(b)){3})\\w+\\d", "bba bbab nbbkba nbbkba0kl" },
604 { MU, A, 0, 0, "(?>a(?>(b+))a(?=(..)))*?k", "acabbcabbaabacabaabbakk" },
605 { MU, A, 0, 0, "((?(?=(a))a)+k)", "bbak" },
606 { MU, A, 0, 0, "((?(?=a)a)+k)", "bbak" },
607 { MU, A, 0, 0 | F_NOMATCH, "(?=(?>(a))m)amk", "a k" },
608 { MU, A, 0, 0 | F_NOMATCH, "(?!(?>(a))m)amk", "a k" },
609 { MU, A, 0, 0 | F_NOMATCH, "(?>(?=(a))am)amk", "a k" },
610 { MU, A, 0, 0, "(?=(?>a|(?=(?>(b+))a|c)[a-c]+)*?m)[a-cm]+k", "aaam bbam baaambaam abbabba baaambaamk" },
611 { MU, A, 0, 0, "(?> ?\?\\b(?(?=\\w{1,4}(a))m)\\w{0,8}bc){2,}?", "bca ssbc mabd ssbc mabc" },
612 { MU, A, 0, 0, "(?:(?=ab)?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
613 { MU, A, 0, 0, "(?:(?=a(b))?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
614 { MU, A, 0, 0, "(?:(?=.(.))??\\1.)+m", "aabbbcbacccanaabbbcbacccam" },
615 { MU, A, 0, 0, "(?:(?=.)??[a-c])+m", "abacdcbacacdcaccam" },
616 { MU, A, 0, 0, "((?!a)?(?!([^a]))?)+$", "acbab" },
617 { MU, A, 0, 0, "((?!a)?\?(?!([^a]))?\?)+$", "acbab" },
618 { MU, A, 0, 0, "a(?=(?C)\\B(?C`x`))b", "ab" },
619 { MU, A, 0, 0, "a(?!(?C)\\B(?C`x`))bb|ab", "abb" },
620 { MU, A, 0, 0, "a(?=\\b|(?C)\\B(?C`x`))b", "ab" },
621 { MU, A, 0, 0, "a(?!\\b|(?C)\\B(?C`x`))bb|ab", "abb" },
622 { MU, A, 0, 0, "c(?(?=(?C)\\B(?C`x`))ab|a)", "cab" },
623 { MU, A, 0, 0, "c(?(?!(?C)\\B(?C`x`))ab|a)", "cab" },
624 { MU, A, 0, 0, "c(?(?=\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
625 { MU, A, 0, 0, "c(?(?!\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
626 { MU, A, 0, 0, "a(?=)b", "ab" },
627 { MU, A, 0, 0 | F_NOMATCH, "a(?!)b", "ab" },
628
629 /* Not empty, ACCEPT, FAIL */
630 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*", "bcx" },
631 { MU, A, PCRE2_NOTEMPTY, 0, "a*", "bcaad" },
632 { MU, A, PCRE2_NOTEMPTY, 0, "a*?", "bcaad" },
633 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*", "bcaad" },
634 { MU, A, 0, 0, "a(*ACCEPT)b", "ab" },
635 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*(*ACCEPT)b", "bcx" },
636 { MU, A, PCRE2_NOTEMPTY, 0, "a*(*ACCEPT)b", "bcaad" },
637 { MU, A, PCRE2_NOTEMPTY, 0, "a*?(*ACCEPT)b", "bcaad" },
638 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?:z|a*(*ACCEPT)b)", "bcx" },
639 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*(*ACCEPT)b)", "bcaad" },
640 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*?(*ACCEPT)b)", "bcaad" },
641 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*(*ACCEPT)b", "bcx" },
642 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0 | F_NOMATCH, "a*(*ACCEPT)b", "" },
643 { MU, A, 0, 0, "((a(*ACCEPT)b))", "ab" },
644 { MU, A, 0, 0, "(a(*FAIL)a|a)", "aaa" },
645 { MU, A, 0, 0, "(?=ab(*ACCEPT)b)a", "ab" },
646 { MU, A, 0, 0, "(?=(?:x|ab(*ACCEPT)b))", "ab" },
647 { MU, A, 0, 0, "(?=(a(b(*ACCEPT)b)))a", "ab" },
648 { MU, A, PCRE2_NOTEMPTY, 0, "(?=a*(*ACCEPT))c", "c" },
649 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?=A)", "AB" },
650
651 /* Conditional blocks. */
652 { MU, A, 0, 0, "(?(?=(a))a|b)+k", "ababbalbbadabak" },
653 { MU, A, 0, 0, "(?(?!(b))a|b)+k", "ababbalbbadabak" },
654 { MU, A, 0, 0, "(?(?=a)a|b)+k", "ababbalbbadabak" },
655 { MU, A, 0, 0, "(?(?!b)a|b)+k", "ababbalbbadabak" },
656 { MU, A, 0, 0, "(?(?=(a))a*|b*)+k", "ababbalbbadabak" },
657 { MU, A, 0, 0, "(?(?!(b))a*|b*)+k", "ababbalbbadabak" },
658 { MU, A, 0, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
659 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
660 { MU, A, 0, 0 | F_DIFF, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
661 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
662 { MU, A, 0, 0, "(?(?=a)a*|b*)+k", "ababbalbbadabak" },
663 { MU, A, 0, 0, "(?(?!b)a*|b*)+k", "ababbalbbadabak" },
664 { MU, A, 0, 0, "(?(?=a)ab)", "a" },
665 { MU, A, 0, 0, "(?(?<!b)c)", "b" },
666 { MU, A, 0, 0, "(?(DEFINE)a(b))", "a" },
667 { MU, A, 0, 0, "a(?(DEFINE)(?:b|(?:c?)+)*)", "a" },
668 { MU, A, 0, 0, "(?(?=.[a-c])[k-l]|[A-D])", "kdB" },
669 { MU, A, 0, 0, "(?(?!.{0,4}[cd])(aa|bb)|(cc|dd))+", "aabbccddaa" },
670 { MU, A, 0, 0, "(?(?=[^#@]*@)(aaab|aa|aba)|(aba|aab)){3,}", "aaabaaaba#aaabaaaba#aaabaaaba@" },
671 { MU, A, 0, 0, "((?=\\w{5})\\w(?(?=\\w*k)\\d|[a-f_])*\\w\\s)+", "mol m10kk m088k _f_a_ mbkkl" },
672 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cdcaa" },
673 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cbb" },
674 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" },
675 { MU, A, 0, 0, "(?(?=a)(aaaa|a?))+aak", "aaaaab aaaaak" },
676 { MU, A, 0, 0, "(?(?!(b))(aaaa|a?))+aak", "aaaaab aaaaak" },
677 { MU, A, 0, 0, "(?(?!b)(aaaa|a?))+aak", "aaaaab aaaaak" },
678 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))a*)+aak", "aaaaab aaaaak" },
679 { MU, A, 0, 0, "(?(?=a)a*)+aak", "aaaaab aaaaak" },
680 { MU, A, 0, 0, "(?(?!(b))a*)+aak", "aaaaab aaaaak" },
681 { MU, A, 0, 0, "(?(?!b)a*)+aak", "aaaaab aaaaak" },
682 { MU, A, 0, 0, "(?(?=(?=(?!(x))a)aa)aaa|(?(?=(?!y)bb)bbb))*k", "abaabbaaabbbaaabbb abaabbaaabbbaaabbbk" },
683 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)*l", "bc ddd abccabccl" },
684 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+?dd", "bcabcacdb bdddd" },
685 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+l", "ababccddabdbccd abcccl" },
686 { MU, A, 0, 0, "((?:a|aa)(?(1)aaa))x", "aax" },
687 { MU, A, 0, 0, "(?(?!)a|b)", "ab" },
688 { MU, A, 0, 0, "(?(?!)a)", "ab" },
689 { MU, A, 0, 0 | F_NOMATCH, "(?(?!)a|b)", "ac" },
690
691 /* Set start of match. */
692 { MU, A, 0, 0, "(?:\\Ka)*aaaab", "aaaaaaaa aaaaaaabb" },
693 { MU, A, 0, 0, "(?>\\Ka\\Ka)*aaaab", "aaaaaaaa aaaaaaaaaabb" },
694 { MU, A, 0, 0, "a+\\K(?<=\\Gaa)a", "aaaaaa" },
695 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a\\K(*ACCEPT)b", "aa" },
696 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a\\K(*ACCEPT)b", "aa" },
697
698 /* First line. */
699 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_PROPERTY, "\\p{Any}a", "bb\naaa" },
700 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}a", "bb\r\naaa" },
701 { MU | PCRE2_FIRSTLINE, A, 0, 0, "(?<=a)", "a" },
702 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[^a][^b]", "ab" },
703 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "a", "\na" },
704 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[abc]", "\na" },
705 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^a", "\na" },
706 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^(?<=\n)", "\na" },
707 { MU | PCRE2_FIRSTLINE, A, 0, 0, "\xf0\x90\x90\x80", "\xf0\x90\x90\x80" },
708 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\xc2\x85#" },
709 { M | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\x85#" },
710 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "^#", "\xe2\x80\xa8#" },
711 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_PROPERTY, "\\p{Any}", "\r\na" },
712 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
713 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, "a", "\ra" },
714 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH, "ba", "bbb\r\nba" },
715 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}{4}|a", "\r\na" },
716 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 1, ".", "\r\n" },
717 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_LF, 0, 0 | F_NOMATCH, "ab.", "ab" },
718 { MU | PCRE2_FIRSTLINE, A, 0, 1 | F_NOMATCH, "^[a-d0-9]", "\nxx\nd" },
719 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_ANY, 0, 0, "....a", "012\n0a" },
720 { MU | PCRE2_FIRSTLINE, A, 0, 0, "[aC]", "a" },
721
722 /* Recurse. */
723 { MU, A, 0, 0, "(a)(?1)", "aa" },
724 { MU, A, 0, 0, "((a))(?1)", "aa" },
725 { MU, A, 0, 0, "(b|a)(?1)", "aa" },
726 { MU, A, 0, 0, "(b|(a))(?1)", "aa" },
727 { MU, A, 0, 0 | F_NOMATCH, "((a)(b)(?:a*))(?1)", "aba" },
728 { MU, A, 0, 0, "((a)(b)(?:a*))(?1)", "abab" },
729 { MU, A, 0, 0, "((a+)c(?2))b(?1)", "aacaabaca" },
730 { MU, A, 0, 0, "((?2)b|(a)){2}(?1)", "aabab" },
731 { MU, A, 0, 0, "(?1)(a)*+(?2)(b(?1))", "aababa" },
732 { MU, A, 0, 0, "(?1)(((a(*ACCEPT)))b)", "axaa" },
733 { MU, A, 0, 0, "(?1)(?(DEFINE) (((ac(*ACCEPT)))b) )", "akaac" },
734 { MU, A, 0, 0, "(a+)b(?1)b\\1", "abaaabaaaaa" },
735 { MU, A, 0, 0, "(?(DEFINE)(aa|a))(?1)ab", "aab" },
736 { MU, A, 0, 0, "(?(DEFINE)(a\\Kb))(?1)+ababc", "abababxabababc" },
737 { MU, A, 0, 0, "(a\\Kb)(?1)+ababc", "abababxababababc" },
738 { MU, A, 0, 0 | F_NOMATCH, "(a\\Kb)(?1)+ababc", "abababxababababxc" },
739 { MU, A, 0, 0, "b|<(?R)*>", "<<b>" },
740 { MU, A, 0, 0, "(a\\K){0}(?:(?1)b|ac)", "ac" },
741 { MU, A, 0, 0, "(?(DEFINE)(a(?2)|b)(b(?1)|(a)))(?:(?1)|(?2))m", "ababababnababababaam" },
742 { MU, A, 0, 0, "(a)((?(R)a|b))(?2)", "aabbabaa" },
743 { MU, A, 0, 0, "(a)((?(R2)a|b))(?2)", "aabbabaa" },
744 { MU, A, 0, 0, "(a)((?(R1)a|b))(?2)", "ababba" },
745 { MU, A, 0, 0, "(?(R0)aa|bb(?R))", "abba aabb bbaa" },
746 { MU, A, 0, 0, "((?(R)(?:aaaa|a)|(?:(aaaa)|(a)))+)(?1)$", "aaaaaaaaaa aaaa" },
747 { MU, A, 0, 0, "(?P<Name>a(?(R&Name)a|b))(?1)", "aab abb abaa" },
748 { MU, A, 0, 0, "((?(R)a|(?1)){3})", "XaaaaaaaaaX" },
749 { MU, A, 0, 0, "((?:(?(R)a|(?1))){3})", "XaaaaaaaaaX" },
750 { MU, A, 0, 0, "((?(R)a|(?1)){1,3})aaaaaa", "aaaaaaaaXaaaaaaaaa" },
751 { MU, A, 0, 0, "((?(R)a|(?1)){1,3}?)M", "aaaM" },
752 { MU, A, 0, 0, "((.)(?:.|\\2(?1))){0}#(?1)#", "#aabbccdde# #aabbccddee#" },
753 { MU, A, 0, 0, "((.)(?:\\2|\\2{4}b)){0}#(?:(?1))+#", "#aaaab# #aaaaab#" },
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700754 { MU, A, 0, 0 | F_NOMATCH, "(?1)$((.|\\2xx){1,2})", "abc" },
Elliott Hughes5b808042021-10-01 10:56:10 -0700755
756 /* 16 bit specific tests. */
757 { CM, A, 0, 0 | F_FORCECONV, "\xc3\xa1", "\xc3\x81\xc3\xa1" },
758 { CM, A, 0, 0 | F_FORCECONV, "\xe1\xbd\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
759 { CM, A, 0, 0 | F_FORCECONV, "[\xc3\xa1]", "\xc3\x81\xc3\xa1" },
760 { CM, A, 0, 0 | F_FORCECONV, "[\xe1\xbd\xb8]", "\xe1\xbf\xb8\xe1\xbd\xb8" },
761 { CM, A, 0, 0 | F_FORCECONV, "[a-\xed\xb0\x80]", "A" },
762 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[a-\\x{dc00}]", "B" },
763 { CM, A, 0, 0 | F_NO8 | F_NOMATCH | F_FORCECONV, "[b-\\x{dc00}]", "a" },
764 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "\xed\xa0\x80\\x{d800}\xed\xb0\x80\\x{dc00}", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80" },
765 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\xed\xa0\x80\\x{d800}]{1,2}?[\xed\xb0\x80\\x{dc00}]{1,2}?#", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80#" },
766 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80\xed\xb0\x80#]{0,3}(?<=\xed\xb0\x80.)", "\xed\xa0\x80#\xed\xa0\x80##\xed\xb0\x80\xed\xa0\x80" },
767 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\x9f\xbf\xed\xa0\x83" },
768 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\xb4\x80\xed\xb3\xb0" },
769 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\x9f\xbf\xed\xa0\x83" },
770 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\xb4\x80\xed\xb3\xb0" },
771 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xef\xbf\xbf]+[\x1-\xed\xb0\x80]+#", "\xed\xa0\x85\xc3\x81\xed\xa0\x85\xef\xbf\xb0\xc2\x85\xed\xa9\x89#" },
772 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80][\xed\xb0\x80]{2,}", "\xed\xa0\x80\xed\xb0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80\xed\xb0\x80" },
773 { M, A, 0, 0 | F_FORCECONV, "[^\xed\xb0\x80]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
774 { M, A, 0, 0 | F_NO8 | F_FORCECONV, "[^\\x{dc00}]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
775 { CM, A, 0, 0 | F_FORCECONV, ".\\B.", "\xed\xa0\x80\xed\xb0\x80" },
776 { CM, A, 0, 0 | F_FORCECONV, "\\D+(?:\\d+|.)\\S+(?:\\s+|.)\\W+(?:\\w+|.)\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80" },
777 { CM, A, 0, 0 | F_FORCECONV, "\\d*\\s*\\w*\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80" },
778 { CM, A, 0, 0 | F_FORCECONV | F_NOMATCH, "\\d*?\\D*?\\s*?\\S*?\\w*?\\W*?##", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80#" },
779 { CM | PCRE2_EXTENDED, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80 \xed\xb0\x80 !", "\xed\xa0\x80\xed\xb0\x80!" },
780 { CM, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80+#[^#]+\xed\xa0\x80", "\xed\xa0\x80#a\xed\xa0\x80" },
781 { CM, A, 0, 0 | F_FORCECONV, "(\xed\xa0\x80+)#\\1", "\xed\xa0\x80\xed\xa0\x80#\xed\xa0\x80\xed\xa0\x80" },
782 { M, PCRE2_NEWLINE_ANY, 0, 0 | F_NO8 | F_FORCECONV, "^-", "a--\xe2\x80\xa8--" },
783 { 0, BSR(PCRE2_BSR_UNICODE), 0, 0 | F_NO8 | F_FORCECONV, "\\R", "ab\xe2\x80\xa8" },
784 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v", "ab\xe2\x80\xa9" },
785 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h", "ab\xe1\xa0\x8e" },
786 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v+?\\V+?#", "\xe2\x80\xa9\xe2\x80\xa9\xef\xbf\xbf\xef\xbf\xbf#" },
787 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h+?\\H+?#", "\xe1\xa0\x8e\xe1\xa0\x8e\xef\xbf\xbf\xef\xbf\xbf#" },
788
789 /* Partial matching. */
790 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab", "a" },
791 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab|a", "a" },
792 { MU, A, PCRE2_PARTIAL_HARD, 0, "ab|a", "a" },
793 { MU, A, PCRE2_PARTIAL_SOFT, 0, "\\b#", "a" },
794 { MU, A, PCRE2_PARTIAL_SOFT, 0, "(?<=a)b", "a" },
795 { MU, A, PCRE2_PARTIAL_SOFT, 0, "abc|(?<=xxa)bc", "xxab" },
796 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a\\B", "a" },
797 { MU, A, PCRE2_PARTIAL_HARD, 0, "a\\b", "a" },
798
799 /* (*MARK) verb. */
800 { MU, A, 0, 0, "a(*MARK:aa)a", "ababaa" },
801 { MU, A, 0, 0 | F_NOMATCH, "a(*:aa)a", "abab" },
802 { MU, A, 0, 0, "a(*:aa)(b(*:bb)b|bc)", "abc" },
803 { MU, A, 0, 0 | F_NOMATCH, "a(*:1)x|b(*:2)y", "abc" },
804 { MU, A, 0, 0, "(?>a(*:aa))b|ac", "ac" },
805 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))(?1)", "a" },
806 { MU, A, 0, 0 | F_NOMATCH, "(?(DEFINE)((a)(*:aa)))(?1)b", "aa" },
807 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))a(?1)b|aac", "aac" },
808 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
809 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b)+", "babba" },
810 { MU, A, 0, 0 | F_NOMATCH, "(a(*:aa)){0}(?:b(?1)b)+", "ba" },
811 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
812 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b)+", "babba" },
813 { MU, A, 0, 0 | F_NOMATCH, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" },
814 { MU, A, 0, 0 | F_NOMATCH, "(*:mark)m", "a" },
815
816 /* (*COMMIT) verb. */
817 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)b", "ac" },
818 { MU, A, 0, 0, "aa(*COMMIT)b", "xaxaab" },
819 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)(*:msg)b|ac", "ac" },
820 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b)++", "abac" },
821 { MU, A, 0, 0 | F_NOMATCH, "((a)(*COMMIT)b)++", "abac" },
822 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*COMMIT)b)ab|ad", "ad" },
823
824 /* (*PRUNE) verb. */
825 { MU, A, 0, 0, "aa\\K(*PRUNE)b", "aaab" },
826 { MU, A, 0, 0, "aa(*PRUNE:bb)b|a", "aa" },
827 { MU, A, 0, 0, "(a)(a)(*PRUNE)b|(a)", "aa" },
828 { MU, A, 0, 0, "(a)(a)(a)(a)(a)(a)(a)(a)(*PRUNE)b|(a)", "aaaaaaaa" },
829 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|", "a" },
830 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|m", "a" },
831 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*PRUNE)b)ab|ad", "ad" },
832 { MU, A, 0, 0, "a(*COMMIT)(*PRUNE)d|bc", "abc" },
833 { MU, A, 0, 0, "(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
834 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
835 { MU, A, 0, 0, "(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
836 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
837 { MU, A, 0, 0, "(a(*COMMIT)b){0}a(?1)(*PRUNE)c|bc", "abc" },
838 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b){0}a(*COMMIT)(?1)(*PRUNE)c|bc", "abc" },
839 { MU, A, 0, 0, "(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
840 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
841 { MU, A, 0, 0, "((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
842 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
843 { MU, A, 0, 0, "(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
844 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
845 { MU, A, 0, 0, "(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
846 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
847 { MU, A, 0, 0, "(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
848 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
849 { MU, A, 0, 0, "(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
850 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
851 { MU, A, 0, 0, "(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
852 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
853 { MU, A, 0, 0, "(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
854 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
855
856 /* (*SKIP) verb. */
857 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*SKIP)b)ab|ad", "ad" },
858 { MU, A, 0, 0, "(\\w+(*SKIP)#)", "abcd,xyz#," },
859 { MU, A, 0, 0, "\\w+(*SKIP)#|mm", "abcd,xyz#," },
860 { MU, A, 0, 0 | F_NOMATCH, "b+(?<=(*SKIP)#c)|b+", "#bbb" },
861
862 /* (*THEN) verb. */
863 { MU, A, 0, 0, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcaabcaabcaabcnacm" },
864 { MU, A, 0, 0 | F_NOMATCH, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcm" },
865 { MU, A, 0, 0, "((?:a(*THEN)|aab)c|a+)+m", "aabcaabcnmaabcaabcm" },
866 { MU, A, 0, 0, "((?:a|aab)(*THEN)c|a+)+m", "aam" },
867 { MU, A, 0, 0, "((?:a(*COMMIT)|aab)(*THEN)c|a+)+m", "aam" },
868 { MU, A, 0, 0, "(?(?=a(*THEN)b)ab|ad)", "ad" },
869 { MU, A, 0, 0, "(?(?!a(*THEN)b)ad|add)", "add" },
870 { MU, A, 0, 0 | F_NOMATCH, "(?(?=a)a(*THEN)b|ad)", "ad" },
871 { MU, A, 0, 0, "(?!(?(?=a)ab|b(*THEN)d))bn|bnn", "bnn" },
872 { MU, A, 0, 0, "(?=(*THEN: ))* ", " " },
873 { MU, A, 0, 0, "a(*THEN)(?R) |", "a" },
874
875 /* Recurse and control verbs. */
876 { MU, A, 0, 0, "(a(*ACCEPT)b){0}a(?1)b", "aacaabb" },
877 { MU, A, 0, 0, "((a)\\2(*ACCEPT)b){0}a(?1)b", "aaacaaabb" },
878 { MU, A, 0, 0, "((ab|a(*ACCEPT)x)+|ababababax){0}_(?1)_", "_ababababax_ _ababababa_" },
879 { MU, A, 0, 0, "((.)(?:A(*ACCEPT)|(?1)\\2)){0}_(?1)_", "_bcdaAdcb_bcdaAdcb_" },
880 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_", "_ab_" },
881 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_|(_aa_)", "_aa_" },
882 { MU, A, 0, 0, "(a(*COMMIT)(?:b|bb)|c(*ACCEPT)d|dd){0}_(?1)+_", "_ax_ _cd_ _abbb_ _abcd_ _abbcdd_" },
883 { MU, A, 0, 0, "((.)(?:.|(*COMMIT)\\2{3}(*ACCEPT).*|.*)){0}_(?1){0,4}_", "_aaaabbbbccccddd_ _aaaabbbbccccdddd_" },
884
885#ifdef SUPPORT_UNICODE
886 /* Script runs and iterations. */
887 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
888 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
889 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
890 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
891 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
892 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)++#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
893 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)?#", "!ab!abc!ab!ab#" },
894 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)??#", "!ab!abc!ab!ab#" },
895#endif
896
897 /* Deep recursion. */
898 { MU, A, 0, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
899 { MU, A, 0, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " },
900 { MU, A, 0, 0, "((a?)+)+b", "aaaaaaaaaaaa b" },
901
902 /* Deep recursion: Stack limit reached. */
903 { M, A, 0, 0 | F_NOMATCH, "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaa" },
904 { M, A, 0, 0 | F_NOMATCH, "(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
905 { M, A, 0, 0 | F_NOMATCH, "(?:a+?)+?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
906 { M, A, 0, 0 | F_NOMATCH, "(?:a*)*b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
907 { M, A, 0, 0 | F_NOMATCH, "(?:a*?)*?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
908
909 { 0, 0, 0, 0, NULL, NULL }
910};
911
912#ifdef SUPPORT_PCRE2_8
913static pcre2_jit_stack_8* callback8(void *arg)
914{
915 return (pcre2_jit_stack_8 *)arg;
916}
917#endif
918
919#ifdef SUPPORT_PCRE2_16
920static pcre2_jit_stack_16* callback16(void *arg)
921{
922 return (pcre2_jit_stack_16 *)arg;
923}
924#endif
925
926#ifdef SUPPORT_PCRE2_32
927static pcre2_jit_stack_32* callback32(void *arg)
928{
929 return (pcre2_jit_stack_32 *)arg;
930}
931#endif
932
933#ifdef SUPPORT_PCRE2_8
934static pcre2_jit_stack_8 *stack8;
935
936static pcre2_jit_stack_8 *getstack8(void)
937{
938 if (!stack8)
939 stack8 = pcre2_jit_stack_create_8(1, 1024 * 1024, NULL);
940 return stack8;
941}
942
943static void setstack8(pcre2_match_context_8 *mcontext)
944{
945 if (!mcontext) {
946 if (stack8)
947 pcre2_jit_stack_free_8(stack8);
948 stack8 = NULL;
949 return;
950 }
951
952 pcre2_jit_stack_assign_8(mcontext, callback8, getstack8());
953}
954#endif /* SUPPORT_PCRE2_8 */
955
956#ifdef SUPPORT_PCRE2_16
957static pcre2_jit_stack_16 *stack16;
958
959static pcre2_jit_stack_16 *getstack16(void)
960{
961 if (!stack16)
962 stack16 = pcre2_jit_stack_create_16(1, 1024 * 1024, NULL);
963 return stack16;
964}
965
966static void setstack16(pcre2_match_context_16 *mcontext)
967{
968 if (!mcontext) {
969 if (stack16)
970 pcre2_jit_stack_free_16(stack16);
971 stack16 = NULL;
972 return;
973 }
974
975 pcre2_jit_stack_assign_16(mcontext, callback16, getstack16());
976}
977#endif /* SUPPORT_PCRE2_16 */
978
979#ifdef SUPPORT_PCRE2_32
980static pcre2_jit_stack_32 *stack32;
981
982static pcre2_jit_stack_32 *getstack32(void)
983{
984 if (!stack32)
985 stack32 = pcre2_jit_stack_create_32(1, 1024 * 1024, NULL);
986 return stack32;
987}
988
989static void setstack32(pcre2_match_context_32 *mcontext)
990{
991 if (!mcontext) {
992 if (stack32)
993 pcre2_jit_stack_free_32(stack32);
994 stack32 = NULL;
995 return;
996 }
997
998 pcre2_jit_stack_assign_32(mcontext, callback32, getstack32());
999}
1000#endif /* SUPPORT_PCRE2_32 */
1001
1002#ifdef SUPPORT_PCRE2_16
1003
1004static int convert_utf8_to_utf16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int *offsetmap, int max_length)
1005{
1006 PCRE2_SPTR8 iptr = input;
1007 PCRE2_UCHAR16 *optr = output;
1008 unsigned int c;
1009
1010 if (max_length == 0)
1011 return 0;
1012
1013 while (*iptr && max_length > 1) {
1014 c = 0;
1015 if (offsetmap)
1016 *offsetmap++ = (int)(iptr - (unsigned char*)input);
1017
1018 if (*iptr < 0xc0)
1019 c = *iptr++;
1020 else if (!(*iptr & 0x20)) {
1021 c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
1022 iptr += 2;
1023 } else if (!(*iptr & 0x10)) {
1024 c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
1025 iptr += 3;
1026 } else if (!(*iptr & 0x08)) {
1027 c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
1028 iptr += 4;
1029 }
1030
1031 if (c < 65536) {
1032 *optr++ = c;
1033 max_length--;
1034 } else if (max_length <= 2) {
1035 *optr = '\0';
1036 return (int)(optr - output);
1037 } else {
1038 c -= 0x10000;
1039 *optr++ = 0xd800 | ((c >> 10) & 0x3ff);
1040 *optr++ = 0xdc00 | (c & 0x3ff);
1041 max_length -= 2;
1042 if (offsetmap)
1043 offsetmap++;
1044 }
1045 }
1046 if (offsetmap)
1047 *offsetmap = (int)(iptr - (unsigned char*)input);
1048 *optr = '\0';
1049 return (int)(optr - output);
1050}
1051
1052static int copy_char8_to_char16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int max_length)
1053{
1054 PCRE2_SPTR8 iptr = input;
1055 PCRE2_UCHAR16 *optr = output;
1056
1057 if (max_length == 0)
1058 return 0;
1059
1060 while (*iptr && max_length > 1) {
1061 *optr++ = *iptr++;
1062 max_length--;
1063 }
1064 *optr = '\0';
1065 return (int)(optr - output);
1066}
1067
1068#define REGTEST_MAX_LENGTH16 4096
1069static PCRE2_UCHAR16 regtest_buf16[REGTEST_MAX_LENGTH16];
1070static int regtest_offsetmap16[REGTEST_MAX_LENGTH16];
1071
1072#endif /* SUPPORT_PCRE2_16 */
1073
1074#ifdef SUPPORT_PCRE2_32
1075
1076static int convert_utf8_to_utf32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int *offsetmap, int max_length)
1077{
1078 PCRE2_SPTR8 iptr = input;
1079 PCRE2_UCHAR32 *optr = output;
1080 unsigned int c;
1081
1082 if (max_length == 0)
1083 return 0;
1084
1085 while (*iptr && max_length > 1) {
1086 c = 0;
1087 if (offsetmap)
1088 *offsetmap++ = (int)(iptr - (unsigned char*)input);
1089
1090 if (*iptr < 0xc0)
1091 c = *iptr++;
1092 else if (!(*iptr & 0x20)) {
1093 c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
1094 iptr += 2;
1095 } else if (!(*iptr & 0x10)) {
1096 c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
1097 iptr += 3;
1098 } else if (!(*iptr & 0x08)) {
1099 c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
1100 iptr += 4;
1101 }
1102
1103 *optr++ = c;
1104 max_length--;
1105 }
1106 if (offsetmap)
1107 *offsetmap = (int)(iptr - (unsigned char*)input);
1108 *optr = 0;
1109 return (int)(optr - output);
1110}
1111
1112static int copy_char8_to_char32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int max_length)
1113{
1114 PCRE2_SPTR8 iptr = input;
1115 PCRE2_UCHAR32 *optr = output;
1116
1117 if (max_length == 0)
1118 return 0;
1119
1120 while (*iptr && max_length > 1) {
1121 *optr++ = *iptr++;
1122 max_length--;
1123 }
1124 *optr = '\0';
1125 return (int)(optr - output);
1126}
1127
1128#define REGTEST_MAX_LENGTH32 4096
1129static PCRE2_UCHAR32 regtest_buf32[REGTEST_MAX_LENGTH32];
1130static int regtest_offsetmap32[REGTEST_MAX_LENGTH32];
1131
1132#endif /* SUPPORT_PCRE2_32 */
1133
/* Returns 1 when every byte of the NUL terminated string "input" is in the
   7 bit ASCII range (the empty string included), otherwise 0. */
static int check_ascii(const char *input)
{
	const unsigned char *byte;

	for (byte = (const unsigned char *)input; *byte != 0; byte++) {
		/* Any byte above 127 has its top bit set. */
		if (*byte & 0x80)
			return 0;
	}
	return 1;
}
1144
1145#define OVECTOR_SIZE 15
1146
1147static int regression_tests(void)
1148{
1149 struct regression_test_case *current = regression_test_cases;
1150 int error;
1151 PCRE2_SIZE err_offs;
1152 int is_successful;
1153 int is_ascii;
1154 int total = 0;
1155 int successful = 0;
1156 int successful_row = 0;
1157 int counter = 0;
1158 int jit_compile_mode;
1159 int utf = 0;
1160 int disabled_options = 0;
1161 int i;
1162#ifdef SUPPORT_PCRE2_8
1163 pcre2_code_8 *re8;
1164 pcre2_compile_context_8 *ccontext8;
1165 pcre2_match_data_8 *mdata8_1;
1166 pcre2_match_data_8 *mdata8_2;
1167 pcre2_match_context_8 *mcontext8;
1168 PCRE2_SIZE *ovector8_1 = NULL;
1169 PCRE2_SIZE *ovector8_2 = NULL;
1170 int return_value8[2];
1171#endif
1172#ifdef SUPPORT_PCRE2_16
1173 pcre2_code_16 *re16;
1174 pcre2_compile_context_16 *ccontext16;
1175 pcre2_match_data_16 *mdata16_1;
1176 pcre2_match_data_16 *mdata16_2;
1177 pcre2_match_context_16 *mcontext16;
1178 PCRE2_SIZE *ovector16_1 = NULL;
1179 PCRE2_SIZE *ovector16_2 = NULL;
1180 int return_value16[2];
1181 int length16;
1182#endif
1183#ifdef SUPPORT_PCRE2_32
1184 pcre2_code_32 *re32;
1185 pcre2_compile_context_32 *ccontext32;
1186 pcre2_match_data_32 *mdata32_1;
1187 pcre2_match_data_32 *mdata32_2;
1188 pcre2_match_context_32 *mcontext32;
1189 PCRE2_SIZE *ovector32_1 = NULL;
1190 PCRE2_SIZE *ovector32_2 = NULL;
1191 int return_value32[2];
1192 int length32;
1193#endif
1194
1195#if defined SUPPORT_PCRE2_8
1196 PCRE2_UCHAR8 cpu_info[128];
1197#elif defined SUPPORT_PCRE2_16
1198 PCRE2_UCHAR16 cpu_info[128];
1199#elif defined SUPPORT_PCRE2_32
1200 PCRE2_UCHAR32 cpu_info[128];
1201#endif
1202#if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1203 int return_value;
1204#endif
1205
1206 /* This test compares the behaviour of interpreter and JIT. Although disabling
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001207 utf or ucp may make tests fail, if the pcre2_match result is the SAME, it is
1208 still considered successful from pcre2_jit_test point of view. */
Elliott Hughes5b808042021-10-01 10:56:10 -07001209
1210#if defined SUPPORT_PCRE2_8
1211 pcre2_config_8(PCRE2_CONFIG_JITTARGET, &cpu_info);
1212#elif defined SUPPORT_PCRE2_16
1213 pcre2_config_16(PCRE2_CONFIG_JITTARGET, &cpu_info);
1214#elif defined SUPPORT_PCRE2_32
1215 pcre2_config_32(PCRE2_CONFIG_JITTARGET, &cpu_info);
1216#endif
1217
1218 printf("Running JIT regression tests\n");
1219 printf(" target CPU of SLJIT compiler: ");
1220 for (i = 0; cpu_info[i]; i++)
1221 printf("%c", (char)(cpu_info[i]));
1222 printf("\n");
1223
1224#if defined SUPPORT_PCRE2_8
1225 pcre2_config_8(PCRE2_CONFIG_UNICODE, &utf);
1226#elif defined SUPPORT_PCRE2_16
1227 pcre2_config_16(PCRE2_CONFIG_UNICODE, &utf);
1228#elif defined SUPPORT_PCRE2_32
1229 pcre2_config_32(PCRE2_CONFIG_UNICODE, &utf);
1230#endif
1231
1232 if (!utf)
1233 disabled_options |= PCRE2_UTF;
1234#ifdef SUPPORT_PCRE2_8
1235 printf(" in 8 bit mode with UTF-8 %s:\n", utf ? "enabled" : "disabled");
1236#endif
1237#ifdef SUPPORT_PCRE2_16
1238 printf(" in 16 bit mode with UTF-16 %s:\n", utf ? "enabled" : "disabled");
1239#endif
1240#ifdef SUPPORT_PCRE2_32
1241 printf(" in 32 bit mode with UTF-32 %s:\n", utf ? "enabled" : "disabled");
1242#endif
1243
1244 while (current->pattern) {
1245 /* printf("\nPattern: %s :\n", current->pattern); */
1246 total++;
1247 is_ascii = 0;
1248 if (!(current->start_offset & F_PROPERTY))
1249 is_ascii = check_ascii(current->pattern) && check_ascii(current->input);
1250
1251 if (current->match_options & PCRE2_PARTIAL_SOFT)
1252 jit_compile_mode = PCRE2_JIT_PARTIAL_SOFT;
1253 else if (current->match_options & PCRE2_PARTIAL_HARD)
1254 jit_compile_mode = PCRE2_JIT_PARTIAL_HARD;
1255 else
1256 jit_compile_mode = PCRE2_JIT_COMPLETE;
1257 error = 0;
1258#ifdef SUPPORT_PCRE2_8
1259 re8 = NULL;
1260 ccontext8 = pcre2_compile_context_create_8(NULL);
1261 if (ccontext8) {
1262 if (GET_NEWLINE(current->newline))
1263 pcre2_set_newline_8(ccontext8, GET_NEWLINE(current->newline));
1264 if (GET_BSR(current->newline))
1265 pcre2_set_bsr_8(ccontext8, GET_BSR(current->newline));
1266
1267 if (!(current->start_offset & F_NO8)) {
1268 re8 = pcre2_compile_8((PCRE2_SPTR8)current->pattern, PCRE2_ZERO_TERMINATED,
1269 current->compile_options & ~disabled_options,
1270 &error, &err_offs, ccontext8);
1271
1272 if (!re8 && (utf || is_ascii))
1273 printf("\n8 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1274 }
1275 pcre2_compile_context_free_8(ccontext8);
1276 }
1277 else
1278 printf("\n8 bit: Cannot allocate compile context\n");
1279#endif
1280#ifdef SUPPORT_PCRE2_16
1281 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1282 convert_utf8_to_utf16((PCRE2_SPTR8)current->pattern, regtest_buf16, NULL, REGTEST_MAX_LENGTH16);
1283 else
1284 copy_char8_to_char16((PCRE2_SPTR8)current->pattern, regtest_buf16, REGTEST_MAX_LENGTH16);
1285
1286 re16 = NULL;
1287 ccontext16 = pcre2_compile_context_create_16(NULL);
1288 if (ccontext16) {
1289 if (GET_NEWLINE(current->newline))
1290 pcre2_set_newline_16(ccontext16, GET_NEWLINE(current->newline));
1291 if (GET_BSR(current->newline))
1292 pcre2_set_bsr_16(ccontext16, GET_BSR(current->newline));
1293
1294 if (!(current->start_offset & F_NO16)) {
1295 re16 = pcre2_compile_16(regtest_buf16, PCRE2_ZERO_TERMINATED,
1296 current->compile_options & ~disabled_options,
1297 &error, &err_offs, ccontext16);
1298
1299 if (!re16 && (utf || is_ascii))
1300 printf("\n16 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1301 }
1302 pcre2_compile_context_free_16(ccontext16);
1303 }
1304 else
1305 printf("\n16 bit: Cannot allocate compile context\n");
1306#endif
1307#ifdef SUPPORT_PCRE2_32
1308 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1309 convert_utf8_to_utf32((PCRE2_SPTR8)current->pattern, regtest_buf32, NULL, REGTEST_MAX_LENGTH32);
1310 else
1311 copy_char8_to_char32((PCRE2_SPTR8)current->pattern, regtest_buf32, REGTEST_MAX_LENGTH32);
1312
1313 re32 = NULL;
1314 ccontext32 = pcre2_compile_context_create_32(NULL);
1315 if (ccontext32) {
1316 if (GET_NEWLINE(current->newline))
1317 pcre2_set_newline_32(ccontext32, GET_NEWLINE(current->newline));
1318 if (GET_BSR(current->newline))
1319 pcre2_set_bsr_32(ccontext32, GET_BSR(current->newline));
1320
1321 if (!(current->start_offset & F_NO32)) {
1322 re32 = pcre2_compile_32(regtest_buf32, PCRE2_ZERO_TERMINATED,
1323 current->compile_options & ~disabled_options,
1324 &error, &err_offs, ccontext32);
1325
1326 if (!re32 && (utf || is_ascii))
1327 printf("\n32 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1328 }
1329 pcre2_compile_context_free_32(ccontext32);
1330 }
1331 else
1332 printf("\n32 bit: Cannot allocate compile context\n");
1333#endif
1334
1335 counter++;
1336 if ((counter & 0x3) != 0) {
1337#ifdef SUPPORT_PCRE2_8
1338 setstack8(NULL);
1339#endif
1340#ifdef SUPPORT_PCRE2_16
1341 setstack16(NULL);
1342#endif
1343#ifdef SUPPORT_PCRE2_32
1344 setstack32(NULL);
1345#endif
1346 }
1347
1348#ifdef SUPPORT_PCRE2_8
1349 return_value8[0] = -1000;
1350 return_value8[1] = -1000;
1351 mdata8_1 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1352 mdata8_2 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1353 mcontext8 = pcre2_match_context_create_8(NULL);
1354 if (!mdata8_1 || !mdata8_2 || !mcontext8) {
1355 printf("\n8 bit: Cannot allocate match data\n");
1356 pcre2_match_data_free_8(mdata8_1);
1357 pcre2_match_data_free_8(mdata8_2);
1358 pcre2_match_context_free_8(mcontext8);
1359 pcre2_code_free_8(re8);
1360 re8 = NULL;
1361 } else {
1362 ovector8_1 = pcre2_get_ovector_pointer_8(mdata8_1);
1363 ovector8_2 = pcre2_get_ovector_pointer_8(mdata8_2);
1364 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1365 ovector8_1[i] = -2;
1366 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1367 ovector8_2[i] = -2;
1368 pcre2_set_match_limit_8(mcontext8, 10000000);
1369 }
1370 if (re8) {
1371 return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1372 current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, mcontext8);
1373
1374 if (pcre2_jit_compile_8(re8, jit_compile_mode)) {
1375 printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1376 } else if ((counter & 0x1) != 0) {
1377 setstack8(mcontext8);
1378 return_value8[0] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1379 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1380 } else {
1381 pcre2_jit_stack_assign_8(mcontext8, NULL, getstack8());
1382 return_value8[0] = pcre2_jit_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1383 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1384 }
1385 }
1386#endif
1387
1388#ifdef SUPPORT_PCRE2_16
1389 return_value16[0] = -1000;
1390 return_value16[1] = -1000;
1391 mdata16_1 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1392 mdata16_2 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1393 mcontext16 = pcre2_match_context_create_16(NULL);
1394 if (!mdata16_1 || !mdata16_2 || !mcontext16) {
1395 printf("\n16 bit: Cannot allocate match data\n");
1396 pcre2_match_data_free_16(mdata16_1);
1397 pcre2_match_data_free_16(mdata16_2);
1398 pcre2_match_context_free_16(mcontext16);
1399 pcre2_code_free_16(re16);
1400 re16 = NULL;
1401 } else {
1402 ovector16_1 = pcre2_get_ovector_pointer_16(mdata16_1);
1403 ovector16_2 = pcre2_get_ovector_pointer_16(mdata16_2);
1404 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1405 ovector16_1[i] = -2;
1406 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1407 ovector16_2[i] = -2;
1408 pcre2_set_match_limit_16(mcontext16, 10000000);
1409 }
1410 if (re16) {
1411 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1412 length16 = convert_utf8_to_utf16((PCRE2_SPTR8)current->input, regtest_buf16, regtest_offsetmap16, REGTEST_MAX_LENGTH16);
1413 else
1414 length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16);
1415
1416 return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16,
1417 current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, mcontext16);
1418
1419 if (pcre2_jit_compile_16(re16, jit_compile_mode)) {
1420 printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1421 } else if ((counter & 0x1) != 0) {
1422 setstack16(mcontext16);
1423 return_value16[0] = pcre2_match_16(re16, regtest_buf16, length16,
1424 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1425 } else {
1426 pcre2_jit_stack_assign_16(mcontext16, NULL, getstack16());
1427 return_value16[0] = pcre2_jit_match_16(re16, regtest_buf16, length16,
1428 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1429 }
1430 }
1431#endif
1432
1433#ifdef SUPPORT_PCRE2_32
1434 return_value32[0] = -1000;
1435 return_value32[1] = -1000;
1436 mdata32_1 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1437 mdata32_2 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1438 mcontext32 = pcre2_match_context_create_32(NULL);
1439 if (!mdata32_1 || !mdata32_2 || !mcontext32) {
1440 printf("\n32 bit: Cannot allocate match data\n");
1441 pcre2_match_data_free_32(mdata32_1);
1442 pcre2_match_data_free_32(mdata32_2);
1443 pcre2_match_context_free_32(mcontext32);
1444 pcre2_code_free_32(re32);
1445 re32 = NULL;
1446 } else {
1447 ovector32_1 = pcre2_get_ovector_pointer_32(mdata32_1);
1448 ovector32_2 = pcre2_get_ovector_pointer_32(mdata32_2);
1449 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1450 ovector32_1[i] = -2;
1451 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1452 ovector32_2[i] = -2;
1453 pcre2_set_match_limit_32(mcontext32, 10000000);
1454 }
1455 if (re32) {
1456 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1457 length32 = convert_utf8_to_utf32((PCRE2_SPTR8)current->input, regtest_buf32, regtest_offsetmap32, REGTEST_MAX_LENGTH32);
1458 else
1459 length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32);
1460
1461 return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32,
1462 current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, mcontext32);
1463
1464 if (pcre2_jit_compile_32(re32, jit_compile_mode)) {
1465 printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1466 } else if ((counter & 0x1) != 0) {
1467 setstack32(mcontext32);
1468 return_value32[0] = pcre2_match_32(re32, regtest_buf32, length32,
1469 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1470 } else {
1471 pcre2_jit_stack_assign_32(mcontext32, NULL, getstack32());
1472 return_value32[0] = pcre2_jit_match_32(re32, regtest_buf32, length32,
1473 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1474 }
1475 }
1476#endif
1477
1478 /* printf("[%d-%d-%d|%d-%d|%d-%d|%d-%d]%s",
1479 return_value8[0], return_value16[0], return_value32[0],
1480 (int)ovector8_1[0], (int)ovector8_1[1],
1481 (int)ovector16_1[0], (int)ovector16_1[1],
1482 (int)ovector32_1[0], (int)ovector32_1[1],
1483 (current->compile_options & PCRE2_CASELESS) ? "C" : ""); */
1484
1485 /* If F_DIFF is set, just run the test, but do not compare the results.
1486 Segfaults can still be captured. */
1487
1488 is_successful = 1;
1489 if (!(current->start_offset & F_DIFF)) {
1490#if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1491 if (!(current->start_offset & F_FORCECONV)) {
1492
1493 /* All results must be the same. */
1494#ifdef SUPPORT_PCRE2_8
1495 if ((return_value = return_value8[0]) != return_value8[1]) {
1496 printf("\n8 bit: Return value differs(J8:%d,I8:%d): [%d] '%s' @ '%s'\n",
1497 return_value8[0], return_value8[1], total, current->pattern, current->input);
1498 is_successful = 0;
1499 } else
1500#endif
1501#ifdef SUPPORT_PCRE2_16
1502 if ((return_value = return_value16[0]) != return_value16[1]) {
1503 printf("\n16 bit: Return value differs(J16:%d,I16:%d): [%d] '%s' @ '%s'\n",
1504 return_value16[0], return_value16[1], total, current->pattern, current->input);
1505 is_successful = 0;
1506 } else
1507#endif
1508#ifdef SUPPORT_PCRE2_32
1509 if ((return_value = return_value32[0]) != return_value32[1]) {
1510 printf("\n32 bit: Return value differs(J32:%d,I32:%d): [%d] '%s' @ '%s'\n",
1511 return_value32[0], return_value32[1], total, current->pattern, current->input);
1512 is_successful = 0;
1513 } else
1514#endif
1515#if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1516 if (return_value8[0] != return_value16[0]) {
1517 printf("\n8 and 16 bit: Return value differs(J8:%d,J16:%d): [%d] '%s' @ '%s'\n",
1518 return_value8[0], return_value16[0],
1519 total, current->pattern, current->input);
1520 is_successful = 0;
1521 } else
1522#endif
1523#if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1524 if (return_value8[0] != return_value32[0]) {
1525 printf("\n8 and 32 bit: Return value differs(J8:%d,J32:%d): [%d] '%s' @ '%s'\n",
1526 return_value8[0], return_value32[0],
1527 total, current->pattern, current->input);
1528 is_successful = 0;
1529 } else
1530#endif
1531#if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1532 if (return_value16[0] != return_value32[0]) {
1533 printf("\n16 and 32 bit: Return value differs(J16:%d,J32:%d): [%d] '%s' @ '%s'\n",
1534 return_value16[0], return_value32[0],
1535 total, current->pattern, current->input);
1536 is_successful = 0;
1537 } else
1538#endif
1539 if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) {
1540 if (return_value == PCRE2_ERROR_PARTIAL) {
1541 return_value = 2;
1542 } else {
1543 return_value *= 2;
1544 }
1545#ifdef SUPPORT_PCRE2_8
1546 return_value8[0] = return_value;
1547#endif
1548#ifdef SUPPORT_PCRE2_16
1549 return_value16[0] = return_value;
1550#endif
1551#ifdef SUPPORT_PCRE2_32
1552 return_value32[0] = return_value;
1553#endif
1554 /* Transform back the results. */
1555 if (current->compile_options & PCRE2_UTF) {
1556#ifdef SUPPORT_PCRE2_16
1557 for (i = 0; i < return_value; ++i) {
1558 if (ovector16_1[i] != PCRE2_UNSET)
1559 ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]];
1560 if (ovector16_2[i] != PCRE2_UNSET)
1561 ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]];
1562 }
1563#endif
1564#ifdef SUPPORT_PCRE2_32
1565 for (i = 0; i < return_value; ++i) {
1566 if (ovector32_1[i] != PCRE2_UNSET)
1567 ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]];
1568 if (ovector32_2[i] != PCRE2_UNSET)
1569 ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]];
1570 }
1571#endif
1572 }
1573
1574 for (i = 0; i < return_value; ++i) {
1575#if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1576 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) {
1577 printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n",
1578 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i],
1579 total, current->pattern, current->input);
1580 is_successful = 0;
1581 }
1582#endif
1583#if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1584 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) {
1585 printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1586 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1587 total, current->pattern, current->input);
1588 is_successful = 0;
1589 }
1590#endif
1591#if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1592 if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) {
1593 printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1594 i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1595 total, current->pattern, current->input);
1596 is_successful = 0;
1597 }
1598#endif
1599 }
1600 }
1601 } else
1602#endif /* more than one of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16 and SUPPORT_PCRE2_32 */
1603 {
1604#ifdef SUPPORT_PCRE2_8
1605 if (return_value8[0] != return_value8[1]) {
1606 printf("\n8 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1607 return_value8[0], return_value8[1], total, current->pattern, current->input);
1608 is_successful = 0;
1609 } else if (return_value8[0] >= 0 || return_value8[0] == PCRE2_ERROR_PARTIAL) {
1610 if (return_value8[0] == PCRE2_ERROR_PARTIAL)
1611 return_value8[0] = 2;
1612 else
1613 return_value8[0] *= 2;
1614
1615 for (i = 0; i < return_value8[0]; ++i)
1616 if (ovector8_1[i] != ovector8_2[i]) {
1617 printf("\n8 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1618 i, (int)ovector8_1[i], (int)ovector8_2[i], total, current->pattern, current->input);
1619 is_successful = 0;
1620 }
1621 }
1622#endif
1623
1624#ifdef SUPPORT_PCRE2_16
1625 if (return_value16[0] != return_value16[1]) {
1626 printf("\n16 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1627 return_value16[0], return_value16[1], total, current->pattern, current->input);
1628 is_successful = 0;
1629 } else if (return_value16[0] >= 0 || return_value16[0] == PCRE2_ERROR_PARTIAL) {
1630 if (return_value16[0] == PCRE2_ERROR_PARTIAL)
1631 return_value16[0] = 2;
1632 else
1633 return_value16[0] *= 2;
1634
1635 for (i = 0; i < return_value16[0]; ++i)
1636 if (ovector16_1[i] != ovector16_2[i]) {
1637 printf("\n16 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1638 i, (int)ovector16_1[i], (int)ovector16_2[i], total, current->pattern, current->input);
1639 is_successful = 0;
1640 }
1641 }
1642#endif
1643
1644#ifdef SUPPORT_PCRE2_32
1645 if (return_value32[0] != return_value32[1]) {
1646 printf("\n32 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1647 return_value32[0], return_value32[1], total, current->pattern, current->input);
1648 is_successful = 0;
1649 } else if (return_value32[0] >= 0 || return_value32[0] == PCRE2_ERROR_PARTIAL) {
1650 if (return_value32[0] == PCRE2_ERROR_PARTIAL)
1651 return_value32[0] = 2;
1652 else
1653 return_value32[0] *= 2;
1654
1655 for (i = 0; i < return_value32[0]; ++i)
1656 if (ovector32_1[i] != ovector32_2[i]) {
1657 printf("\n32 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1658 i, (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input);
1659 is_successful = 0;
1660 }
1661 }
1662#endif
1663 }
1664 }
1665
1666 if (is_successful) {
1667#ifdef SUPPORT_PCRE2_8
1668 if (!(current->start_offset & F_NO8) && (utf || is_ascii)) {
1669 if (return_value8[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1670 printf("8 bit: Test should match: [%d] '%s' @ '%s'\n",
1671 total, current->pattern, current->input);
1672 is_successful = 0;
1673 }
1674
1675 if (return_value8[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1676 printf("8 bit: Test should not match: [%d] '%s' @ '%s'\n",
1677 total, current->pattern, current->input);
1678 is_successful = 0;
1679 }
1680 }
1681#endif
1682#ifdef SUPPORT_PCRE2_16
1683 if (!(current->start_offset & F_NO16) && (utf || is_ascii)) {
1684 if (return_value16[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1685 printf("16 bit: Test should match: [%d] '%s' @ '%s'\n",
1686 total, current->pattern, current->input);
1687 is_successful = 0;
1688 }
1689
1690 if (return_value16[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1691 printf("16 bit: Test should not match: [%d] '%s' @ '%s'\n",
1692 total, current->pattern, current->input);
1693 is_successful = 0;
1694 }
1695 }
1696#endif
1697#ifdef SUPPORT_PCRE2_32
1698 if (!(current->start_offset & F_NO32) && (utf || is_ascii)) {
1699 if (return_value32[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1700 printf("32 bit: Test should match: [%d] '%s' @ '%s'\n",
1701 total, current->pattern, current->input);
1702 is_successful = 0;
1703 }
1704
1705 if (return_value32[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1706 printf("32 bit: Test should not match: [%d] '%s' @ '%s'\n",
1707 total, current->pattern, current->input);
1708 is_successful = 0;
1709 }
1710 }
1711#endif
1712 }
1713
1714 if (is_successful) {
1715#ifdef SUPPORT_PCRE2_8
1716 if (re8 && !(current->start_offset & F_NO8) && pcre2_get_mark_8(mdata8_1) != pcre2_get_mark_8(mdata8_2)) {
1717 printf("8 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1718 total, current->pattern, current->input);
1719 is_successful = 0;
1720 }
1721#endif
1722#ifdef SUPPORT_PCRE2_16
1723 if (re16 && !(current->start_offset & F_NO16) && pcre2_get_mark_16(mdata16_1) != pcre2_get_mark_16(mdata16_2)) {
1724 printf("16 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1725 total, current->pattern, current->input);
1726 is_successful = 0;
1727 }
1728#endif
1729#ifdef SUPPORT_PCRE2_32
1730 if (re32 && !(current->start_offset & F_NO32) && pcre2_get_mark_32(mdata32_1) != pcre2_get_mark_32(mdata32_2)) {
1731 printf("32 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1732 total, current->pattern, current->input);
1733 is_successful = 0;
1734 }
1735#endif
1736 }
1737
1738#ifdef SUPPORT_PCRE2_8
1739 pcre2_code_free_8(re8);
1740 pcre2_match_data_free_8(mdata8_1);
1741 pcre2_match_data_free_8(mdata8_2);
1742 pcre2_match_context_free_8(mcontext8);
1743#endif
1744#ifdef SUPPORT_PCRE2_16
1745 pcre2_code_free_16(re16);
1746 pcre2_match_data_free_16(mdata16_1);
1747 pcre2_match_data_free_16(mdata16_2);
1748 pcre2_match_context_free_16(mcontext16);
1749#endif
1750#ifdef SUPPORT_PCRE2_32
1751 pcre2_code_free_32(re32);
1752 pcre2_match_data_free_32(mdata32_1);
1753 pcre2_match_data_free_32(mdata32_2);
1754 pcre2_match_context_free_32(mcontext32);
1755#endif
1756
1757 if (is_successful) {
1758 successful++;
1759 successful_row++;
1760 printf(".");
1761 if (successful_row >= 60) {
1762 successful_row = 0;
1763 printf("\n");
1764 }
1765 } else
1766 successful_row = 0;
1767
1768 fflush(stdout);
1769 current++;
1770 }
1771#ifdef SUPPORT_PCRE2_8
1772 setstack8(NULL);
1773#endif
1774#ifdef SUPPORT_PCRE2_16
1775 setstack16(NULL);
1776#endif
1777#ifdef SUPPORT_PCRE2_32
1778 setstack32(NULL);
1779#endif
1780
1781 if (total == successful) {
1782 printf("\nAll JIT regression tests are successfully passed.\n");
1783 return 0;
1784 } else {
1785 printf("\nSuccessful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
1786 return 1;
1787 }
1788}
1789
1790#if defined SUPPORT_UNICODE
1791
1792static int check_invalid_utf_result(int pattern_index, const char *type, int result,
1793 int match_start, int match_end, PCRE2_SIZE *ovector)
1794{
1795 if (match_start < 0) {
1796 if (result != -1) {
1797 printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
1798 return 1;
1799 }
1800 return 0;
1801 }
1802
1803 if (result <= 0) {
1804 printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
1805 return 1;
1806 }
1807
1808 if (ovector[0] != (PCRE2_SIZE)match_start) {
1809 printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
1810 pattern_index, type, (int)ovector[0], match_start);
1811 return 1;
1812 }
1813
1814 if (ovector[1] != (PCRE2_SIZE)match_end) {
1815 printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
1816 pattern_index, type, (int)ovector[1], match_end);
1817 return 1;
1818 }
1819
1820 return 0;
1821}
1822
1823#endif /* SUPPORT_UNICODE */
1824
1825#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
1826
/* Shorthand option sets for the invalid UTF-8 test table that follows; they
   are #undef'ed again right after the table.
   UDA - compile options: PCRE2_UTF, PCRE2_DOTALL and PCRE2_ANCHORED.
   CI  - JIT options: complete matching with invalid UTF support.
   CPI - JIT options: as CI, plus soft partial matching. */
1827#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
1828#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
1829#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
1830
/*
 * One row of the invalid UTF-8 regression table, as consumed by
 * run_invalid_utf8_test():
 *   compile_options      options passed to pcre2_compile_8()
 *   jit_compile_options  options passed to pcre2_jit_compile_8(); the
 *                        PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_SOFT bits
 *                        also select which match runs are performed
 *   start_offset         start offset counted from the beginning of input
 *                        (skip_left is subtracted before matching)
 *   skip_left            number of leading bytes of input left out of the
 *                        subject
 *   skip_right           number of trailing bytes of input left out of the
 *                        subject
 *   match_start          expected ovector[0]; a negative value means the
 *                        match call must return -1
 *   match_end            expected ovector[1]
 *   pattern              up to two pattern variants; a NULL entry is skipped
 *   input                zero-terminated subject string
 */
1831struct invalid_utf8_regression_test_case {
1832 int compile_options;
1833 int jit_compile_options;
1834 int start_offset;
1835 int skip_left;
1836 int skip_right;
1837 int match_start;
1838 int match_end;
1839 const char *pattern[2];
1840 const char *input;
1841};
1842
/* Marker object: only its address is used, as the pattern[1] entry of the
   "^.a" row of the test table. How the table walker treats that address is
   not visible here - presumably it selects a CR newline convention instead
   of compiling it as a pattern; TODO confirm against the caller. */
1843static const char invalid_utf8_newline_cr;
1844
/*
 * Table of invalid UTF-8 regression cases. The columns follow the field order
 * of struct invalid_utf8_regression_test_case:
 *   compile options, JIT options, start offset, skip_left, skip_right,
 *   expected match start, expected match end, { pattern[0], pattern[1] }, input.
 * A match start / end of -1 marks a case that is expected not to match
 * (check_invalid_utf_result() then requires a result of -1).
 * The last row, all zero / NULL, terminates the table; the loop that walks the
 * table is outside this block (presumably it stops on the NULL pattern).
 */
1845static const struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
1846 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1847 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
1848 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
1849 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1850 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
1851 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
1852 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
1853 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
1854 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
1855 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
1856 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
1857 { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
1858 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
1859 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
1860 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
1861 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
1862 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
1863 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
1864 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
1865 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
1866 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
1867 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
1868 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
1869 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
1870 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
1871 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
1872 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
1873 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
1874 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
1875 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
1876 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
1877 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
1878 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
1879 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
1880 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
1881 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
1882 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
1883 { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
1884
1885 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
1886 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80\xf4\xa0\x80\x80" },
1887 { UDA, CPI, 4, 1, 1, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf" },
1888 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
1889 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
1890 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
1891 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
1892 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf\xf0\x8f\xbf\xbf" },
1893 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80\xf5\x80\x80\x80" },
1894 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80\xf4\x90\x80\x80" },
1895 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff\xf4\x8f\xbf\xff" },
1896 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf\xf4\x8f\xff\xbf" },
1897 { UDA, CPI, 4, 0, 1, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80\xef\x80\x80" },
1898 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80\x80\x80\x80\x80" },
1899 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf\xe0\x9f\xbf#" },
1900 { UDA, CPI, 4, 2, 2, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80\xe0\xa0\x80#" },
1901 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80\xf0\x80\x80#" },
1902 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80\xed\xa0\x80#" },
1903 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
1904 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
1905 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
1906 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
1907 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf\xc1\xbf##" },
1908 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0\xdf\xc0##" },
1909 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80\xe0\x80##" },
1910
1911 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
1912 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
1913 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf\xe0\x9f\xbf" },
1914 { UDA, CPI, 3, 1, 1, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf\xef\xbf\xbf" },
1915 { UDA, CPI, 3, 0, 1, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80\xdf\x80" },
1916 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff\xef\xbf\xff" },
1917 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf\xef\xff\xbf" },
1918 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf\xed\xbf\xbf" },
1919
1920 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
1921 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
1922 { UDA, CPI, 2, 1, 1, -1, -1, { "\\B", "\\b" }, "\xdf\xbf\xdf\xbf" },
1923 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf\xc1\xbf" },
1924 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80\xe0\x80" },
1925 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff\xdf\xff" },
1926 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf\xff\xbf" },
1927
1928 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
1929 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
1930 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80" },
1931 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\xb0\xb0" },
1932
1933 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
1934 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
1935 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1936 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1937 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
1938 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1939 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1940 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1941 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1942
1943 { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
1944 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
1945 { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
1946 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
1947 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
1948 { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
1949 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
1950 { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1951 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1952
1953 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
1954 { UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
1955 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
1956 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },
1957
1958 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
1959 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
1960 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
1961 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
1962 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
1963 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
1964 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
1965 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
1966 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
1967 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \x85#\xc2\x85#"},
1968 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 7, 8, { "^\\W", NULL }, " \xe2\x80\xf8\xe2\x80\xa8#"},
1969
1970 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xe2\x80\xf8\xe2\x80\xa8#"},
1971 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 3, 4, { "#", NULL }, "\xe2\x80\xf8#\xe2\x80\xa8#"},
1972 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "abcd\xc2\x85#"},
1973 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 1, 2, { "#", NULL }, "\x85#\xc2\x85#"},
1974 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 5, 6, { "#", NULL }, "\xef,\x80,\xf8#\x0a"},
1975 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xef,\x80,\xf8\x0a#"},
1976
1977 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1978 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1979 { PCRE2_UTF, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1980 { PCRE2_UTF, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1981
1982 { PCRE2_UTF | PCRE2_UCP, CI, 0, 0, 0, -1, -1, { "[\\s]", NULL }, "\xed\xa0\x80" },
1983
1984 /* These two are not invalid UTF tests, but this infrastructure fits better for them. */
1985 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\X{2}", NULL }, "\r\n\n" },
1986 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\R{2}", NULL }, "\r\n\n" },
1987
1988 { PCRE2_UTF | PCRE2_MULTILINE, CI, 0, 0, 0, -1, -1, { "^.a", &invalid_utf8_newline_cr }, "\xc3\xa7#a" },
1989
1990 { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
1991};
1992
1993#undef UDA
1994#undef CI
1995#undef CPI
1996
1997static int run_invalid_utf8_test(const struct invalid_utf8_regression_test_case *current,
1998 int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
1999{
2000 pcre2_code_8 *code;
2001 int result, errorcode;
2002 PCRE2_SIZE length, erroroffset;
2003 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);
2004
2005 if (current->pattern[i] == NULL)
2006 return 1;
2007
2008 code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED,
2009 current->compile_options, &errorcode, &erroroffset, ccontext);
2010
2011 if (!code) {
2012 printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
2013 return 0;
2014 }
2015
2016 if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) {
2017 printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index);
2018 pcre2_code_free_8(code);
2019 return 0;
2020 }
2021
2022 length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right);
2023
2024 if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
2025 result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
2026 length, current->start_offset - current->skip_left, 0, mdata, NULL);
2027
2028 if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
2029 pcre2_code_free_8(code);
2030 return 0;
2031 }
2032 }
2033
2034 if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
2035 result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
2036 length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
2037
2038 if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
2039 pcre2_code_free_8(code);
2040 return 0;
2041 }
2042 }
2043
2044 pcre2_code_free_8(code);
2045 return 1;
2046}
2047
2048static int invalid_utf8_regression_tests(void)
2049{
2050 const struct invalid_utf8_regression_test_case *current;
2051 pcre2_compile_context_8 *ccontext;
2052 pcre2_match_data_8 *mdata;
2053 int total = 0, successful = 0;
2054 int result;
2055
2056 printf("\nRunning invalid-utf8 JIT regression tests\n");
2057
2058 ccontext = pcre2_compile_context_create_8(NULL);
2059 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2060 mdata = pcre2_match_data_create_8(4, NULL);
2061
2062 for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
2063 /* printf("\nPattern: %s :\n", current->pattern); */
2064 total++;
2065
2066 result = 1;
2067 if (current->pattern[1] != &invalid_utf8_newline_cr)
2068 {
2069 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2070 result = 0;
2071 if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
2072 result = 0;
2073 } else {
2074 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_CR);
2075 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2076 result = 0;
2077 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2078 }
2079
2080 if (result) {
2081 successful++;
2082 }
2083
2084 printf(".");
2085 if ((total % 60) == 0)
2086 printf("\n");
2087 }
2088
2089 if ((total % 60) != 0)
2090 printf("\n");
2091
2092 pcre2_match_data_free_8(mdata);
2093 pcre2_compile_context_free_8(ccontext);
2094
2095 if (total == successful) {
2096 printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
2097 return 0;
2098 } else {
2099 printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2100 return 1;
2101 }
2102}
2103
2104#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */
2105
/* Stand-in used when Unicode support or the 8-bit library is not compiled
in. Always returns 0, the same value the real suite returns on success. */

static int invalid_utf8_regression_tests(void)
{
	/* No test cases exist in this configuration, so nothing can fail. */
	const int failed = 0;

	return failed;
}
2110
2111#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */
2112
2113#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16
2114
/* Shorthands for the option columns of the UTF-16 test case table.
  UDA  compile options: UTF, DOTALL and ANCHORED
  CI   JIT options: complete matching with invalid UTF support
  CPI  JIT options: as CI, plus soft partial matching */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2118
/* One invalid UTF-16 regression test. The table entries initialize the
fields positionally, so the field order must not change. */
struct invalid_utf16_regression_test_case {
	/* Options passed to pcre2_compile_16(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_16(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match modes are run. */
	int jit_compile_options;
	/* Match start offset, counted from the beginning of the unskipped input. */
	int start_offset;
	/* Number of code units dropped from the front of the input. */
	int skip_left;
	/* Number of code units dropped from the end of the input. */
	int skip_right;
	/* Expected match bounds, forwarded to check_invalid_utf_result().
	NOTE(review): -1/-1 presumably means "no match expected" - confirm
	against check_invalid_utf_result(). */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL slot is skipped. */
	const PCRE2_UCHAR16 *pattern[2];
	/* Zero-terminated subject (its length is found by scanning for 0). */
	const PCRE2_UCHAR16 *input;
};
2130
2131static PCRE2_UCHAR16 allany16[] = { '.', 0 };
2132static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
2133static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
2134static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
2135static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
2136static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
2137static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
2138static PCRE2_UCHAR16 generic16[] = { '#', 0xd800, 0xdc00, '#', 0 };
2139static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
2140static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, 0xd800, 0xdc00, 0 };
2141static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, 0xdbff, 0xdfff, 0 };
2142static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, 0xd800, 0xdbff, 0 };
2143static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, 0xdc00, '#', 0 };
2144static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
2145static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
2146static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
2147static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
2148static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
2149static PCRE2_UCHAR16 test16_11[] = { 0xdc00, 0xdc00, 0xd800, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2150static PCRE2_UCHAR16 test16_12[] = { '#', 0xd800, 0xdc00, 0xd800, '#', 0xd800, 0xdc00, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2151
/* Invalid UTF-16 test cases. Columns, in struct field order:
  compile options, JIT options, start offset, skip left, skip right,
  expected match start, expected match end, { pattern 0, pattern 1 }, input.
Blank lines separate groups of entries that share a pattern. */
static const struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
	{ UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
	{ UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_3 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },

	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },
	{ PCRE2_UTF, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },

	/* End marker: the driver loop stops at the first entry whose pattern[0] is NULL. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2201
2202#undef UDA
2203#undef CI
2204#undef CPI
2205
2206static int run_invalid_utf16_test(const struct invalid_utf16_regression_test_case *current,
2207 int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
2208{
2209 pcre2_code_16 *code;
2210 int result, errorcode;
2211 PCRE2_SIZE length, erroroffset;
2212 const PCRE2_UCHAR16 *input;
2213 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);
2214
2215 if (current->pattern[i] == NULL)
2216 return 1;
2217
2218 code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
2219 current->compile_options, &errorcode, &erroroffset, ccontext);
2220
2221 if (!code) {
2222 printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
2223 return 0;
2224 }
2225
2226 if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) {
2227 printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index);
2228 pcre2_code_free_16(code);
2229 return 0;
2230 }
2231
2232 input = current->input;
2233 length = 0;
2234
2235 while (*input++ != 0)
2236 length++;
2237
2238 length -= current->skip_left + current->skip_right;
2239
2240 if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
2241 result = pcre2_jit_match_16(code, (current->input + current->skip_left),
2242 length, current->start_offset - current->skip_left, 0, mdata, NULL);
2243
2244 if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
2245 pcre2_code_free_16(code);
2246 return 0;
2247 }
2248 }
2249
2250 if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
2251 result = pcre2_jit_match_16(code, (current->input + current->skip_left),
2252 length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
2253
2254 if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
2255 pcre2_code_free_16(code);
2256 return 0;
2257 }
2258 }
2259
2260 pcre2_code_free_16(code);
2261 return 1;
2262}
2263
2264static int invalid_utf16_regression_tests(void)
2265{
2266 const struct invalid_utf16_regression_test_case *current;
2267 pcre2_compile_context_16 *ccontext;
2268 pcre2_match_data_16 *mdata;
2269 int total = 0, successful = 0;
2270 int result;
2271
2272 printf("\nRunning invalid-utf16 JIT regression tests\n");
2273
2274 ccontext = pcre2_compile_context_create_16(NULL);
2275 pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
2276 mdata = pcre2_match_data_create_16(4, NULL);
2277
2278 for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
2279 /* printf("\nPattern: %s :\n", current->pattern); */
2280 total++;
2281
2282 result = 1;
2283 if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
2284 result = 0;
2285 if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
2286 result = 0;
2287
2288 if (result) {
2289 successful++;
2290 }
2291
2292 printf(".");
2293 if ((total % 60) == 0)
2294 printf("\n");
2295 }
2296
2297 if ((total % 60) != 0)
2298 printf("\n");
2299
2300 pcre2_match_data_free_16(mdata);
2301 pcre2_compile_context_free_16(ccontext);
2302
2303 if (total == successful) {
2304 printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");
2305 return 0;
2306 } else {
2307 printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2308 return 1;
2309 }
2310}
2311
2312#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */
2313
/* Stand-in used when Unicode support or the 16-bit library is not compiled
in. Always returns 0, the same value the real suite returns on success. */

static int invalid_utf16_regression_tests(void)
{
	/* No test cases exist in this configuration, so nothing can fail. */
	const int failed = 0;

	return failed;
}
2318
2319#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
2320
2321#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
2322
/* Shorthands for the option columns of the UTF-32 test case table.
  UDA  compile options: UTF, DOTALL and ANCHORED
  CI   JIT options: complete matching with invalid UTF support
  CPI  JIT options: as CI, plus soft partial matching */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2326
/* One invalid UTF-32 regression test. The table entries initialize the
fields positionally, so the field order must not change. */
struct invalid_utf32_regression_test_case {
	/* Options passed to pcre2_compile_32(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_32(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match modes are run. */
	int jit_compile_options;
	/* Match start offset, counted from the beginning of the unskipped input. */
	int start_offset;
	/* Number of code units dropped from the front of the input. */
	int skip_left;
	/* Number of code units dropped from the end of the input. */
	int skip_right;
	/* Expected match bounds, forwarded to check_invalid_utf_result().
	NOTE(review): -1/-1 presumably means "no match expected" - confirm
	against check_invalid_utf_result(). */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL slot is skipped. */
	const PCRE2_UCHAR32 *pattern[2];
	/* Zero-terminated subject (its length is found by scanning for 0). */
	const PCRE2_UCHAR32 *input;
};
2338
2339static PCRE2_UCHAR32 allany32[] = { '.', 0 };
2340static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
2341static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
2342static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
2343static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
2344static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
2345static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
2346static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x110000, 0x10ffff, 0 };
2347static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0xdfff, 0xd800, 0 };
2348static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 };
2349static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 };
2350static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 };
2351static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 };
2352
/* Invalid UTF-32 test cases. Columns, in struct field order:
  compile options, JIT options, start offset, skip left, skip right,
  expected match start, expected match end, { pattern 0, pattern 1 }, input.
Blank lines separate groups of entries that share a pattern. */
static const struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
	{ UDA, CPI, 6, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 },

	/* End marker: the driver loop stops at the first entry whose pattern[0] is NULL. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2388
2389#undef UDA
2390#undef CI
2391#undef CPI
2392
2393static int run_invalid_utf32_test(const struct invalid_utf32_regression_test_case *current,
2394 int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
2395{
2396 pcre2_code_32 *code;
2397 int result, errorcode;
2398 PCRE2_SIZE length, erroroffset;
2399 const PCRE2_UCHAR32 *input;
2400 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);
2401
2402 if (current->pattern[i] == NULL)
2403 return 1;
2404
2405 code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
2406 current->compile_options, &errorcode, &erroroffset, ccontext);
2407
2408 if (!code) {
2409 printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
2410 return 0;
2411 }
2412
2413 if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
2414 printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index);
2415 pcre2_code_free_32(code);
2416 return 0;
2417 }
2418
2419 input = current->input;
2420 length = 0;
2421
2422 while (*input++ != 0)
2423 length++;
2424
2425 length -= current->skip_left + current->skip_right;
2426
2427 if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
2428 result = pcre2_jit_match_32(code, (current->input + current->skip_left),
2429 length, current->start_offset - current->skip_left, 0, mdata, NULL);
2430
2431 if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
2432 pcre2_code_free_32(code);
2433 return 0;
2434 }
2435 }
2436
2437 if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
2438 result = pcre2_jit_match_32(code, (current->input + current->skip_left),
2439 length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
2440
2441 if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
2442 pcre2_code_free_32(code);
2443 return 0;
2444 }
2445 }
2446
2447 pcre2_code_free_32(code);
2448 return 1;
2449}
2450
2451static int invalid_utf32_regression_tests(void)
2452{
2453 const struct invalid_utf32_regression_test_case *current;
2454 pcre2_compile_context_32 *ccontext;
2455 pcre2_match_data_32 *mdata;
2456 int total = 0, successful = 0;
2457 int result;
2458
2459 printf("\nRunning invalid-utf32 JIT regression tests\n");
2460
2461 ccontext = pcre2_compile_context_create_32(NULL);
2462 pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);
2463 mdata = pcre2_match_data_create_32(4, NULL);
2464
2465 for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
2466 /* printf("\nPattern: %s :\n", current->pattern); */
2467 total++;
2468
2469 result = 1;
2470 if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
2471 result = 0;
2472 if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
2473 result = 0;
2474
2475 if (result) {
2476 successful++;
2477 }
2478
2479 printf(".");
2480 if ((total % 60) == 0)
2481 printf("\n");
2482 }
2483
2484 if ((total % 60) != 0)
2485 printf("\n");
2486
2487 pcre2_match_data_free_32(mdata);
2488 pcre2_compile_context_free_32(ccontext);
2489
2490 if (total == successful) {
2491 printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
2492 return 0;
2493 } else {
2494 printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2495 return 1;
2496 }
2497}
2498
2499#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
2500
/* Stand-in used when Unicode support or the 32-bit library is not compiled
in. Always returns 0, the same value the real suite returns on success. */

static int invalid_utf32_regression_tests(void)
{
	/* No test cases exist in this configuration, so nothing can fail. */
	const int failed = 0;

	return failed;
}
2505
2506#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
2507
2508/* End of pcre2_jit_test.c */