Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 1 | /*************************************************************************** |
| 2 | Fuzzer driver for PCRE2. Given an arbitrary string of bytes and a length, it |
| 3 | tries to compile and match it, deriving options from the string itself. If |
| 4 | STANDALONE is defined, a main program that calls the driver with the contents |
| 5 | of specified files is compiled, and commentary on what is happening is output. |
| 6 | If an argument starts with '=' the rest of it is taken as a literal string |
| 7 | rather than a file name. This allows easy testing of short strings. |
| 8 | |
| 9 | Written by Philip Hazel, October 2016 |
| 10 | ***************************************************************************/ |
| 11 | |
| 12 | #include <errno.h> |
| 13 | #include <stdio.h> |
| 14 | #include <stdlib.h> |
| 15 | #include <string.h> |
| 16 | |
| 17 | #define PCRE2_CODE_UNIT_WIDTH 8 |
| 18 | #include "pcre2.h" |
| 19 | |
/* Upper limit on the subject length handed to the matching functions. */

#define MAX_MATCH_SIZE 1000

/* Number of int elements in the workspace vector given to pcre2_dfa_match(). */

#define DFA_WORKSPACE_COUNT 100

/* Compile-time option bits that may be derived from the input. Any bit not
listed here is masked off before pcre2_compile() is called. */

#define ALLOWED_COMPILE_OPTIONS \
  (PCRE2_ALLOW_EMPTY_CLASS   | \
   PCRE2_ALT_BSUX            | \
   PCRE2_ALT_CIRCUMFLEX      | \
   PCRE2_ALT_VERBNAMES       | \
   PCRE2_ANCHORED            | \
   PCRE2_AUTO_CALLOUT        | \
   PCRE2_CASELESS            | \
   PCRE2_DOLLAR_ENDONLY      | \
   PCRE2_DOTALL              | \
   PCRE2_DUPNAMES            | \
   PCRE2_ENDANCHORED         | \
   PCRE2_EXTENDED            | \
   PCRE2_FIRSTLINE           | \
   PCRE2_MATCH_UNSET_BACKREF | \
   PCRE2_MULTILINE           | \
   PCRE2_NEVER_BACKSLASH_C   | \
   PCRE2_NO_AUTO_CAPTURE     | \
   PCRE2_NO_AUTO_POSSESS     | \
   PCRE2_NO_DOTSTAR_ANCHOR   | \
   PCRE2_NO_START_OPTIMIZE   | \
   PCRE2_UCP                 | \
   PCRE2_UNGREEDY            | \
   PCRE2_USE_OFFSET_LIMIT    | \
   PCRE2_UTF)

/* Match-time option bits that may be derived from the input. */

#define ALLOWED_MATCH_OPTIONS \
  (PCRE2_ANCHORED         | \
   PCRE2_ENDANCHORED      | \
   PCRE2_NO_JIT           | \
   PCRE2_NOTBOL           | \
   PCRE2_NOTEMPTY         | \
   PCRE2_NOTEMPTY_ATSTART | \
   PCRE2_NOTEOL           | \
   PCRE2_PARTIAL_HARD     | \
   PCRE2_PARTIAL_SOFT)
| 38 | |
| 39 | /* This is the callout function. Its only purpose is to halt matching if there |
| 40 | are more than 100 callouts, as one way of stopping too much time being spent on |
| 41 | fruitless matches. The callout data is a pointer to the counter. */ |
| 42 | |
| 43 | static int callout_function(pcre2_callout_block *cb, void *callout_data) |
| 44 | { |
| 45 | (void)cb; /* Avoid unused parameter warning */ |
| 46 | *((uint32_t *)callout_data) += 1; |
| 47 | return (*((uint32_t *)callout_data) > 100)? PCRE2_ERROR_CALLOUT : 0; |
| 48 | } |
| 49 | |
| 50 | /* Putting in this apparently unnecessary prototype prevents gcc from giving a |
| 51 | "no previous prototype" warning when compiling at high warning level. */ |
| 52 | |
| 53 | int LLVMFuzzerTestOneInput(const unsigned char *, size_t); |
| 54 | |
| 55 | /* Here's the driving function. */ |
| 56 | |
| 57 | int LLVMFuzzerTestOneInput(const unsigned char *data, size_t size) |
| 58 | { |
| 59 | uint32_t compile_options; |
| 60 | uint32_t match_options; |
| 61 | pcre2_match_data *match_data = NULL; |
| 62 | pcre2_match_context *match_context = NULL; |
| 63 | size_t match_size; |
| 64 | int dfa_workspace[DFA_WORKSPACE_COUNT]; |
| 65 | int r1, r2; |
| 66 | int i; |
| 67 | |
| 68 | if (size < 1) return 0; |
| 69 | |
| 70 | /* Limiting the length of the subject for matching stops fruitless searches |
| 71 | in large trees taking too much time. */ |
| 72 | |
| 73 | match_size = (size > MAX_MATCH_SIZE)? MAX_MATCH_SIZE : size; |
| 74 | |
| 75 | /* Figure out some options to use. Initialize the random number to ensure |
| 76 | repeatability. Ensure that we get a 32-bit unsigned random number for testing |
| 77 | options. (RAND_MAX is required to be at least 32767, but is commonly |
| 78 | 2147483647, which excludes the top bit.) */ |
| 79 | |
| 80 | srand((unsigned int)(data[size/2])); |
| 81 | r1 = rand(); |
| 82 | r2 = rand(); |
| 83 | |
| 84 | /* Ensure that all undefined option bits are zero (waste of time trying them) |
| 85 | and also that PCRE2_NO_UTF_CHECK is unset, as there is no guarantee that the |
| 86 | input is UTF-8. Also unset PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as there is no |
| 87 | reason to disallow UTF and UCP. Force PCRE2_NEVER_BACKSLASH_C to be set because |
| 88 | \C in random patterns is highly likely to cause a crash. */ |
| 89 | |
| 90 | compile_options = |
| 91 | ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_COMPILE_OPTIONS) | |
| 92 | PCRE2_NEVER_BACKSLASH_C; |
| 93 | |
| 94 | match_options = |
| 95 | ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_MATCH_OPTIONS); |
| 96 | |
| 97 | /* Discard partial matching if PCRE2_ENDANCHORED is set, because they are not |
| 98 | allowed together and just give an immediate error return. */ |
| 99 | |
| 100 | if (((compile_options|match_options) & PCRE2_ENDANCHORED) != 0) |
| 101 | match_options &= ~(PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT); |
| 102 | |
| 103 | /* Do the compile with and without the options, and after a successful compile, |
| 104 | likewise do the match with and without the options. */ |
| 105 | |
| 106 | for (i = 0; i < 2; i++) |
| 107 | { |
| 108 | uint32_t callout_count; |
| 109 | int errorcode; |
| 110 | PCRE2_SIZE erroroffset; |
| 111 | pcre2_code *code; |
| 112 | |
| 113 | #ifdef STANDALONE |
| 114 | printf("Compile options %.8x never_backslash_c", compile_options); |
| 115 | printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", |
| 116 | ((compile_options & PCRE2_ALT_BSUX) != 0)? ",alt_bsux" : "", |
| 117 | ((compile_options & PCRE2_ALT_CIRCUMFLEX) != 0)? ",alt_circumflex" : "", |
| 118 | ((compile_options & PCRE2_ALT_VERBNAMES) != 0)? ",alt_verbnames" : "", |
| 119 | ((compile_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? ",allow_empty_class" : "", |
| 120 | ((compile_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", |
| 121 | ((compile_options & PCRE2_AUTO_CALLOUT) != 0)? ",auto_callout" : "", |
| 122 | ((compile_options & PCRE2_CASELESS) != 0)? ",caseless" : "", |
| 123 | ((compile_options & PCRE2_DOLLAR_ENDONLY) != 0)? ",dollar_endonly" : "", |
| 124 | ((compile_options & PCRE2_DOTALL) != 0)? ",dotall" : "", |
| 125 | ((compile_options & PCRE2_DUPNAMES) != 0)? ",dupnames" : "", |
| 126 | ((compile_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", |
| 127 | ((compile_options & PCRE2_EXTENDED) != 0)? ",extended" : "", |
| 128 | ((compile_options & PCRE2_FIRSTLINE) != 0)? ",firstline" : "", |
| 129 | ((compile_options & PCRE2_MATCH_UNSET_BACKREF) != 0)? ",match_unset_backref" : "", |
| 130 | ((compile_options & PCRE2_MULTILINE) != 0)? ",multiline" : "", |
| 131 | ((compile_options & PCRE2_NEVER_UCP) != 0)? ",never_ucp" : "", |
| 132 | ((compile_options & PCRE2_NEVER_UTF) != 0)? ",never_utf" : "", |
| 133 | ((compile_options & PCRE2_NO_AUTO_CAPTURE) != 0)? ",no_auto_capture" : "", |
| 134 | ((compile_options & PCRE2_NO_AUTO_POSSESS) != 0)? ",no_auto_possess" : "", |
| 135 | ((compile_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)? ",no_dotstar_anchor" : "", |
| 136 | ((compile_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", |
| 137 | ((compile_options & PCRE2_NO_START_OPTIMIZE) != 0)? ",no_start_optimize" : "", |
| 138 | ((compile_options & PCRE2_UCP) != 0)? ",ucp" : "", |
| 139 | ((compile_options & PCRE2_UNGREEDY) != 0)? ",ungreedy" : "", |
| 140 | ((compile_options & PCRE2_USE_OFFSET_LIMIT) != 0)? ",use_offset_limit" : "", |
| 141 | ((compile_options & PCRE2_UTF) != 0)? ",utf" : ""); |
| 142 | #endif |
| 143 | |
| 144 | code = pcre2_compile((PCRE2_SPTR)data, (PCRE2_SIZE)size, compile_options, |
| 145 | &errorcode, &erroroffset, NULL); |
| 146 | |
| 147 | /* Compilation succeeded */ |
| 148 | |
| 149 | if (code != NULL) |
| 150 | { |
| 151 | int j; |
| 152 | uint32_t save_match_options = match_options; |
| 153 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 154 | #ifdef SUPPORT_JIT |
| 155 | pcre2_jit_compile(code, PCRE2_JIT_COMPLETE); |
| 156 | #endif |
| 157 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 158 | /* Create match data and context blocks only when we first need them. Set |
| 159 | low match and depth limits to avoid wasting too much searching large |
| 160 | pattern trees. Almost all matches are going to fail. */ |
| 161 | |
| 162 | if (match_data == NULL) |
| 163 | { |
| 164 | match_data = pcre2_match_data_create(32, NULL); |
| 165 | if (match_data == NULL) |
| 166 | { |
| 167 | #ifdef STANDALONE |
| 168 | printf("** Failed to create match data block\n"); |
| 169 | #endif |
| 170 | return 0; |
| 171 | } |
| 172 | } |
| 173 | |
| 174 | if (match_context == NULL) |
| 175 | { |
| 176 | match_context = pcre2_match_context_create(NULL); |
| 177 | if (match_context == NULL) |
| 178 | { |
| 179 | #ifdef STANDALONE |
| 180 | printf("** Failed to create match context block\n"); |
| 181 | #endif |
| 182 | return 0; |
| 183 | } |
| 184 | (void)pcre2_set_match_limit(match_context, 100); |
| 185 | (void)pcre2_set_depth_limit(match_context, 100); |
| 186 | (void)pcre2_set_callout(match_context, callout_function, &callout_count); |
| 187 | } |
| 188 | |
| 189 | /* Match twice, with and without options. */ |
| 190 | |
| 191 | for (j = 0; j < 2; j++) |
| 192 | { |
| 193 | #ifdef STANDALONE |
| 194 | printf("Match options %.8x", match_options); |
| 195 | printf("%s%s%s%s%s%s%s%s%s%s\n", |
| 196 | ((match_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", |
| 197 | ((match_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", |
| 198 | ((match_options & PCRE2_NO_JIT) != 0)? ",no_jit" : "", |
| 199 | ((match_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", |
| 200 | ((match_options & PCRE2_NOTBOL) != 0)? ",notbol" : "", |
| 201 | ((match_options & PCRE2_NOTEMPTY) != 0)? ",notempty" : "", |
| 202 | ((match_options & PCRE2_NOTEMPTY_ATSTART) != 0)? ",notempty_atstart" : "", |
| 203 | ((match_options & PCRE2_NOTEOL) != 0)? ",noteol" : "", |
| 204 | ((match_options & PCRE2_PARTIAL_HARD) != 0)? ",partial_hard" : "", |
| 205 | ((match_options & PCRE2_PARTIAL_SOFT) != 0)? ",partial_soft" : ""); |
| 206 | #endif |
| 207 | |
| 208 | callout_count = 0; |
| 209 | errorcode = pcre2_match(code, (PCRE2_SPTR)data, (PCRE2_SIZE)match_size, 0, |
| 210 | match_options, match_data, match_context); |
| 211 | |
| 212 | #ifdef STANDALONE |
| 213 | if (errorcode >= 0) printf("Match returned %d\n", errorcode); else |
| 214 | { |
| 215 | unsigned char buffer[256]; |
| 216 | pcre2_get_error_message(errorcode, buffer, 256); |
| 217 | printf("Match failed: error %d: %s\n", errorcode, buffer); |
| 218 | } |
| 219 | #endif |
| 220 | |
| 221 | match_options = 0; /* For second time */ |
| 222 | } |
| 223 | |
| 224 | /* Match with DFA twice, with and without options. */ |
| 225 | |
| 226 | match_options = save_match_options & ~PCRE2_NO_JIT; /* Not valid for DFA */ |
| 227 | |
| 228 | for (j = 0; j < 2; j++) |
| 229 | { |
| 230 | #ifdef STANDALONE |
| 231 | printf("DFA match options %.8x", match_options); |
| 232 | printf("%s%s%s%s%s%s%s%s%s\n", |
| 233 | ((match_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", |
| 234 | ((match_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", |
| 235 | ((match_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", |
| 236 | ((match_options & PCRE2_NOTBOL) != 0)? ",notbol" : "", |
| 237 | ((match_options & PCRE2_NOTEMPTY) != 0)? ",notempty" : "", |
| 238 | ((match_options & PCRE2_NOTEMPTY_ATSTART) != 0)? ",notempty_atstart" : "", |
| 239 | ((match_options & PCRE2_NOTEOL) != 0)? ",noteol" : "", |
| 240 | ((match_options & PCRE2_PARTIAL_HARD) != 0)? ",partial_hard" : "", |
| 241 | ((match_options & PCRE2_PARTIAL_SOFT) != 0)? ",partial_soft" : ""); |
| 242 | #endif |
| 243 | |
| 244 | callout_count = 0; |
| 245 | errorcode = pcre2_dfa_match(code, (PCRE2_SPTR)data, |
| 246 | (PCRE2_SIZE)match_size, 0, match_options, match_data, match_context, |
| 247 | dfa_workspace, DFA_WORKSPACE_COUNT); |
| 248 | |
| 249 | #ifdef STANDALONE |
| 250 | if (errorcode >= 0) printf("Match returned %d\n", errorcode); else |
| 251 | { |
| 252 | unsigned char buffer[256]; |
| 253 | pcre2_get_error_message(errorcode, buffer, 256); |
| 254 | printf("Match failed: error %d: %s\n", errorcode, buffer); |
| 255 | } |
| 256 | #endif |
| 257 | |
| 258 | match_options = 0; /* For second time */ |
| 259 | } |
| 260 | |
| 261 | match_options = save_match_options; /* Reset for the second compile */ |
| 262 | pcre2_code_free(code); |
| 263 | } |
| 264 | |
| 265 | /* Compilation failed */ |
| 266 | |
| 267 | else |
| 268 | { |
| 269 | unsigned char buffer[256]; |
| 270 | pcre2_get_error_message(errorcode, buffer, 256); |
| 271 | #ifdef STANDALONE |
| 272 | printf("Error %d at offset %lu: %s\n", errorcode, erroroffset, buffer); |
| 273 | #else |
| 274 | if (strstr((const char *)buffer, "internal error") != NULL) abort(); |
| 275 | #endif |
| 276 | } |
| 277 | |
| 278 | compile_options = PCRE2_NEVER_BACKSLASH_C; /* For second time */ |
| 279 | } |
| 280 | |
| 281 | if (match_data != NULL) pcre2_match_data_free(match_data); |
| 282 | if (match_context != NULL) pcre2_match_context_free(match_context); |
| 283 | |
| 284 | return 0; |
| 285 | } |
| 286 | |
| 287 | |
/* Optional main program. Each argument is either a file name, whose contents
are passed to LLVMFuzzerTestOneInput(), or, if it starts with '=', a literal
string whose remaining characters are passed instead. Problems with one
argument are reported and processing continues with the next one. The return
value is always 0. */

#ifdef STANDALONE
int main(int argc, char **argv)
{
  int i;

  if (argc < 2)
  {
    printf("** No arguments given\n");
    return 0;
  }

  for (i = 1; i < argc; i++)
  {
    size_t filelen;
    size_t readsize;
    long endpos;
    unsigned char *buffer;
    FILE *f;

    /* Handle a literal string. Copy to an exact size buffer so that checks for
    overrunning work. A zero-length string still gets a one-byte allocation,
    because malloc(0) is allowed to return NULL, which would otherwise be
    reported as an allocation failure. */

    if (argv[i][0] == '=')
    {
      readsize = strlen(argv[i]) - 1;
      printf("------ <Literal> ------\n");
      printf("Length = %lu\n", (unsigned long)readsize);
      printf("%.*s\n", (int)readsize, argv[i]+1);
      buffer = (unsigned char *)malloc((readsize > 0)? readsize : 1);
      if (buffer == NULL)
        printf("** Failed to allocate %lu bytes of memory\n",
          (unsigned long)readsize);
      else
      {
        memcpy(buffer, argv[i]+1, readsize);
        LLVMFuzzerTestOneInput(buffer, readsize);
        free(buffer);
      }
      continue;
    }

    /* Handle a string given in a file */

    f = fopen(argv[i], "rb");
    if (f == NULL)
    {
      printf("** Failed to open %s: %s\n", argv[i], strerror(errno));
      continue;
    }

    printf("------ %s ------\n", argv[i]);

    /* Find the file length. ftell() returns -1 on failure, which must not be
    converted to size_t and used as an allocation size. */

    if (fseek(f, 0, SEEK_END) != 0 || (endpos = ftell(f)) < 0 ||
        fseek(f, 0, SEEK_SET) != 0)
    {
      printf("** Failed to determine the size of %s: %s\n", argv[i],
        strerror(errno));
      fclose(f);
      continue;
    }
    filelen = (size_t)endpos;

    buffer = (unsigned char *)malloc((filelen > 0)? filelen : 1);
    if (buffer == NULL)
    {
      printf("** Failed to allocate %lu bytes of memory\n",
        (unsigned long)filelen);
      fclose(f);
      continue;
    }

    readsize = fread(buffer, 1, filelen, f);
    fclose(f);

    if (readsize != filelen)
      printf("** File size is %lu but fread() returned %lu\n",
        (unsigned long)filelen, (unsigned long)readsize);
    else
    {
      printf("Length = %lu\n", (unsigned long)filelen);
      LLVMFuzzerTestOneInput(buffer, filelen);
    }
    free(buffer);
  }

  return 0;
}
#endif /* STANDALONE */
| 368 | |
| 369 | /* End */ |