blob: d45b6fee9744b2073c23810991db46796188e0d6 [file] [log] [blame]
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes653c2102019-01-09 15:41:36 -080010 New API code Copyright (c) 2016-2018 University of Cambridge
Elliott Hughes9bc971b2018-07-27 13:23:14 -070011
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42#ifdef HAVE_CONFIG_H
43#include "config.h"
44#endif
45
46#include "pcre2_internal.h"
47
/* The option bits that select which kind of foreign pattern is converted.
Exactly one of them has to be set in a call to pcre2_pattern_convert(). */

#define TYPE_OPTIONS \
  (PCRE2_CONVERT_GLOB|PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED)

/* Every option bit that pcre2_pattern_convert() accepts; any other bit is
rejected as a bad option. */

#define ALL_OPTIONS \
  (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \
   PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR|PCRE2_CONVERT_GLOB_NO_STARSTAR| \
   TYPE_OPTIONS)

/* Size, in code units, of the on-stack buffer that is used when the caller
does not supply an output buffer. */

#define DUMMY_BUFFER_SIZE 100
57
/* Fragments of generated pattern text. Each one is built by concatenating the
single-character STR_xxx string macros, so the result is an ordinary string
literal. */

#define STR_BACKSLASH_A \
  STR_BACKSLASH STR_A

#define STR_BACKSLASH_z \
  STR_BACKSLASH STR_z

#define STR_COLON_RIGHT_SQUARE_BRACKET \
  STR_COLON STR_RIGHT_SQUARE_BRACKET

#define STR_DOT_STAR_LOOKBEHIND \
  STR_DOT STR_ASTERISK \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN

#define STR_LOOKAHEAD_NOT_DOT \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK \
  STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS

#define STR_QUERY_s \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS

#define STR_STAR_NUL \
  STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS
67
/* States for range and POSIX processing. The numerical order of the POSIX
states matters: the POSIX converter compares states with < and >=, treating
everything from POSIX_CLASS_NOT_STARTED upwards as "inside a bracket
expression". */

enum {
  RANGE_NOT_STARTED,
  RANGE_STARTING,
  RANGE_STARTED
};

enum {
  POSIX_START_REGEX,        /* Initial state of the POSIX converter */
  POSIX_ANCHORED,           /* Set after a leading ^ in a basic pattern */
  POSIX_NOT_BRACKET,        /* Outside a bracket expression */
  POSIX_CLASS_NOT_STARTED,  /* In brackets, not in a [:name:] item */
  POSIX_CLASS_STARTING,     /* In brackets, just seen an inner [ */
  POSIX_CLASS_STARTED       /* In brackets, seen [: and reading the name */
};
73
/* Macro to add a zero-terminated character string to the output buffer,
checking for overflow. It relies on the invoking function having three
variables in scope: s (a scratch char pointer), p (the current output pointer)
and endp (the limit of the output). If the string does not fit, the macro
returns PCRE2_ERROR_NOMEMORY from the invoking function.

The do { } while (0) wrapper makes an invocation followed by a semicolon behave
as exactly one statement, so the macro is safe as the body of an unbraced
if/else. */

#define PUTCHARS(string) \
  do \
    { \
    for (s = (char *)(string); *s != 0; s++) \
      { \
      if (p >= endp) return PCRE2_ERROR_NOMEMORY; \
      *p++ = *s; \
      } \
    } \
  while (0)
84
85/* Literals that must be escaped: \ ? * + | . ^ $ { } [ ] ( ) */
86
87static const char *pcre2_escaped_literals =
88 STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS
89 STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN
90 STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
91 STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
92 STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS;
93
94/* Recognized escaped metacharacters in POSIX basic patterns. */
95
96static const char *posix_meta_escapes =
97 STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS
98 STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
99 STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9;
100
101
102
103/*************************************************
104* Convert a POSIX pattern *
105*************************************************/
106
107/* This function handles both basic and extended POSIX patterns.
108
109Arguments:
110 pattype the pattern type
111 pattern the pattern
112 plength length in code units
113 utf TRUE if UTF
114 use_buffer where to put the output
115 use_length length of use_buffer
116 bufflenptr where to put the used length
117 dummyrun TRUE if a dummy run
118 ccontext the convert context
119
120Returns: 0 => success
121 !0 => error code
122*/
123
124static int
125convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength,
126 BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
127 PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
128{
129char *s;
130PCRE2_SPTR posix = pattern;
131PCRE2_UCHAR *p = use_buffer;
132PCRE2_UCHAR *pp = p;
133PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */
134PCRE2_SIZE convlength = 0;
135
136uint32_t bracount = 0;
137uint32_t posix_state = POSIX_START_REGEX;
138uint32_t lastspecial = 0;
139BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0;
140BOOL nextisliteral = FALSE;
141
142(void)utf; /* Not used when Unicode not supported */
143(void)ccontext; /* Not currently used */
144
145/* Initialize default for error offset as end of input. */
146
147*bufflenptr = plength;
148PUTCHARS(STR_STAR_NUL);
149
150/* Now scan the input. */
151
152while (plength > 0)
153 {
154 uint32_t c, sc;
155 int clength = 1;
156
157 /* Add in the length of the last item, then, if in the dummy run, pull the
158 pointer back to the start of the (temporary) buffer and then remember the
159 start of the next item. */
160
161 convlength += p - pp;
162 if (dummyrun) p = use_buffer;
163 pp = p;
164
165 /* Pick up the next character */
166
167#ifndef SUPPORT_UNICODE
168 c = *posix;
169#else
170 GETCHARLENTEST(c, posix, clength);
171#endif
172 posix += clength;
173 plength -= clength;
174
175 sc = nextisliteral? 0 : c;
176 nextisliteral = FALSE;
177
178 /* Handle a character within a class. */
179
180 if (posix_state >= POSIX_CLASS_NOT_STARTED)
181 {
182 if (c == CHAR_RIGHT_SQUARE_BRACKET)
183 {
184 PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
185 posix_state = POSIX_NOT_BRACKET;
186 }
187
188 /* Not the end of the class */
189
190 else
191 {
192 switch (posix_state)
193 {
194 case POSIX_CLASS_STARTED:
195 if (c <= 127 && islower(c)) break; /* Remain in started state */
196 posix_state = POSIX_CLASS_NOT_STARTED;
197 if (c == CHAR_COLON && plength > 0 &&
198 *posix == CHAR_RIGHT_SQUARE_BRACKET)
199 {
200 PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET);
201 plength--;
202 posix++;
203 continue; /* With next character after :] */
204 }
205 /* Fall through */
206
207 case POSIX_CLASS_NOT_STARTED:
208 if (c == CHAR_LEFT_SQUARE_BRACKET)
209 posix_state = POSIX_CLASS_STARTING;
210 break;
211
212 case POSIX_CLASS_STARTING:
213 if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED;
214 break;
215 }
216
217 if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH);
218 if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
219 memcpy(p, posix - clength, CU2BYTES(clength));
220 p += clength;
221 }
222 }
223
224 /* Handle a character not within a class. */
225
226 else switch(sc)
227 {
228 case CHAR_LEFT_SQUARE_BRACKET:
229 PUTCHARS(STR_LEFT_SQUARE_BRACKET);
230
231#ifdef NEVER
232 /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does
233 support) but they are not part of POSIX 1003.1. */
234
235 if (plength >= 6)
236 {
237 if (posix[0] == CHAR_LEFT_SQUARE_BRACKET &&
238 posix[1] == CHAR_COLON &&
239 (posix[2] == CHAR_LESS_THAN_SIGN ||
240 posix[2] == CHAR_GREATER_THAN_SIGN) &&
241 posix[3] == CHAR_COLON &&
242 posix[4] == CHAR_RIGHT_SQUARE_BRACKET &&
243 posix[5] == CHAR_RIGHT_SQUARE_BRACKET)
244 {
245 if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY;
246 memcpy(p, posix, CU2BYTES(6));
247 p += 6;
248 posix += 6;
249 plength -= 6;
250 continue; /* With next character */
251 }
252 }
253#endif
254
255 /* Handle start of "normal" character classes */
256
257 posix_state = POSIX_CLASS_NOT_STARTED;
258
259 /* Handle ^ and ] as first characters */
260
261 if (plength > 0)
262 {
263 if (*posix == CHAR_CIRCUMFLEX_ACCENT)
264 {
265 posix++;
266 plength--;
267 PUTCHARS(STR_CIRCUMFLEX_ACCENT);
268 }
269 if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET)
270 {
271 posix++;
272 plength--;
273 PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
274 }
275 }
276 break;
277
278 case CHAR_BACKSLASH:
Elliott Hughes0c26e192019-08-07 12:24:46 -0700279 if (plength == 0) return PCRE2_ERROR_END_BACKSLASH;
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700280 if (extended) nextisliteral = TRUE; else
281 {
282 if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL)
283 {
284 if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH);
285 if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
286 lastspecial = *p++ = *posix++;
287 plength--;
288 }
289 else nextisliteral = TRUE;
290 }
291 break;
292
293 case CHAR_RIGHT_PARENTHESIS:
294 if (!extended || bracount == 0) goto ESCAPE_LITERAL;
295 bracount--;
296 goto COPY_SPECIAL;
297
298 case CHAR_LEFT_PARENTHESIS:
299 bracount++;
300 /* Fall through */
301
302 case CHAR_QUESTION_MARK:
303 case CHAR_PLUS:
304 case CHAR_LEFT_CURLY_BRACKET:
305 case CHAR_RIGHT_CURLY_BRACKET:
306 case CHAR_VERTICAL_LINE:
307 if (!extended) goto ESCAPE_LITERAL;
308 /* Fall through */
309
310 case CHAR_DOT:
311 case CHAR_DOLLAR_SIGN:
312 posix_state = POSIX_NOT_BRACKET;
313 COPY_SPECIAL:
314 lastspecial = c;
315 if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
316 *p++ = c;
317 break;
318
319 case CHAR_ASTERISK:
320 if (lastspecial != CHAR_ASTERISK)
321 {
322 if (!extended && (posix_state < POSIX_NOT_BRACKET ||
323 lastspecial == CHAR_LEFT_PARENTHESIS))
324 goto ESCAPE_LITERAL;
325 goto COPY_SPECIAL;
326 }
327 break; /* Ignore second and subsequent asterisks */
328
329 case CHAR_CIRCUMFLEX_ACCENT:
330 if (extended) goto COPY_SPECIAL;
331 if (posix_state == POSIX_START_REGEX ||
332 lastspecial == CHAR_LEFT_PARENTHESIS)
333 {
334 posix_state = POSIX_ANCHORED;
335 goto COPY_SPECIAL;
336 }
337 /* Fall through */
338
339 default:
340 if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
341 {
342 ESCAPE_LITERAL:
343 PUTCHARS(STR_BACKSLASH);
344 }
345 lastspecial = 0xff; /* Indicates nothing special */
346 if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
347 memcpy(p, posix - clength, CU2BYTES(clength));
348 p += clength;
349 posix_state = POSIX_NOT_BRACKET;
350 break;
351 }
352 }
353
354if (posix_state >= POSIX_CLASS_NOT_STARTED)
355 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
356convlength += p - pp; /* Final segment */
357*bufflenptr = convlength;
358*p++ = 0;
359return 0;
360}
361
362
363/*************************************************
364* Convert a glob pattern *
365*************************************************/
366
367/* Context for writing the output into a buffer. */
368
369typedef struct pcre2_output_context {
370 PCRE2_UCHAR *output; /* current output position */
371 PCRE2_SPTR output_end; /* output end */
372 PCRE2_SIZE output_size; /* size of the output */
373 uint8_t out_str[8]; /* string copied to the output */
374} pcre2_output_context;
375
376
377/* Write a character into the output.
378
379Arguments:
380 out output context
381 chr the next character
382*/
383
384static void
385convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr)
386{
387out->output_size++;
388
389if (out->output < out->output_end)
390 *out->output++ = chr;
391}
392
393
394/* Write a string into the output.
395
396Arguments:
397 out output context
398 length length of out->out_str
399*/
400
401static void
402convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length)
403{
404uint8_t *out_str = out->out_str;
405PCRE2_UCHAR *output = out->output;
406PCRE2_SPTR output_end = out->output_end;
407PCRE2_SIZE output_size = out->output_size;
408
409do
410 {
411 output_size++;
412
413 if (output < output_end)
414 *output++ = *out_str++;
415 }
416while (--length != 0);
417
418out->output = output;
419out->output_size = output_size;
420}
421
422
423/* Prints the separator into the output.
424
425Arguments:
426 out output context
427 separator glob separator
428 with_escape backslash is needed before separator
429*/
430
431static void
432convert_glob_print_separator(pcre2_output_context *out,
433 PCRE2_UCHAR separator, BOOL with_escape)
434{
435if (with_escape)
436 convert_glob_write(out, CHAR_BACKSLASH);
437
438convert_glob_write(out, separator);
439}
440
441
442/* Prints a wildcard into the output.
443
444Arguments:
445 out output context
446 separator glob separator
447 with_escape backslash is needed before separator
448*/
449
450static void
451convert_glob_print_wildcard(pcre2_output_context *out,
452 PCRE2_UCHAR separator, BOOL with_escape)
453{
454out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
455out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
456convert_glob_write_str(out, 2);
457
458convert_glob_print_separator(out, separator, with_escape);
459
460convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
461}
462
463
464/* Parse a posix class.
465
466Arguments:
467 from starting point of scanning the range
468 pattern_end end of pattern
469 out output context
470
471Returns: >0 => class index
472 0 => malformed class
473*/
474
475static int
476convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
477 pcre2_output_context *out)
478{
479static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:"
480 "graph:lower:print:punct:space:upper:word:xdigit:";
481PCRE2_SPTR start = *from + 1;
482PCRE2_SPTR pattern = start;
483const char *class_ptr;
484PCRE2_UCHAR c;
485int class_index;
486
487while (TRUE)
488 {
489 if (pattern >= pattern_end) return 0;
490
491 c = *pattern++;
492
493 if (c < CHAR_a || c > CHAR_z) break;
494 }
495
496if (c != CHAR_COLON || pattern >= pattern_end ||
497 *pattern != CHAR_RIGHT_SQUARE_BRACKET)
498 return 0;
499
500class_ptr = posix_classes;
501class_index = 1;
502
503while (TRUE)
504 {
505 if (*class_ptr == CHAR_NUL) return 0;
506
507 pattern = start;
508
509 while (*pattern == (PCRE2_UCHAR) *class_ptr)
510 {
511 if (*pattern == CHAR_COLON)
512 {
513 pattern += 2;
514 start -= 2;
515
516 do convert_glob_write(out, *start++); while (start < pattern);
517
518 *from = pattern;
519 return class_index;
520 }
521 pattern++;
522 class_ptr++;
523 }
524
525 while (*class_ptr != CHAR_COLON) class_ptr++;
526 class_ptr++;
527 class_index++;
528 }
529}
530
531/* Checks whether the character is in the class.
532
533Arguments:
534 class_index class index
535 c character
536
537Returns: !0 => character is found in the class
538 0 => otherwise
539*/
540
541static BOOL
542convert_glob_char_in_class(int class_index, PCRE2_UCHAR c)
543{
544switch (class_index)
545 {
546 case 1: return isalnum(c);
547 case 2: return isalpha(c);
548 case 3: return 1;
549 case 4: return c == CHAR_HT || c == CHAR_SPACE;
550 case 5: return iscntrl(c);
551 case 6: return isdigit(c);
552 case 7: return isgraph(c);
553 case 8: return islower(c);
554 case 9: return isprint(c);
555 case 10: return ispunct(c);
556 case 11: return isspace(c);
557 case 12: return isupper(c);
558 case 13: return isalnum(c) || c == CHAR_UNDERSCORE;
559 default: return isxdigit(c);
560 }
561}
562
563/* Parse a range of characters.
564
565Arguments:
566 from starting point of scanning the range
567 pattern_end end of pattern
568 out output context
569 separator glob separator
570 with_escape backslash is needed before separator
571
572Returns: 0 => success
573 !0 => error code
574*/
575
576static int
577convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
578 pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
579 BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
580{
581BOOL is_negative = FALSE;
582BOOL separator_seen = FALSE;
583BOOL has_prev_c;
584PCRE2_SPTR pattern = *from;
585PCRE2_SPTR char_start = NULL;
586uint32_t c, prev_c;
587int len, class_index;
588
589(void)utf; /* Avoid compiler warning. */
590
591if (pattern >= pattern_end)
592 {
593 *from = pattern;
594 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
595 }
596
597if (*pattern == CHAR_EXCLAMATION_MARK
598 || *pattern == CHAR_CIRCUMFLEX_ACCENT)
599 {
600 pattern++;
601
602 if (pattern >= pattern_end)
603 {
604 *from = pattern;
605 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
606 }
607
608 is_negative = TRUE;
609
610 out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
611 out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
612 len = 2;
613
614 if (!no_wildsep)
615 {
616 if (with_escape)
617 {
618 out->out_str[len] = CHAR_BACKSLASH;
619 len++;
620 }
621 out->out_str[len] = (uint8_t) separator;
622 }
623
624 convert_glob_write_str(out, len + 1);
625 }
626else
627 convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);
628
629has_prev_c = FALSE;
630prev_c = 0;
631
632if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
633 {
634 out->out_str[0] = CHAR_BACKSLASH;
635 out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET;
636 convert_glob_write_str(out, 2);
637 has_prev_c = TRUE;
638 prev_c = CHAR_RIGHT_SQUARE_BRACKET;
639 pattern++;
640 }
641
642while (pattern < pattern_end)
643 {
644 char_start = pattern;
645 GETCHARINCTEST(c, pattern);
646
647 if (c == CHAR_RIGHT_SQUARE_BRACKET)
648 {
649 convert_glob_write(out, c);
650
651 if (!is_negative && !no_wildsep && separator_seen)
652 {
653 out->out_str[0] = CHAR_LEFT_PARENTHESIS;
654 out->out_str[1] = CHAR_QUESTION_MARK;
655 out->out_str[2] = CHAR_LESS_THAN_SIGN;
656 out->out_str[3] = CHAR_EXCLAMATION_MARK;
657 convert_glob_write_str(out, 4);
658
659 convert_glob_print_separator(out, separator, with_escape);
660 convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
661 }
662
663 *from = pattern;
664 return 0;
665 }
666
667 if (pattern >= pattern_end) break;
668
669 if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
670 {
671 *from = pattern;
672 class_index = convert_glob_parse_class(from, pattern_end, out);
673
674 if (class_index != 0)
675 {
676 pattern = *from;
677
678 has_prev_c = FALSE;
679 prev_c = 0;
680
681 if (!is_negative &&
682 convert_glob_char_in_class (class_index, separator))
683 separator_seen = TRUE;
684 continue;
685 }
686 }
687 else if (c == CHAR_MINUS && has_prev_c &&
688 *pattern != CHAR_RIGHT_SQUARE_BRACKET)
689 {
690 convert_glob_write(out, CHAR_MINUS);
691
692 char_start = pattern;
693 GETCHARINCTEST(c, pattern);
694
695 if (pattern >= pattern_end) break;
696
697 if (escape != 0 && c == escape)
698 {
699 char_start = pattern;
700 GETCHARINCTEST(c, pattern);
701 }
702 else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
703 {
704 *from = pattern;
705 return PCRE2_ERROR_CONVERT_SYNTAX;
706 }
707
708 if (prev_c > c)
709 {
710 *from = pattern;
711 return PCRE2_ERROR_CONVERT_SYNTAX;
712 }
713
714 if (prev_c < separator && separator < c) separator_seen = TRUE;
715
716 has_prev_c = FALSE;
717 prev_c = 0;
718 }
719 else
720 {
721 if (escape != 0 && c == escape)
722 {
723 char_start = pattern;
724 GETCHARINCTEST(c, pattern);
725
726 if (pattern >= pattern_end) break;
727 }
728
729 has_prev_c = TRUE;
730 prev_c = c;
731 }
732
733 if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
734 c == CHAR_BACKSLASH || c == CHAR_MINUS)
735 convert_glob_write(out, CHAR_BACKSLASH);
736
737 if (c == separator) separator_seen = TRUE;
738
739 do convert_glob_write(out, *char_start++); while (char_start < pattern);
740 }
741
742*from = pattern;
743return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
744}
745
746
747/* Prints a (*COMMIT) into the output.
748
749Arguments:
750 out output context
751*/
752
753static void
754convert_glob_print_commit(pcre2_output_context *out)
755{
756out->out_str[0] = CHAR_LEFT_PARENTHESIS;
757out->out_str[1] = CHAR_ASTERISK;
758out->out_str[2] = CHAR_C;
759out->out_str[3] = CHAR_O;
760out->out_str[4] = CHAR_M;
761out->out_str[5] = CHAR_M;
762out->out_str[6] = CHAR_I;
763out->out_str[7] = CHAR_T;
764convert_glob_write_str(out, 8);
765convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
766}
767
768
769/* Bash glob converter.
770
771Arguments:
772 pattype the pattern type
773 pattern the pattern
774 plength length in code units
775 utf TRUE if UTF
776 use_buffer where to put the output
777 use_length length of use_buffer
778 bufflenptr where to put the used length
779 dummyrun TRUE if a dummy run
780 ccontext the convert context
781
782Returns: 0 => success
783 !0 => error code
784*/
785
786static int
787convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,
788 BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
789 PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
790{
791pcre2_output_context out;
792PCRE2_SPTR pattern_start = pattern;
793PCRE2_SPTR pattern_end = pattern + plength;
794PCRE2_UCHAR separator = ccontext->glob_separator;
795PCRE2_UCHAR escape = ccontext->glob_escape;
796PCRE2_UCHAR c;
797BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
798BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
799BOOL in_atomic = FALSE;
800BOOL after_starstar = FALSE;
801BOOL no_slash_z = FALSE;
802BOOL with_escape, is_start, after_separator;
803int result = 0;
804
805(void)utf; /* Avoid compiler warning. */
806
807#ifdef SUPPORT_UNICODE
808if (utf && (separator >= 128 || escape >= 128))
809 {
810 /* Currently only ASCII characters are supported. */
811 *bufflenptr = 0;
812 return PCRE2_ERROR_CONVERT_SYNTAX;
813 }
814#endif
815
816with_escape = strchr(pcre2_escaped_literals, separator) != NULL;
817
818/* Initialize default for error offset as end of input. */
819out.output = use_buffer;
820out.output_end = use_buffer + use_length;
821out.output_size = 0;
822
823out.out_str[0] = CHAR_LEFT_PARENTHESIS;
824out.out_str[1] = CHAR_QUESTION_MARK;
825out.out_str[2] = CHAR_s;
826out.out_str[3] = CHAR_RIGHT_PARENTHESIS;
827convert_glob_write_str(&out, 4);
828
829is_start = TRUE;
830
831if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK)
832 {
833 if (no_wildsep)
834 is_start = FALSE;
835 else if (!no_starstar && pattern + 1 < pattern_end &&
836 pattern[1] == CHAR_ASTERISK)
837 is_start = FALSE;
838 }
839
840if (is_start)
841 {
842 out.out_str[0] = CHAR_BACKSLASH;
843 out.out_str[1] = CHAR_A;
844 convert_glob_write_str(&out, 2);
845 }
846
847while (pattern < pattern_end)
848 {
849 c = *pattern++;
850
851 if (c == CHAR_ASTERISK)
852 {
853 is_start = pattern == pattern_start + 1;
854
855 if (in_atomic)
856 {
857 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
858 in_atomic = FALSE;
859 }
860
861 if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK)
862 {
863 after_separator = is_start || (pattern[-2] == separator);
864
865 do pattern++; while (pattern < pattern_end &&
866 *pattern == CHAR_ASTERISK);
867
868 if (pattern >= pattern_end)
869 {
870 no_slash_z = TRUE;
871 break;
872 }
873
874 after_starstar = TRUE;
875
876 if (after_separator && escape != 0 && *pattern == escape &&
877 pattern + 1 < pattern_end && pattern[1] == separator)
878 pattern++;
879
880 if (is_start)
881 {
882 if (*pattern != separator) continue;
883
884 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
885 out.out_str[1] = CHAR_QUESTION_MARK;
886 out.out_str[2] = CHAR_COLON;
887 out.out_str[3] = CHAR_BACKSLASH;
888 out.out_str[4] = CHAR_A;
889 out.out_str[5] = CHAR_VERTICAL_LINE;
890 convert_glob_write_str(&out, 6);
891
892 convert_glob_print_separator(&out, separator, with_escape);
893 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
894
895 pattern++;
896 continue;
897 }
898
899 convert_glob_print_commit(&out);
900
901 if (!after_separator || *pattern != separator)
902 {
903 out.out_str[0] = CHAR_DOT;
904 out.out_str[1] = CHAR_ASTERISK;
905 out.out_str[2] = CHAR_QUESTION_MARK;
906 convert_glob_write_str(&out, 3);
907 continue;
908 }
909
910 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
911 out.out_str[1] = CHAR_QUESTION_MARK;
912 out.out_str[2] = CHAR_COLON;
913 out.out_str[3] = CHAR_DOT;
914 out.out_str[4] = CHAR_ASTERISK;
915 out.out_str[5] = CHAR_QUESTION_MARK;
916
917 convert_glob_write_str(&out, 6);
918
919 convert_glob_print_separator(&out, separator, with_escape);
920
921 out.out_str[0] = CHAR_RIGHT_PARENTHESIS;
922 out.out_str[1] = CHAR_QUESTION_MARK;
923 out.out_str[2] = CHAR_QUESTION_MARK;
924 convert_glob_write_str(&out, 3);
925
926 pattern++;
927 continue;
928 }
929
930 if (pattern < pattern_end && *pattern == CHAR_ASTERISK)
931 {
932 do pattern++; while (pattern < pattern_end &&
933 *pattern == CHAR_ASTERISK);
934 }
935
936 if (no_wildsep)
937 {
938 if (pattern >= pattern_end)
939 {
940 no_slash_z = TRUE;
941 break;
942 }
943
944 /* Start check must be after the end check. */
945 if (is_start) continue;
946 }
947
948 if (!is_start)
949 {
950 if (after_starstar)
951 {
952 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
953 out.out_str[1] = CHAR_QUESTION_MARK;
954 out.out_str[2] = CHAR_GREATER_THAN_SIGN;
955 convert_glob_write_str(&out, 3);
956 in_atomic = TRUE;
957 }
958 else
959 convert_glob_print_commit(&out);
960 }
961
962 if (no_wildsep)
963 convert_glob_write(&out, CHAR_DOT);
964 else
965 convert_glob_print_wildcard(&out, separator, with_escape);
966
967 out.out_str[0] = CHAR_ASTERISK;
968 out.out_str[1] = CHAR_QUESTION_MARK;
969 if (pattern >= pattern_end)
970 out.out_str[1] = CHAR_PLUS;
971 convert_glob_write_str(&out, 2);
972 continue;
973 }
974
975 if (c == CHAR_QUESTION_MARK)
976 {
977 if (no_wildsep)
978 convert_glob_write(&out, CHAR_DOT);
979 else
980 convert_glob_print_wildcard(&out, separator, with_escape);
981 continue;
982 }
983
984 if (c == CHAR_LEFT_SQUARE_BRACKET)
985 {
986 result = convert_glob_parse_range(&pattern, pattern_end,
987 &out, utf, separator, with_escape, escape, no_wildsep);
988 if (result != 0) break;
989 continue;
990 }
991
992 if (escape != 0 && c == escape)
993 {
994 if (pattern >= pattern_end)
995 {
996 result = PCRE2_ERROR_CONVERT_SYNTAX;
997 break;
998 }
999 c = *pattern++;
1000 }
1001
1002 if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
1003 convert_glob_write(&out, CHAR_BACKSLASH);
1004
1005 convert_glob_write(&out, c);
1006 }
1007
1008if (result == 0)
1009 {
1010 if (!no_slash_z)
1011 {
1012 out.out_str[0] = CHAR_BACKSLASH;
1013 out.out_str[1] = CHAR_z;
1014 convert_glob_write_str(&out, 2);
1015 }
1016
1017 if (in_atomic)
1018 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
1019
1020 convert_glob_write(&out, CHAR_NUL);
1021
1022 if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer))
1023 result = PCRE2_ERROR_NOMEMORY;
1024 }
1025
1026if (result != 0)
1027 {
1028 *bufflenptr = pattern - pattern_start;
1029 return result;
1030 }
1031
1032*bufflenptr = out.output_size - 1;
1033return 0;
1034}
1035
1036
1037/*************************************************
1038* Convert pattern *
1039*************************************************/
1040
1041/* This is the external-facing function for converting other forms of pattern
1042into PCRE2 regular expression patterns. On error, the bufflenptr argument is
1043used to return an offset in the original pattern.
1044
1045Arguments:
1046 pattern the input pattern
1047 plength length of input, or PCRE2_ZERO_TERMINATED
1048 options options bits
1049 buffptr pointer to pointer to output buffer
1050 bufflenptr pointer to length of output buffer
1051 ccontext convert context or NULL
1052
1053Returns: 0 for success, else an error code (+ve or -ve)
1054*/
1055
1056PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
1057pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options,
1058 PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr,
1059 pcre2_convert_context *ccontext)
1060{
1061int i, rc;
1062PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE];
1063PCRE2_UCHAR *use_buffer = dummy_buffer;
1064PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE;
1065BOOL utf = (options & PCRE2_CONVERT_UTF) != 0;
1066uint32_t pattype = options & TYPE_OPTIONS;
1067
1068if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL;
Elliott Hughes653c2102019-01-09 15:41:36 -08001069
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001070if ((options & ~ALL_OPTIONS) != 0 || /* Undefined bit set */
1071 (pattype & (~pattype+1)) != pattype || /* More than one type set */
1072 pattype == 0) /* No type set */
1073 {
Elliott Hughes653c2102019-01-09 15:41:36 -08001074 *bufflenptr = 0; /* Error offset */
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001075 return PCRE2_ERROR_BADOPTION;
1076 }
1077
1078if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern);
1079if (ccontext == NULL) ccontext =
1080 (pcre2_convert_context *)(&PRIV(default_convert_context));
1081
1082/* Check UTF if required. */
1083
1084#ifndef SUPPORT_UNICODE
Elliott Hughes653c2102019-01-09 15:41:36 -08001085if (utf)
1086 {
1087 *bufflenptr = 0; /* Error offset */
1088 return PCRE2_ERROR_UNICODE_NOT_SUPPORTED;
1089 }
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001090#else
1091if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0)
1092 {
1093 PCRE2_SIZE erroroffset;
1094 rc = PRIV(valid_utf)(pattern, plength, &erroroffset);
1095 if (rc != 0)
1096 {
1097 *bufflenptr = erroroffset;
1098 return rc;
1099 }
1100 }
1101#endif
1102
1103/* If buffptr is not NULL, and what it points to is not NULL, we are being
1104provided with a buffer and a length, so set them as the buffer to use. */
1105
1106if (buffptr != NULL && *buffptr != NULL)
1107 {
1108 use_buffer = *buffptr;
1109 use_length = *bufflenptr;
1110 }
1111
1112/* Call an individual converter, either just once (if a buffer was provided or
1113just the length is needed), or twice (if a memory allocation is required). */
1114
1115for (i = 0; i < 2; i++)
1116 {
1117 PCRE2_UCHAR *allocated;
1118 BOOL dummyrun = buffptr == NULL || *buffptr == NULL;
1119
1120 switch(pattype)
1121 {
1122 case PCRE2_CONVERT_GLOB:
1123 rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf,
1124 use_buffer, use_length, bufflenptr, dummyrun, ccontext);
1125 break;
1126
1127 case PCRE2_CONVERT_POSIX_BASIC:
1128 case PCRE2_CONVERT_POSIX_EXTENDED:
1129 rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length,
1130 bufflenptr, dummyrun, ccontext);
1131 break;
1132
1133 default:
Elliott Hughes653c2102019-01-09 15:41:36 -08001134 *bufflenptr = 0; /* Error offset */
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001135 return PCRE2_ERROR_INTERNAL;
1136 }
1137
1138 if (rc != 0 || /* Error */
1139 buffptr == NULL || /* Just the length is required */
1140 *buffptr != NULL) /* Buffer was provided or allocated */
1141 return rc;
1142
1143 /* Allocate memory for the buffer, with hidden space for an allocator at
1144 the start. The next time round the loop runs the conversion for real. */
1145
1146 allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
1147 (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext);
1148 if (allocated == NULL) return PCRE2_ERROR_NOMEMORY;
1149 *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl));
1150
1151 use_buffer = *buffptr;
1152 use_length = *bufflenptr + 1;
1153 }
1154
1155/* Control should never get here. */
1156
1157return PCRE2_ERROR_INTERNAL;
1158}
1159
1160
1161/*************************************************
1162* Free converted pattern *
1163*************************************************/
1164
1165/* This frees a converted pattern that was put in newly-allocated memory.
1166
1167Argument: the converted pattern
1168Returns: nothing
1169*/
1170
1171PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1172pcre2_converted_pattern_free(PCRE2_UCHAR *converted)
1173{
1174if (converted != NULL)
1175 {
1176 pcre2_memctl *memctl =
1177 (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl));
1178 memctl->free(memctl, memctl->memory_data);
1179 }
1180}
1181
1182/* End of pcre2_convert.c */