| /* uwildmat.c is reused from libinn - https://launchpad.net/ubuntu/+source/inn2/2.5.4-1 |
| |
| This provides wild card matching originally used in InterNetNews and is |
| described in https://tools.ietf.org/html/rfc3977#section-4 |
| |
| INN licence: |
| INN as a whole and all code contained in it not otherwise marked with |
| different licenses and/or copyrights is covered by the following copyright |
| and license: |
| |
| Copyright (c) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, |
| 2013, 2014 by Internet Systems Consortium, Inc. ("ISC") |
| Copyright (c) 1991, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, |
| 2002, 2003 by The Internet Software Consortium and Rich Salz |
| |
| This code is derived from software contributed to the Internet Software |
| Consortium by Rich Salz. |
| |
| Permission to use, copy, modify, and distribute this software for any |
| purpose with or without fee is hereby granted, provided that the above |
| copyright notice and this permission notice appear in all copies. |
| |
| THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH |
| REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY |
| SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
| OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| |
| */ |
| |
| /* $Id: uwildmat.c 8918 2010-01-22 23:28:28Z iulius $ |
| ** |
| ** wildmat pattern matching with Unicode UTF-8 extensions. |
| ** |
| ** Do shell-style pattern matching for ?, \, [], and * characters. Might not |
| ** be robust in face of malformed patterns; e.g., "foo[a-" could cause a |
| ** segmentation violation. It is 8-bit clean. (Robustness hopefully fixed |
| ** July 2000; all malformed patterns should now just fail to match anything.) |
| ** |
| ** Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986. |
| ** Rich $alz is now <rsalz@osf.org>. |
| ** |
| ** April, 1991: Replaced mutually-recursive calls with in-line code for the |
| ** star character. |
| ** |
| ** Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code. |
| ** This can greatly speed up failing wildcard patterns. For example: |
| ** |
| ** pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-* |
| ** text 1: -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1 |
| ** text 2: -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1 |
| ** |
| ** Text 1 matches with 51 calls, while text 2 fails with 54 calls. Without |
| ** the ABORT code, it takes 22310 calls to fail. Ugh. The following |
| ** explanation is from Lars: |
| ** |
| ** The precondition that must be fulfilled is that DoMatch will consume at |
| ** least one character in text. This is true if *p is neither '*' nor '\0'.) |
| ** The last return has ABORT instead of false to avoid quadratic behaviour in |
| ** cases like pattern "*a*b*c*d" with text "abcxxxxx". With false, each |
| ** star-loop has to run to the end of the text; with ABORT only the last one |
| ** does. |
| ** |
| ** Once the control of one instance of DoMatch enters the star-loop, that |
| ** instance will return either true or ABORT, and any calling instance will |
| ** therefore return immediately after (without calling recursively again). |
| ** In effect, only one star-loop is ever active. It would be possible to |
| ** modify the code to maintain this context explicitly, eliminating all |
| ** recursive calls at the cost of some complication and loss of clarity (and |
| ** the ABORT stuff seems to be unclear enough by itself). I think it would |
| ** be unwise to try to get this into a released version unless you have a |
| ** good test data base to try it out on. |
| ** |
| ** June, 1991: Robert Elz <kre@munnari.oz.au> added minus and close bracket |
| ** handling for character sets. |
| ** |
| ** July, 2000: Largely rewritten by Russ Allbery <rra@stanford.edu> to add |
| ** support for ',', '!', and optionally '@' to the core wildmat routine. |
| ** Broke the character class matching into a separate function for clarity |
| ** since it's infrequently used in practice, and added some simple lookahead |
| ** to significantly decrease the recursive calls in the '*' matching code. |
| ** Added support for UTF-8 as the default character set for any high-bit |
| ** characters. |
| ** |
| ** For more information on UTF-8, see RFC 3629. |
| ** |
| ** Please note that this file is intentionally written so that conditionally |
| ** executed expressions are on separate lines from the condition to |
| ** facilitate analysis of the coverage of the test suite using purecov. |
| ** Please preserve this. As of March 11, 2001, purecov reports that the |
| ** accompanying test suite achieves 100% coverage of this file. |
| */ |
| |
| #include <ctype.h> |
| #include <string.h> |
| #include <stdint.h> |
| #include "uwildmat/uwildmat.h" |
| |
| #define ABORT -1 |
| |
| /* Whether or not an octet looks like the start of a UTF-8 character. */ |
| #define ISUTF8(c) (((c) & 0xc0) == 0xc0) |
| |
| |
| /* |
| ** Determine the length of a non-ASCII character in octets (for advancing |
| ** pointers when skipping over characters). Takes a pointer to the start of |
| ** the character and to the last octet of the string. If end is NULL, expect |
| ** the string pointed to by start to be nul-terminated. If the character is |
| ** malformed UTF-8, return 1 to treat it like an eight-bit local character. |
| */ |
| static int |
| utf8_length(const unsigned char *start, const unsigned char *end) |
| { |
| unsigned char mask = 0x80; |
| const unsigned char *p; |
| int length = 0; |
| int left; |
| |
| for (; mask > 0 && (*start & mask) == mask; mask >>= 1) |
| length++; |
| if (length < 2 || length > 6) |
| return 1; |
| if (end != NULL && (end - start + 1) < length) |
| return 1; |
| left = length - 1; |
| for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++) |
| left--; |
| return (left == 0) ? length : 1; |
| } |
| |
| |
| /* |
| ** Check whether a string contains only valid UTF-8 characters. |
| */ |
| bool |
| is_valid_utf8(const char *text) |
| { |
| unsigned char mask; |
| const unsigned char *p; |
| int length; |
| int left; |
| |
| for (p = (const unsigned char *)text; *p != '\0';) { |
| mask = 0x80; |
| length = 0; |
| |
| /* Find out the expected length of the character. */ |
| for (; mask > 0 && (*p & mask) == mask; mask >>= 1) |
| length++; |
| |
| p++; |
| |
| /* Valid ASCII. */ |
| if (length == 0) |
| continue; |
| |
| /* Invalid length. */ |
| if (length < 2 || length > 6) |
| return false; |
| |
| /* Check that each byte looks like 10xxxxxx, except for the first. */ |
| left = length - 1; |
| for (; left > 0 && (*p & 0xc0) == 0x80; p++) |
| left--; |
| |
| if (left > 0) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| |
| /* |
| ** Convert a UTF-8 character to UCS-4. Takes a pointer to the start of the |
| ** character and to the last octet of the string, and to a uint32_t into |
| ** which to put the decoded UCS-4 value. If end is NULL, expect the string |
| ** pointed to by start to be nul-terminated. Returns the number of octets in |
| ** the UTF-8 encoding. If the UTF-8 character is malformed, set result to |
| ** the decimal value of the first octet; this is wrong, but it will generally |
| ** cause the rest of the wildmat matching to do the right thing for non-UTF-8 |
| ** input. |
| */ |
| static int |
| utf8_decode(const unsigned char *start, const unsigned char *end, |
| uint32_t *result) |
| { |
| uint32_t value = 0; |
| int length, i; |
| const unsigned char *p = start; |
| unsigned char mask; |
| |
| length = utf8_length(start, end); |
| if (length < 2) { |
| *result = *start; |
| return 1; |
| } |
| mask = (1 << (7 - length)) - 1; |
| value = *p & mask; |
| p++; |
| for (i = length - 1; i > 0; i--) { |
| value = (value << 6) | (*p & 0x3f); |
| p++; |
| } |
| *result = value; |
| return length; |
| } |
| |
| |
| /* |
| ** Match a character class against text, a UCS-4 character. start is a |
| ** pointer to the first character of the character class, end a pointer to |
| ** the last. Returns whether the class matches that character. |
| */ |
| static bool |
| match_class(uint32_t text, const unsigned char *start, |
| const unsigned char *end) |
| { |
| bool reversed, allowrange; |
| const unsigned char *p = start; |
| uint32_t first = 0; |
| uint32_t last; |
| uint32_t lc = tolower(text); |
| |
| /* Check for an inverted character class (starting with ^). If the |
| character matches the character class, we return !reversed; that way, |
| we return true if it's a regular character class and false if it's a |
| reversed one. If the character doesn't match, we return reversed. */ |
| reversed = (*p == '^'); |
| if (reversed) |
| p++; |
| |
| /* Walk through the character class until we reach the end or find a |
| match, handling character ranges as we go. Only permit a range to |
| start when allowrange is true; this allows - to be treated like a |
| normal character as the first character of the class and catches |
| malformed ranges like a-e-n. We treat the character at the beginning |
| of a range as both a regular member of the class and the beginning of |
| the range; this is harmless (although it means that malformed ranges |
| like m-a will match m and nothing else). */ |
| allowrange = false; |
| while (p <= end) { |
| if (allowrange && *p == '-' && p < end) { |
| p++; |
| p += utf8_decode(p, end, &last); |
| if ((text >= first && text <= last) || |
| (lc >= first && lc <= last)) |
| return !reversed; |
| allowrange = false; |
| } else { |
| p += utf8_decode(p, end, &first); |
| if (text == first || lc == first) |
| return !reversed; |
| allowrange = true; |
| } |
| } |
| return reversed; |
| } |
| |
| |
| /* |
| ** Match the text against the pattern between start and end. This is a |
| ** single pattern; a leading ! or @ must already be taken care of, and |
| ** commas must be dealt with outside of this routine. |
| */ |
| static int |
| match_pattern(const unsigned char *text, const unsigned char *start, |
| const unsigned char *end) |
| { |
| const unsigned char *q, *endclass; |
| const unsigned char *p = start; |
| bool ismeta; |
| int matched, width; |
| uint32_t c; |
| unsigned char lc; |
| |
| for (; p <= end; p++) { |
| if (!*text && *p != '*') |
| return ABORT; |
| |
| switch (*p) { |
| case '\\': |
| if (!*++p) |
| return ABORT; |
| /* Fall through. */ |
| |
| default: |
| lc = tolower(*text); |
| if (*text++ != *p && lc != *p) |
| return false; |
| break; |
| |
| case '?': |
| text += ISUTF8(*text) ? utf8_length(text, NULL) : 1; |
| break; |
| |
| case '*': |
| /* Consecutive stars are equivalent to one. Advance pattern to |
| the character after the star. */ |
| for (++p; *p == '*'; p++) |
| ; |
| |
| /* A trailing star will match anything. */ |
| if (p > end) |
| return true; |
| |
| /* Basic algorithm: Recurse at each point where the * could |
| possibly match. If the match succeeds or aborts, return |
| immediately; otherwise, try the next position. |
| |
| Optimization: If the character after the * in the pattern |
| isn't a metacharacter (the common case), then the * has to |
| consume characters at least up to the next occurrence of that |
| character in the text. Scan forward for those points rather |
| than recursing at every possible point to save the extra |
| function call overhead. */ |
| ismeta = (*p == '[' || *p == '?' || *p == '\\'); |
| while (*text) { |
| width = ISUTF8(*text) ? utf8_length(text, NULL) : 1; |
| if (ismeta) { |
| matched = match_pattern(text, p, end); |
| text += width; |
| } else { |
| while (*text && *text != *p) { |
| text += width; |
| width = ISUTF8(*text) ? utf8_length(text, NULL) : 1; |
| } |
| if (!*text) |
| return ABORT; |
| matched = match_pattern(++text, p + 1, end); |
| } |
| if (matched != false) |
| return matched; |
| } |
| return ABORT; |
| |
| case '[': |
| /* Find the end of the character class, making sure not to pick |
| up a close bracket at the beginning of the class. */ |
| p++; |
| q = p + (*p == '^') + 1; |
| if (q > end) |
| return ABORT; |
| endclass = memchr(q, ']', (size_t) (end - q + 1)); |
| if (!endclass) |
| return ABORT; |
| |
| /* Do the heavy lifting in another function for clarity, since |
| character classes are an uncommon case. */ |
| text += utf8_decode(text, NULL, &c); |
| if (!match_class(c, p, endclass - 1)) |
| return false; |
| p = endclass; |
| break; |
| } |
| } |
| |
| return (*text == '\0'); |
| } |
| |
| |
| /* |
| ** Takes text and a wildmat expression; a wildmat expression is a |
| ** comma-separated list of wildmat patterns, optionally preceded by ! to |
| ** invert the sense of the expression. Returns UWILDMAT_MATCH if that |
| ** expression matches the text, UWILDMAT_FAIL otherwise. If allowpoison is |
| ** set, allow @ to introduce a poison expression (the same as !, but if it |
| ** triggers the failed match the routine returns UWILDMAT_POISON instead). |
| */ |
| static enum uwildmat |
| match_expression(const unsigned char *text, const unsigned char *start, |
| bool allowpoison) |
| { |
| const unsigned char *end, *split; |
| const unsigned char *p = start; |
| bool reverse, escaped; |
| bool match = false; |
| bool poison = false; |
| bool poisoned = false; |
| |
| /* Handle the empty expression separately, since otherwise end will be |
| set to an invalid pointer. */ |
| if (!*p) |
| return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL; |
| end = start + strlen((const char *) start) - 1; |
| |
| /* Main match loop. Find each comma that separates patterns, and attempt |
| to match the text with each pattern in order. The last matching |
| pattern determines whether the whole expression matches. */ |
| for (; p <= end + 1; p = split + 1) { |
| if (allowpoison) |
| poison = (*p == '@'); |
| reverse = (*p == '!') || poison; |
| if (reverse) |
| p++; |
| |
| /* Find the first unescaped comma, if any. If there is none, split |
| will be one greater than end and point at the nul at the end of |
| the string. */ |
| for (escaped = false, split = p; split <= end; split++) { |
| if (*split == '[') { |
| split++; |
| if (*split == ']') |
| split++; |
| while (split <= end && *split != ']') |
| split++; |
| } |
| if (*split == ',' && !escaped) |
| break; |
| escaped = (*split == '\\') ? !escaped : false; |
| } |
| |
| /* Optimization: If match == !reverse and poison == poisoned, this |
| pattern can't change the result, so don't do any work. */ |
| if (match == !reverse && poison == poisoned) |
| continue; |
| if (match_pattern(text, p, split - 1) == true) { |
| poisoned = poison; |
| match = !reverse; |
| } |
| } |
| if (poisoned) |
| return UWILDMAT_POISON; |
| return match ? UWILDMAT_MATCH : UWILDMAT_FAIL; |
| } |
| |
| |
| /* |
| ** User-level routine used for wildmats where @ should be treated as a |
| ** regular character. |
| */ |
| bool |
| uwildmat(const char *text, const char *pat) |
| { |
| const unsigned char *utext = (const unsigned char *) text; |
| const unsigned char *upat = (const unsigned char *) pat; |
| |
| if (upat[0] == '*' && upat[1] == '\0') |
| return true; |
| else |
| return (match_expression(utext, upat, false) == UWILDMAT_MATCH); |
| } |
| |
| |
| /* |
| ** User-level routine used for wildmats that support poison matches. |
| */ |
| enum uwildmat |
| uwildmat_poison(const char *text, const char *pat) |
| { |
| const unsigned char *utext = (const unsigned char *) text; |
| const unsigned char *upat = (const unsigned char *) pat; |
| |
| if (upat[0] == '*' && upat[1] == '\0') |
| return UWILDMAT_MATCH; |
| else |
| return match_expression(utext, upat, true); |
| } |
| |
| |
| /* |
| ** User-level routine for simple expressions (neither , nor ! are special). |
| */ |
| bool |
| uwildmat_simple(const char *text, const char *pat) |
| { |
| const unsigned char *utext = (const unsigned char *) text; |
| const unsigned char *upat = (const unsigned char *) pat; |
| size_t length; |
| |
| if (upat[0] == '*' && upat[1] == '\0') |
| return true; |
| else { |
| length = strlen(pat); |
| return (match_pattern(utext, upat, upat + length - 1) == true); |
| } |
| } |