sewardj | d7a02db | 2008-12-12 08:07:49 +0000 | [diff] [blame] | 1 | |
| 2 | /*--------------------------------------------------------------------*/ |
| 3 | /*--- A simple sequence matching facility. ---*/ |
| 4 | /*--- m_seqmatch.c ---*/ |
| 5 | /*--------------------------------------------------------------------*/ |
| 6 | |
| 7 | /* |
| 8 | This file is part of Valgrind, a dynamic binary instrumentation |
| 9 | framework. |
| 10 | |
sewardj | 9eecbbb | 2010-05-03 21:37:12 +0000 | [diff] [blame] | 11 | Copyright (C) 2008-2010 OpenWorks Ltd |
sewardj | d7a02db | 2008-12-12 08:07:49 +0000 | [diff] [blame] | 12 | info@open-works.co.uk |
| 13 | |
| 14 | This program is free software; you can redistribute it and/or |
| 15 | modify it under the terms of the GNU General Public License as |
| 16 | published by the Free Software Foundation; either version 2 of the |
| 17 | License, or (at your option) any later version. |
| 18 | |
| 19 | This program is distributed in the hope that it will be useful, but |
| 20 | WITHOUT ANY WARRANTY; without even the implied warranty of |
| 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 22 | General Public License for more details. |
| 23 | |
| 24 | You should have received a copy of the GNU General Public License |
| 25 | along with this program; if not, write to the Free Software |
| 26 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
| 27 | 02111-1307, USA. |
| 28 | |
| 29 | The GNU General Public License is contained in the file COPYING. |
| 30 | */ |
| 31 | |
| 32 | #include "pub_core_basics.h" |
| 33 | #include "pub_core_libcassert.h" |
| 34 | #include "pub_core_libcbase.h" // VG_(strlen) |
| 35 | #include "pub_core_seqmatch.h" // self |
| 36 | |
| 37 | /* --------------------------------------------------------------------- |
| 38 | A simple sequence matching facility |
| 39 | ------------------------------------------------------------------ */ |
| 40 | |
| 41 | /* See detailed comment in include/pub_tool_seqmatch.h about this. */ |
| 42 | Bool VG_(generic_match) ( |
| 43 | Bool matchAll, |
| 44 | void* patt, SizeT szbPatt, UWord nPatt, UWord ixPatt, |
| 45 | void* input, SizeT szbInput, UWord nInput, UWord ixInput, |
| 46 | Bool (*pIsStar)(void*), |
| 47 | Bool (*pIsQuery)(void*), |
| 48 | Bool (*pattEQinp)(void*,void*) |
| 49 | ) |
| 50 | { |
| 51 | /* This is the spec, written in my favourite formal specification |
| 52 | language. It specifies non-greedy matching of '*'s. |
| 53 | |
| 54 | ma ('*':ps) (i:is) = ma ps (i:is) || ma ('*':ps) is |
| 55 | ma ('*':ps) [] = ma ps [] |
| 56 | |
| 57 | ma ('?':ps) (i:is) = ma ps is |
| 58 | ma ('?':ps) [] = False |
| 59 | |
| 60 | ma (p:ps) (i:is) = p == i && ma ps is |
| 61 | |
| 62 | ma (p:ps) [] = False |
| 63 | ma [] (i:is) = False -- m-all, True for m-prefix |
| 64 | ma [] [] = True |
| 65 | */ |
| 66 | Bool havePatt, haveInput; |
| 67 | void *currPatt, *currInput; |
| 68 | tailcall: |
| 69 | vg_assert(nPatt >= 0 && nPatt < 1000000); /* arbitrary */ |
| 70 | vg_assert(nInput >= 0 && nInput < 1000000); /* arbitrary */ |
| 71 | vg_assert(ixPatt >= 0 && ixPatt <= nPatt); |
| 72 | vg_assert(ixInput >= 0 && ixInput <= nInput); |
| 73 | |
| 74 | havePatt = ixPatt < nPatt; |
| 75 | haveInput = ixInput < nInput; |
| 76 | |
| 77 | /* No specific need to set NULL when !have{Patt,Input}, but guards |
| 78 | against inadvertantly dereferencing an out of range pointer to |
| 79 | the pattern or input arrays. */ |
| 80 | currPatt = havePatt ? ((Char*)patt) + szbPatt * ixPatt : NULL; |
| 81 | currInput = haveInput ? ((Char*)input) + szbInput * ixInput : NULL; |
| 82 | |
| 83 | // Deal with the complex case first: wildcards. Do frugal |
| 84 | // matching. When encountering a '*', first skip no characters |
| 85 | // at all, and see if the rest of the match still works. Only if |
| 86 | // that fails do we then skip a character, and retry at the next |
| 87 | // position. |
| 88 | // |
| 89 | // ma ('*':ps) (i:is) = ma ps (i:is) || ma ('*':ps) is |
| 90 | // |
| 91 | // If we're out of input, check the rest of the pattern matches |
| 92 | // the empty input. This really means can only be be empty or |
| 93 | // composed entirely of '*'s. |
| 94 | // |
| 95 | // ma ('*':ps) [] = ma ps [] |
| 96 | // |
| 97 | if (havePatt && pIsStar(currPatt)) { |
| 98 | if (haveInput) { |
| 99 | // ma ('*':ps) (i:is) = ma ps (i:is) || ma ('*':ps) is |
| 100 | // we unavoidably have to make a real recursive call for the |
| 101 | // first half of the OR, since this isn't straight tail-recursion. |
| 102 | if (VG_(generic_match)( matchAll, |
| 103 | patt, szbPatt, nPatt, ixPatt+1, |
| 104 | input,szbInput,nInput, ixInput+0, |
| 105 | pIsStar,pIsQuery,pattEQinp) ) { |
| 106 | return True; |
| 107 | } |
| 108 | // but we can tail-recurse for the second call |
| 109 | ixInput++; goto tailcall; |
| 110 | } else { |
| 111 | // ma ('*':ps) [] = ma ps [] |
| 112 | ixPatt++; goto tailcall; |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | // simpler cases now. Deal with '?' wildcards. |
| 117 | // |
| 118 | // ma ('?':ps) (i:is) = ma ps is |
| 119 | // ma ('?':ps) [] = False |
| 120 | if (havePatt && pIsQuery(currPatt)) { |
| 121 | if (haveInput) { |
| 122 | ixPatt++; ixInput++; goto tailcall; |
| 123 | } else { |
| 124 | return False; |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | // obvious case with literal chars in the pattern |
| 129 | // |
| 130 | // ma (p:ps) (i:is) = p == i && ma ps is |
| 131 | if (havePatt && haveInput) { |
| 132 | if (!pattEQinp(currPatt,currInput)) return False; |
| 133 | ixPatt++; ixInput++; goto tailcall; |
| 134 | } |
| 135 | |
| 136 | // if we run out of input before we run out of pattern, we must fail |
| 137 | // ma (p:ps) [] = False |
| 138 | if (havePatt && !haveInput) return False; |
| 139 | |
| 140 | // if we run out of pattern before we run out of input, the |
| 141 | // verdict depends on the matching mode. If we are trying to |
| 142 | // match exactly (the pattern must consume the entire input) |
| 143 | // then the outcome is failure. However, if we're merely attempting |
| 144 | // to match some prefix of the input, then we have been successful. |
| 145 | // |
| 146 | // ma [] (i:is) = False -- m-all, True for m-prefix |
| 147 | if (!havePatt && haveInput) { |
| 148 | return matchAll ? False // match-all |
| 149 | : True; // match-prefix |
| 150 | } |
| 151 | |
| 152 | // finally, if both sequence and input are both completely |
| 153 | // consumed, then we were successful, regardless of matching mode. |
| 154 | if (!havePatt && !haveInput) return True; |
| 155 | |
| 156 | // end of cases |
| 157 | vg_assert(0); |
| 158 | } |
| 159 | |
| 160 | |
| 161 | /* And a parameterization of the above, to make it do |
| 162 | string matching. |
| 163 | */ |
| 164 | static Bool charIsStar ( void* pV ) { return *(Char*)pV == '*'; } |
| 165 | static Bool charIsQuery ( void* pV ) { return *(Char*)pV == '?'; } |
| 166 | static Bool char_p_EQ_i ( void* pV, void* cV ) { |
| 167 | Char p = *(Char*)pV; |
| 168 | Char c = *(Char*)cV; |
| 169 | vg_assert(p != '*' && p != '?'); |
| 170 | return p == c; |
| 171 | } |
| 172 | Bool VG_(string_match) ( const Char* patt, const Char* input ) |
| 173 | { |
| 174 | return VG_(generic_match)( |
| 175 | True/* match-all */, |
| 176 | (void*)patt, sizeof(UChar), VG_(strlen)(patt), 0, |
| 177 | (void*)input, sizeof(UChar), VG_(strlen)(input), 0, |
| 178 | charIsStar, charIsQuery, char_p_EQ_i |
| 179 | ); |
| 180 | } |
| 181 | |
| 182 | |
| 183 | // test cases for the matcher (in match-all mode) |
| 184 | // typedef struct { char* patt; char* input; Bool xres; } Test; |
| 185 | // |
| 186 | //static Test tests[] = |
| 187 | // { |
| 188 | // { "" ,"" , True }, |
| 189 | // { "a" ,"" , False }, |
| 190 | // { "a" ,"b" , False }, |
| 191 | // { "a" ,"a" , True }, |
| 192 | // { "a" ,"aa" , False }, |
| 193 | // { "*" ,"" , True }, |
| 194 | // { "**" ,"" , True }, |
| 195 | // { "*" ,"abc", True }, |
| 196 | // { "*a" ,"abc", False }, |
| 197 | // { "*b" ,"abc", False }, |
| 198 | // { "*bc" ,"abc", True }, |
| 199 | // { "a*b" ,"abc", False }, |
| 200 | // { "a*c" ,"abc", True }, |
| 201 | // { "*c" ,"abc", True }, |
| 202 | // { "c*c" ,"abc", False }, |
| 203 | // { "abc*" ,"abc", True }, |
| 204 | // { "abc**" ,"abc", True }, |
| 205 | // { "**abc" ,"abc", True }, |
| 206 | // { "**a*b*c**" ,"abc", True }, |
| 207 | // { "**a*b*d**" ,"abc", False }, |
| 208 | // { "a?b" ,"abc", False }, |
| 209 | // { "a?c" ,"abc", True }, |
| 210 | // { "?" ,"" , False }, |
| 211 | // { "?" ,"a" , True }, |
| 212 | // { "?" ,"ab" , False }, |
| 213 | // { "abcd" ,"abc", False }, |
| 214 | // { "ab" ,"abc", False }, |
| 215 | // { NULL ,NULL , False } |
| 216 | // }; |
| 217 | // |
| 218 | //int main ( void ) |
| 219 | //{ |
| 220 | // Test* t; |
| 221 | // for (t = tests; t->patt; t++) { |
| 222 | // printf("%10s %6s %s\n", |
| 223 | // t->patt, t->input, |
| 224 | // match_string_all((UChar*)t->patt,(UChar*)t->input,True) |
| 225 | // == t->xres |
| 226 | // ? "pass" : "FAIL" ); |
| 227 | // } |
| 228 | // return 0; |
| 229 | //} |
| 230 | |
| 231 | /*--------------------------------------------------------------------*/ |
| 232 | /*--- end m_seqmatch.c ---*/ |
| 233 | /*--------------------------------------------------------------------*/ |