| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 1 | # | 
 | 2 | # ElementTree | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 3 | # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $ | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 4 | # | 
 | 5 | # limited xpath support for element trees | 
 | 6 | # | 
 | 7 | # history: | 
 | 8 | # 2003-05-23 fl   created | 
 | 9 | # 2003-05-28 fl   added support for // etc | 
 | 10 | # 2003-08-27 fl   fixed parsing of periods in element names | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 11 | # 2007-09-10 fl   new selection engine | 
 | 12 | # 2007-09-12 fl   fixed parent selector | 
 | 13 | # 2007-09-13 fl   added iterfind; changed findall to return a list | 
 | 14 | # 2007-11-30 fl   added namespaces support | 
 | 15 | # 2009-10-30 fl   added child element value filter | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 16 | # | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 17 | # Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved. | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 18 | # | 
 | 19 | # fredrik@pythonware.com | 
 | 20 | # http://www.pythonware.com | 
 | 21 | # | 
 | 22 | # -------------------------------------------------------------------- | 
 | 23 | # The ElementTree toolkit is | 
 | 24 | # | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 25 | # Copyright (c) 1999-2009 by Fredrik Lundh | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 26 | # | 
 | 27 | # By obtaining, using, and/or copying this software and/or its | 
 | 28 | # associated documentation, you agree that you have read, understood, | 
 | 29 | # and will comply with the following terms and conditions: | 
 | 30 | # | 
 | 31 | # Permission to use, copy, modify, and distribute this software and | 
 | 32 | # its associated documentation for any purpose and without fee is | 
 | 33 | # hereby granted, provided that the above copyright notice appears in | 
 | 34 | # all copies, and that both that copyright notice and this permission | 
 | 35 | # notice appear in supporting documentation, and that the name of | 
 | 36 | # Secret Labs AB or the author not be used in advertising or publicity | 
 | 37 | # pertaining to distribution of the software without specific, written | 
 | 38 | # prior permission. | 
 | 39 | # | 
 | 40 | # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD | 
 | 41 | # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- | 
 | 42 | # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR | 
 | 43 | # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY | 
 | 44 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | 
 | 45 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS | 
 | 46 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE | 
 | 47 | # OF THIS SOFTWARE. | 
 | 48 | # -------------------------------------------------------------------- | 
 | 49 |  | 
| Fredrik Lundh | 63168a5 | 2005-12-14 22:29:34 +0000 | [diff] [blame] | 50 | # Licensed to PSF under a Contributor Agreement. | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 51 | # See http://www.python.org/psf/license for licensing details. | 
| Fredrik Lundh | 63168a5 | 2005-12-14 22:29:34 +0000 | [diff] [blame] | 52 |  | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 53 | ## | 
 | 54 | # Implementation module for XPath support.  There's usually no reason | 
 | 55 | # to import this module directly; the <b>ElementTree</b> does this for | 
 | 56 | # you, if needed. | 
 | 57 | ## | 
 | 58 |  | 
 | 59 | import re | 
 | 60 |  | 
# Tokenizer for the supported XPath subset.  Every match is a 2-tuple:
#   group 1 - an operator / punctuation token, or a quoted string literal
#             (the surrounding quotes are kept in the token);
#   group 2 - a tag or attribute name, optionally starting with a "{uri}"
#             namespace part.
# A run of whitespace matches with both groups empty.
#
# Raw string literals are used so that the regex escapes (\. \( \[ \{ \s)
# reach the re module untouched instead of being parsed as (invalid)
# Python string escape sequences.
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 72 |  | 
def xpath_tokenizer(pattern, namespaces=None):
    """Yield the (operator, name) tokens of an XPath pattern.

    Tokens come from ``xpath_tokenizer_re``.  A name of the form
    ``prefix:local`` (one that does not already start with ``{``) is
    rewritten to ``{uri}local`` using the ``namespaces`` mapping of
    prefix -> URI.

    This is a generator, so the error below is raised while iterating,
    not when the function is called.

    Raises:
        SyntaxError: a prefixed name is used but ``namespaces`` is empty
            or None, or does not contain the prefix.
    """
    for token in xpath_tokenizer_re.findall(pattern):
        tag = token[1]
        if tag and tag[0] != "{" and ":" in tag:
            prefix, local = tag.split(":", 1)
            if not namespaces:
                raise SyntaxError("prefix %r not found in prefix map" % prefix)
            # Only the mapping lookup is guarded; "from None" keeps the
            # internal KeyError out of the user-visible traceback.
            try:
                uri = namespaces[prefix]
            except KeyError:
                raise SyntaxError(
                    "prefix %r not found in prefix map" % prefix) from None
            yield token[0], "{%s}%s" % (uri, local)
        else:
            yield token
 | 86 |  | 
 | 87 | def get_parent_map(context): | 
 | 88 |     parent_map = context.parent_map | 
 | 89 |     if parent_map is None: | 
 | 90 |         context.parent_map = parent_map = {} | 
 | 91 |         for p in context.root.iter(): | 
 | 92 |             for e in p: | 
 | 93 |                 parent_map[e] = p | 
 | 94 |     return parent_map | 
 | 95 |  | 
 | 96 | def prepare_child(next, token): | 
 | 97 |     tag = token[1] | 
 | 98 |     def select(context, result): | 
 | 99 |         for elem in result: | 
 | 100 |             for e in elem: | 
 | 101 |                 if e.tag == tag: | 
 | 102 |                     yield e | 
 | 103 |     return select | 
 | 104 |  | 
 | 105 | def prepare_star(next, token): | 
 | 106 |     def select(context, result): | 
 | 107 |         for elem in result: | 
| Philip Jenvey | fd0d3e5 | 2012-10-01 15:34:31 -0700 | [diff] [blame] | 108 |             yield from elem | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 109 |     return select | 
 | 110 |  | 
 | 111 | def prepare_self(next, token): | 
 | 112 |     def select(context, result): | 
| Philip Jenvey | fd0d3e5 | 2012-10-01 15:34:31 -0700 | [diff] [blame] | 113 |         yield from result | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 114 |     return select | 
 | 115 |  | 
 | 116 | def prepare_descendant(next, token): | 
 | 117 |     token = next() | 
 | 118 |     if token[0] == "*": | 
 | 119 |         tag = "*" | 
 | 120 |     elif not token[0]: | 
 | 121 |         tag = token[1] | 
 | 122 |     else: | 
 | 123 |         raise SyntaxError("invalid descendant") | 
 | 124 |     def select(context, result): | 
 | 125 |         for elem in result: | 
 | 126 |             for e in elem.iter(tag): | 
 | 127 |                 if e is not elem: | 
 | 128 |                     yield e | 
 | 129 |     return select | 
 | 130 |  | 
 | 131 | def prepare_parent(next, token): | 
 | 132 |     def select(context, result): | 
 | 133 |         # FIXME: raise error if .. is applied at toplevel? | 
 | 134 |         parent_map = get_parent_map(context) | 
 | 135 |         result_map = {} | 
 | 136 |         for elem in result: | 
 | 137 |             if elem in parent_map: | 
 | 138 |                 parent = parent_map[elem] | 
 | 139 |                 if parent not in result_map: | 
 | 140 |                     result_map[parent] = None | 
 | 141 |                     yield parent | 
 | 142 |     return select | 
 | 143 |  | 
 | 144 | def prepare_predicate(next, token): | 
 | 145 |     # FIXME: replace with real parser!!! refs: | 
 | 146 |     # http://effbot.org/zone/simple-iterator-parser.htm | 
 | 147 |     # http://javascript.crockford.com/tdop/tdop.html | 
 | 148 |     signature = [] | 
 | 149 |     predicate = [] | 
 | 150 |     while 1: | 
 | 151 |         token = next() | 
 | 152 |         if token[0] == "]": | 
 | 153 |             break | 
 | 154 |         if token[0] and token[0][:1] in "'\"": | 
 | 155 |             token = "'", token[0][1:-1] | 
 | 156 |         signature.append(token[0] or "-") | 
 | 157 |         predicate.append(token[1]) | 
 | 158 |     signature = "".join(signature) | 
 | 159 |     # use signature to determine predicate type | 
 | 160 |     if signature == "@-": | 
 | 161 |         # [@attribute] predicate | 
 | 162 |         key = predicate[1] | 
 | 163 |         def select(context, result): | 
 | 164 |             for elem in result: | 
 | 165 |                 if elem.get(key) is not None: | 
 | 166 |                     yield elem | 
 | 167 |         return select | 
 | 168 |     if signature == "@-='": | 
 | 169 |         # [@attribute='value'] | 
 | 170 |         key = predicate[1] | 
 | 171 |         value = predicate[-1] | 
 | 172 |         def select(context, result): | 
 | 173 |             for elem in result: | 
 | 174 |                 if elem.get(key) == value: | 
 | 175 |                     yield elem | 
 | 176 |         return select | 
| Eli Bendersky | 5c6198b | 2013-01-24 06:29:26 -0800 | [diff] [blame] | 177 |     if signature == "-" and not re.match("\-?\d+$", predicate[0]): | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 178 |         # [tag] | 
 | 179 |         tag = predicate[0] | 
 | 180 |         def select(context, result): | 
 | 181 |             for elem in result: | 
 | 182 |                 if elem.find(tag) is not None: | 
 | 183 |                     yield elem | 
 | 184 |         return select | 
| Eli Bendersky | 5c6198b | 2013-01-24 06:29:26 -0800 | [diff] [blame] | 185 |     if signature == "-='" and not re.match("\-?\d+$", predicate[0]): | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 186 |         # [tag='value'] | 
 | 187 |         tag = predicate[0] | 
 | 188 |         value = predicate[-1] | 
 | 189 |         def select(context, result): | 
 | 190 |             for elem in result: | 
 | 191 |                 for e in elem.findall(tag): | 
 | 192 |                     if "".join(e.itertext()) == value: | 
 | 193 |                         yield elem | 
 | 194 |                         break | 
 | 195 |         return select | 
 | 196 |     if signature == "-" or signature == "-()" or signature == "-()-": | 
 | 197 |         # [index] or [last()] or [last()-index] | 
 | 198 |         if signature == "-": | 
| Eli Bendersky | 5c6198b | 2013-01-24 06:29:26 -0800 | [diff] [blame] | 199 |             # [index] | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 200 |             index = int(predicate[0]) - 1 | 
| Eli Bendersky | 5c6198b | 2013-01-24 06:29:26 -0800 | [diff] [blame] | 201 |             if index < 0: | 
 | 202 |                 raise SyntaxError("XPath position >= 1 expected") | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 203 |         else: | 
 | 204 |             if predicate[0] != "last": | 
 | 205 |                 raise SyntaxError("unsupported function") | 
 | 206 |             if signature == "-()-": | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 207 |                 try: | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 208 |                     index = int(predicate[2]) - 1 | 
 | 209 |                 except ValueError: | 
 | 210 |                     raise SyntaxError("unsupported expression") | 
| Eli Bendersky | 5c6198b | 2013-01-24 06:29:26 -0800 | [diff] [blame] | 211 |                 if index > -2: | 
 | 212 |                     raise SyntaxError("XPath offset from last() must be negative") | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 213 |             else: | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 214 |                 index = -1 | 
 | 215 |         def select(context, result): | 
 | 216 |             parent_map = get_parent_map(context) | 
 | 217 |             for elem in result: | 
 | 218 |                 try: | 
 | 219 |                     parent = parent_map[elem] | 
 | 220 |                     # FIXME: what if the selector is "*" ? | 
 | 221 |                     elems = list(parent.findall(elem.tag)) | 
 | 222 |                     if elems[index] is elem: | 
 | 223 |                         yield elem | 
 | 224 |                 except (IndexError, KeyError): | 
 | 225 |                     pass | 
 | 226 |         return select | 
 | 227 |     raise SyntaxError("invalid predicate") | 
 | 228 |  | 
 | 229 | ops = { | 
 | 230 |     "": prepare_child, | 
 | 231 |     "*": prepare_star, | 
 | 232 |     ".": prepare_self, | 
 | 233 |     "..": prepare_parent, | 
 | 234 |     "//": prepare_descendant, | 
 | 235 |     "[": prepare_predicate, | 
 | 236 |     } | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 237 |  | 
 | 238 | _cache = {} | 
 | 239 |  | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 240 | class _SelectorContext: | 
 | 241 |     parent_map = None | 
 | 242 |     def __init__(self, root): | 
 | 243 |         self.root = root | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 244 |  | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 245 | # -------------------------------------------------------------------- | 
 | 246 |  | 
 | 247 | ## | 
 | 248 | # Generate all matching objects. | 
 | 249 |  | 
 | 250 | def iterfind(elem, path, namespaces=None): | 
 | 251 |     # compile selector pattern | 
| Eli Bendersky | 2acc525 | 2013-08-03 17:47:47 -0700 | [diff] [blame] | 252 |     cache_key = (path, None if namespaces is None | 
 | 253 |                             else tuple(sorted(namespaces.items()))) | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 254 |     if path[-1:] == "/": | 
 | 255 |         path = path + "*" # implicit all (FIXME: keep this?) | 
 | 256 |     try: | 
| Eli Bendersky | 2acc525 | 2013-08-03 17:47:47 -0700 | [diff] [blame] | 257 |         selector = _cache[cache_key] | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 258 |     except KeyError: | 
 | 259 |         if len(_cache) > 100: | 
 | 260 |             _cache.clear() | 
 | 261 |         if path[:1] == "/": | 
 | 262 |             raise SyntaxError("cannot use absolute path on element") | 
 | 263 |         next = iter(xpath_tokenizer(path, namespaces)).__next__ | 
 | 264 |         token = next() | 
 | 265 |         selector = [] | 
 | 266 |         while 1: | 
 | 267 |             try: | 
 | 268 |                 selector.append(ops[token[0]](next, token)) | 
 | 269 |             except StopIteration: | 
 | 270 |                 raise SyntaxError("invalid path") | 
 | 271 |             try: | 
 | 272 |                 token = next() | 
 | 273 |                 if token[0] == "/": | 
 | 274 |                     token = next() | 
 | 275 |             except StopIteration: | 
 | 276 |                 break | 
| Eli Bendersky | 2acc525 | 2013-08-03 17:47:47 -0700 | [diff] [blame] | 277 |         _cache[cache_key] = selector | 
| Florent Xicluna | f15351d | 2010-03-13 23:24:31 +0000 | [diff] [blame] | 278 |     # execute selector pattern | 
 | 279 |     result = [elem] | 
 | 280 |     context = _SelectorContext(elem) | 
 | 281 |     for select in selector: | 
 | 282 |         result = select(context, result) | 
 | 283 |     return result | 
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 284 |  | 
 | 285 | ## | 
 | 286 | # Find first matching object. | 
 | 287 |  | 
def find(elem, path, namespaces=None):
    """Return the first element produced by iterfind(), or None if there is none."""
    try:
        first = next(iterfind(elem, path, namespaces))
    except StopIteration:
        first = None
    return first
| Armin Rigo | 9ed7306 | 2005-12-14 18:10:45 +0000 | [diff] [blame] | 293 |  | 
 | 294 | ## | 
 | 295 | # Find all matching objects. | 
 | 296 |  | 
def findall(elem, path, namespaces=None):
    """Return a list of everything iterfind() produces for ``path``."""
    matches = iterfind(elem, path, namespaces)
    return list(matches)
 | 299 |  | 
 | 300 | ## | 
 | 301 | # Find text for first matching object. | 
 | 302 |  | 
 | 303 | def findtext(elem, path, default=None, namespaces=None): | 
 | 304 |     try: | 
 | 305 |         elem = next(iterfind(elem, path, namespaces)) | 
 | 306 |         return elem.text or "" | 
 | 307 |     except StopIteration: | 
 | 308 |         return default |