| Jeremy Hylton | 1afc169 | 2008-06-18 20:49:58 +0000 | [diff] [blame] | 1 | """Parse (absolute and relative) URLs. | 
|  | 2 |  | 
|  | 3 | See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, | 
|  | 4 | UC Irvine, June 1995. | 
|  | 5 | """ | 
|  | 6 |  | 
# Names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]

# A classification of schemes ('' means apply by default)
# urljoin() only resolves a relative reference when the scheme is listed here.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Schemes for which urlsplit() extracts a '//netloc' part; also consulted by
# urlunsplit() and urljoin().
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp']
# NOTE(review): not referenced anywhere else in this file.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes for which urlsplit() splits off a '?query' part.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes for which urlsplit() splits off a '#fragment' part.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# urlsplit() only accepts the text before the first ':' as a scheme when
# every character of it appears in this string.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
|  | 34 |  | 
# urlsplit() empties the whole cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) keys to the
# SplitResult that urlsplit() computed for them.
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    # Empties the dict in place, so existing references to it stay valid.
    _parse_cache.clear()
|  | 41 |  | 
|  | 42 |  | 
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must provide a ``netloc`` string attribute of the form
    ``[user[:password]@]host[:port]``; every property below is derived
    from it on each access.
    """

    def _hostinfo(self):
        """Return ``(host, port_text)`` taken from ``netloc``.

        ``port_text`` is None when there is no ':' port separator and the
        empty string when the separator is present but nothing follows it.
        """
        # Everything after the last '@' (or the whole netloc without one).
        hostinfo = self.netloc.rpartition("@")[2]
        if hostinfo.startswith("[") and "]" in hostinfo:
            # Bracketed IPv6 literal: the colons inside the brackets belong
            # to the address, only a ':' after ']' introduces the port.
            host, _, rest = hostinfo[1:].partition("]")
            _, sep, port = rest.partition(":")
        else:
            host, sep, port = hostinfo.partition(":")
        return host, (port if sep else None)

    @property
    def username(self):
        """User name from the userinfo part, or None without an '@'."""
        userinfo, at, _ = self.netloc.rpartition("@")
        if not at:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password from the userinfo part, or None when absent."""
        userinfo, at, _ = self.netloc.rpartition("@")
        if not at:
            return None
        _, colon, secret = userinfo.partition(":")
        return secret if colon else None

    @property
    def hostname(self):
        """Lower-cased host name (IPv6 brackets removed), or None if empty."""
        host, _ = self._hostinfo()
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when missing or empty.

        Raises:
            ValueError: if the port text is not a decimal number.
        """
        _, port = self._hostinfo()
        if port:
            return int(port, 10)
        # No separator at all, or a bare trailing ':' with no digits.
        return None
|  | 83 |  | 
|  | 84 | from collections import namedtuple | 
|  | 85 |  | 
class SplitResult(
        namedtuple('SplitResult', 'scheme netloc path query fragment'),
        ResultMixin):
    """5-field result of urlsplit(), with the ResultMixin accessors."""

    # No per-instance __dict__: keep the object as light as a plain tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
|  | 92 |  | 
|  | 93 |  | 
class ParseResult(
        namedtuple('ParseResult', 'scheme netloc path params query fragment'),
        ResultMixin):
    """6-field result of urlparse(), with the ResultMixin accessors."""

    # No per-instance __dict__: keep the object as light as a plain tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
|  | 100 |  | 
|  | 101 |  | 
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult (scheme, netloc, path, params, query, fragment).
    The netloc is left as one string and % escapes are not expanded.
    The work is delegated to urlsplit(); the only addition is the
    ';params' part, split off for schemes listed in uses_params."""
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
|  | 115 |  | 
def _splitparams(url):
    """Split ``url`` into (path, params) at a ';'.

    When the path contains a '/', only a ';' at or after the last '/' is
    used, and the path is returned whole with empty params if there is
    none.  Without a '/', the split happens at the first ';'.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
|  | 124 |  | 
def _splitnetloc(url, start=0):
    """Return (netloc, rest) where netloc begins at index ``start``.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    ``start``, or at the end of the string when none is present.  ``rest``
    begins with that delimiter.
    """
    positions = [url.find(delim, start) for delim in '/?#']
    found = [pos for pos in positions if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
|  | 132 |  | 
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult (scheme, netloc, path, query, fragment).
    The netloc is left as one string and % escapes are not expanded.
    ``scheme`` is the default used when the URL does not carry one.
    Results are memoised in the module-level parse cache."""
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A literal lower-case 'http' prefix is the common case: it skips both
    # the character check and the scheme-table lookups below.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        if is_http or all(ch in scheme_chars for ch in prefix):
            scheme = prefix.lower()
            url = url[colon + 1:]

    if url[:2] == '//' and (is_http or scheme in uses_netloc):
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and '#' in url and (
            is_http or scheme in uses_fragment):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
|  | 175 |  | 
def urlunparse(components):
    """Combine a 6-item (scheme, netloc, path, params, query, fragment)
    sequence back into a URL string.

    The result may differ slightly from the URL that was originally
    parsed while still being equivalent, e.g. when the original had a
    redundant delimiter such as a '?' followed by an empty query."""
    scheme, netloc, path, params, query, fragment = components
    path_and_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_and_params, query, fragment))
|  | 185 |  | 
def urlunsplit(components):
    """Combine a 5-item (scheme, netloc, path, query, fragment) sequence
    into a URL string; empty query and fragment parts are left out."""
    scheme, netloc, path, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and path[:2] != '//'):
        # An authority part is written, so a non-empty path must be
        # separated from it by a slash.
        if path and not path.startswith('/'):
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
|  | 198 |  | 
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # With either argument empty there is nothing to combine.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default for a reference that has none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # references, means the reference is returned untouched.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: only the scheme is shared.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not (path or params or query):
        # Fragment-only (or empty) reference: keep the base's path, params
        # and query, but take the fragment from the reference.
        return urlunparse((scheme, netloc, bpath,
                           bparams, bquery, fragment))
    # Drop the last segment of the base path and append the reference's
    # segments to what is left.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment so the trailing '/' survives.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete the first '<name>/..' pair found, restarting the
    # scan after each deletion.  The first and last segments are never
    # examined as the '..' here (i runs from 1 to len - 2).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' left in last position is handled separately: at the root it
    # is simply blanked, otherwise it swallows the segment before it.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
|  | 246 |  | 
def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns a 2-tuple (url_without_fragment, fragment).  A URL with no
    '#' in it is returned unchanged, paired with the empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
|  | 260 |  | 
|  | 261 |  | 
# Lookup table from every two-digit hex string, in any mix of upper and
# lower case ('7e', '7E', 'aB', ...), to the character with that code.
_hextochr = {hi + lo: chr(int(hi + lo, 16))
             for hi in '0123456789ABCDEFabcdef'
             for lo in '0123456789ABCDEFabcdef'}

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%XX' escape, with XX two hex digits in either case, is replaced
    by the character whose code is XX.  A '%' that is not followed by two
    hex digits is left in the output untouched.
    """
    head, *escapes = s.split('%')
    pieces = [head]
    for item in escapes:
        try:
            pieces.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back and keep the text.
            pieces.append('%' + item)
    return "".join(pieces)
|  | 277 |  | 
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is first turned into a space.
    """
    return unquote(s.replace('+', ' '))
|  | 282 |  | 
# Characters that are never percent-encoded, whatever 'safe' set is given.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
# Quoter instances built by quote(), keyed by (safe, always_safe).
_safe_quoters= {}

class Quoter:
    """Callable mapping one character to its percent-encoded form.

    Characters in ``safe`` or in ``always_safe`` are returned unchanged.
    Results for code points below 256 are memoised in ``self.cache``.
    """

    def __init__(self, safe):
        self.cache = {}
        self.safe = safe + always_safe

    def __call__(self, c):
        """Return ``c`` unchanged if it is safe, else its '%XX' encoding."""
        try:
            return self.cache[c]
        except KeyError:
            pass
        if ord(c) < 256:
            # Single byte: encode the code point itself.
            res = c if c in self.safe else '%%%02X' % ord(c)
            self.cache[c] = res
            return res
        # Wider characters are emitted as their UTF-8 bytes; these results
        # are deliberately left out of the cache, as before.
        return "".join('%%%02X' % byte for byte in c.encode("utf-8"))
|  | 303 |  | 
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Percent-encode every character of ``s`` that is neither in ``safe``
    nor in ``always_safe``.

    Which characters are reserved depends on the part of the URL being
    quoted.  RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax
    lists the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    The default ``safe`` value targets the path section of a URL, so '/'
    is not encoded: in a path the existing slashes normally act as
    separators and have to be kept as they are.
    """
    key = (safe, always_safe)
    quoter = _safe_quoters.get(key)
    if quoter is None:
        # First use of this safe set: build its Quoter and remember it.
        quoter = _safe_quoters[key] = Quoter(safe)
    return ''.join(quoter(ch) for ch in s)
|  | 333 |  | 
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Keep spaces literal while quoting, then swap them for '+'.
    return quote(s, safe + ' ').replace(' ', '+')
|  | 340 |  | 
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values are converted with str() and quoted with quote_plus().

    Raises:
        TypeError: if ``query`` is neither a mapping nor a sequence whose
            first item is a tuple (for instance a non-empty string).
    """
    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so they have
        # to be rejected explicitly.
        try:
            # Non-sequences fail on len(); non-empty strings fail the
            # tuple check.  Zero-length sequences of any type pass, which
            # matches the long-standing acceptance of empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            # Report one uniform message but keep the original traceback.
            raise TypeError("not a valid non-string sequence or mapping object").with_traceback(err.__traceback__)

    pairs = []
    for key, value in query:
        key = quote_plus(str(key))
        if not doseq:
            # preserve old behavior: every value is a single string
            items = [str(value)]
        elif isinstance(value, str):
            items = [value]
        else:
            try:
                # is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                items = [str(value)]
            else:
                # one key=value pair per element of the sequence
                items = [str(elt) for elt in value]
        pairs.extend(key + '=' + quote_plus(item) for item in items)
    return '&'.join(pairs)
|  | 403 |  | 
|  | 404 | # Utilities to parse URLs (most of these return None for missing parts): | 
|  | 405 | # unwrap('<URL:type://host/path>') --> 'type://host/path' | 
|  | 406 | # splittype('type:opaquestring') --> 'type', 'opaquestring' | 
|  | 407 | # splithost('//host[:port]/path') --> 'host[:port]', '/path' | 
|  | 408 | # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' | 
|  | 409 | # splitpasswd('user:passwd') -> 'user', 'passwd' | 
|  | 410 | # splitport('host:port') --> 'host', 'port' | 
|  | 411 | # splitquery('/path?query') --> '/path', 'query' | 
|  | 412 | # splittag('/path#tag') --> '/path', 'tag' | 
|  | 413 | # splitattr('/path;attr1=value1;attr2=value2;...') -> | 
|  | 414 | #   '/path', ['attr1=value1', 'attr2=value2', ...] | 
|  | 415 | # splitvalue('attr=value') --> 'attr', 'value' | 
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
|  | 418 |  | 
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    A str argument is checked to be pure ASCII and returned as a str;
    any other object is handed back unchanged.

    Raises:
        UnicodeError: if a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of toBytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
|  | 431 |  | 
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, an enclosing '<...>' pair and a leading
    'URL:' marker, in that order.  The argument is passed through str().
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
|  | 439 |  | 
# Compiled lazily on the first splittype() call.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  When the string does not start
    with a 'type:' prefix the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # The match covers the type and its ':'; the rest starts right after.
    return found.group(1).lower(), url[found.end():]
|  | 453 |  | 
# Compiled lazily on the first splithost() call.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when the string does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
|  | 465 |  | 
# Compiled lazily on the first splituser() call.
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@' and both parts are passed through
    unquote().  Without an '@' the result is (None, host).  The return
    value is always a 2-tuple.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match is None:
        return None, host
    # Build a real tuple: map() alone would hand back a one-shot iterator
    # that cannot be indexed, measured or compared like the other branch.
    return tuple(unquote(part) for part in match.group(1, 2))
|  | 477 |  | 
# Compiled lazily on the first splitpasswd() call.
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; without one the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
|  | 489 |  | 
# splitport('host:port') --> 'host', 'port'
# Compiled lazily on the first splitport() call.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When the text after the
    last ':' is not all digits (or there is no ':'), the result is
    (host, None) with the host left untouched.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
|  | 502 |  | 
# Compiled lazily on the first splitnport() call.
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Returns (host, defport) when no ':' is found; defport defaults to -1.
    Returns (host, port) with an int port when a valid number follows
    the last ':'.
    Returns (host, None) when there is a ':' but no valid number after it."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    hostname, port_text = found.group(1, 2)
    if not port_text:
        # A bare trailing ':' carries no digits at all.
        return hostname, None
    try:
        return hostname, int(port_text)
    except ValueError:
        return hostname, None
|  | 524 |  | 
# Compiled lazily on the first splitquery() call.
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; without one the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' in a plain literal is an invalid escape sequence.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
|  | 536 |  | 
# Compiled lazily on the first splittag() call.
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; without one the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
|  | 548 |  | 
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when the string contains no ';'."""
    path, *attrs = url.split(';')
    return path, attrs
|  | 554 |  | 
# Compiled lazily on the first splitvalue() call.
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; without one the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
|  | 566 |  | 
# Built-in sample data for test().  The first non-blank line is the base
# URL.  Each following line reads '<reference> = <URL:expected>': test()
# joins the reference against the base and reports a mismatch when the
# wrapped result differs from the expected text.
test_input = """
http://a/b/c/d

g:h        = <URL:g:h>
http:g     = <URL:http://a/b/c/g>
http:      = <URL:http://a/b/c/d>
g          = <URL:http://a/b/c/g>
./g        = <URL:http://a/b/c/g>
g/         = <URL:http://a/b/c/g/>
/g         = <URL:http://a/g>
//g        = <URL:http://g>
?y         = <URL:http://a/b/c/d?y>
g?y        = <URL:http://a/b/c/g?y>
g?y/./x    = <URL:http://a/b/c/g?y/./x>
.          = <URL:http://a/b/c/>
./         = <URL:http://a/b/c/>
..         = <URL:http://a/b/>
../        = <URL:http://a/b/>
../g       = <URL:http://a/b/g>
../..      = <URL:http://a/>
../../g    = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g     = <URL:http://a/b/g>
./g/.      = <URL:http://a/b/c/g/>
/./g       = <URL:http://a/./g>
g/./h      = <URL:http://a/b/c/g/h>
g/../h     = <URL:http://a/b/c/h>
http:g     = <URL:http://a/b/c/g>
http:      = <URL:http://a/b/c/d>
http:?y         = <URL:http://a/b/c/d?y>
http:g?y        = <URL:http://a/b/c/g?y>
http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
|  | 600 |  | 
def test():
    """Run the urljoin self-test and print the results.

    Input lines come from the file named by the first command-line
    argument, from standard input when that argument is '-', or from the
    built-in ``test_input`` sample when no argument is given.  The first
    URL read becomes the base; every later one is parsed, joined against
    it and printed, with an 'EXPECTED' line whenever the input supplies
    an expected value that differs from the result.
    """
    import sys
    from io import StringIO

    args = sys.argv[1:]
    if not args:
        fp = StringIO(test_input)
    elif args[0] == '-':
        fp = sys.stdin
    else:
        fp = open(args[0])
    try:
        base = ''
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # The first URL seen serves as the base for all the others.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Release whatever we opened ourselves; stdin is not ours to close.
        if fp is not sys.stdin:
            fp.close()

if __name__ == '__main__':
    test()