blob: 16f46d6e48754f2a5d0c7a070dbe4d234fb6d723 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
3See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4UC Irvine, June 1995.
5"""
6
Facundo Batista2ac5de22008-07-07 18:24:11 +00007import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +00008import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +00009
# Names exported by a star-import of this module.  urlencode() is a public
# function defined in this file, so it is listed with the others.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000014
# Scheme classification tables.  The empty string '' stands for "no scheme
# given" and marks behaviour that applies by default.

# Schemes for which urljoin() resolves a URL relative to its base.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs may carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs',
]

# Schemes classified as non-hierarchical.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() empties its cache once it holds this many entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
42
def clear_cache():
    """Empty the parsed-URL cache and the cache of quoter objects."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000047
48
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property reads ``self.netloc`` and picks one piece out of the
    ``[user[:password]@]host[:port]`` layout.  The userinfo part is
    everything before the *last* '@'.  A host written as a bracketed
    literal (``[::1]:8080``) is recognised so that the colons inside the
    brackets are not mistaken for the port separator.
    """

    @property
    def username(self):
        """Return the user name, or None when the netloc has no '@'."""
        userinfo, have_info, _ = self.netloc.rpartition("@")
        if not have_info:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Return the password, or None when no 'user:password@' is present."""
        userinfo, have_info, _ = self.netloc.rpartition("@")
        if have_info and ":" in userinfo:
            return userinfo.partition(":")[2]
        return None

    @property
    def hostname(self):
        """Return the lower-cased host name, or None when it is empty."""
        hostport = self.netloc.rpartition("@")[2]
        if hostport.startswith("["):
            # Bracketed literal: the host is whatever sits inside [ ... ].
            host = hostport[1:].partition("]")[0]
        else:
            host = hostport.partition(":")[0]
        return host.lower() or None

    @property
    def port(self):
        """Return the port as an int, or None when absent or empty.

        Raises ValueError when the port text is not a decimal number.
        """
        hostport = self.netloc.rpartition("@")[2]
        if hostport.startswith("["):
            # Only text after the closing bracket can hold ':port'.
            tail = hostport.partition("]")[2]
            port = tail[1:] if tail.startswith(":") else ""
        else:
            port = hostport.partition(":")[2]
        if port:
            return int(port, 10)
        # "host:" has a separator but no digits; treat as "no port".
        return None
89
90from collections import namedtuple
91
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """5-tuple produced by urlsplit(), with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL string."""
        return urlunsplit(self)
98
99
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """6-tuple produced by urlparse(), with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL string."""
        return urlunparse(self)
106
107
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The URL is first split with urlsplit(); the path is then split into
    path and params when the scheme is listed in uses_params and the path
    contains a ';'.  Returns a ParseResult
    (scheme, netloc, path, params, query, fragment).  Components are not
    broken up further and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
121
def _splitparams(url):
    """Split 'path;params' into (path, params).

    When the path contains a '/', only a ';' at or after the last '/' is
    used; if there is none, the path is returned whole with empty params.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
130
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for url, with the netloc beginning at start.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    start, or at the end of the string when none of them occurs.
    """
    found = [url.find(ch, start) for ch in '/?#']
    end = min([pos for pos in found if pos >= 0] + [len(url)])
    return url[start:end], url[end:]
138
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult (scheme, netloc, path, query, fragment).  The
    scheme argument is the default used when the URL does not specify
    one.  Components are not broken up further and % escapes are not
    expanded.  Results are memoised in _parse_cache.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: drop everything once the limit is hit.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Fast path for the common case; skips the scheme-table lookups.
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result
        # Text before ':' is a scheme only if every character is legal.
        if all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
181
def urlunparse(components):
    """Combine a 6-tuple as returned by urlparse() back into a URL.

    Non-empty params are re-attached to the path with ';' and the rest is
    delegated to urlunsplit().  The result may differ slightly from the
    URL originally parsed, e.g. when that URL had redundant delimiters.
    """
    scheme, netloc, path, params, query, fragment = components
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
191
def urlunsplit(components):
    """Combine a 5-tuple as returned by urlsplit() back into a URL string.

    Empty scheme, query and fragment components are omitted together with
    their delimiters.
    """
    scheme, netloc, url, query, fragment = components
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        # A relative path must become absolute once a netloc precedes it.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
204
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  The
    url is also returned unchanged when its scheme differs from the
    base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: inherit the base path, then params, then query.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Relative path: replace the last segment of the base path with the
    # reference path, then normalise '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    while True:
        # Collapse the first "<name>/.." pair found, then rescan.
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and segments[idx - 1] not in ('', '..'):
                del segments[idx - 1:idx + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
261
def urldefrag(url):
    """Remove any fragment from url.

    Returns a tuple (defragmented_url, fragment); the fragment is the
    empty string when url contains no '#'.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
275
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    Replace each %xx escape by the byte it encodes and return bytes.
    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    out = [pieces[0]]
    for item in pieces[1:]:
        pair = item[:2]
        # Validate the digits explicitly: int() alone would also accept
        # one-digit, signed or space-padded text such as '1', '+1' or ' 1'.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            out.append(bytes([int(pair, 16)]) + item[2:])
        else:
            out.append(b'%' + item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000291
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  Passing None for either selects its default.
    By default, percent-encoded sequences are decoded with UTF-8, and
    invalid sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent escapes.  They are decoded
    # together because one character may span several escapes.
    pending = bytearray()
    for item in pieces[1:]:
        pair = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips whitespace
        # and would silently swallow a '%' followed by spaces.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pending.append(int(pair, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the run of escapes: flush it first.
            if pending:
                out.append(bytes(pending).decode(encoding, errors))
                pending = bytearray()
            out.append(rest)
    if pending:
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
332
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, blank values are kept as empty
        strings; when false (the default) fields with blank values are
        dropped as if they were not present.

    strict_parsing: when true, a malformed field raises ValueError;
        when false (the default) such fields are silently ignored.

    Values for a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
358
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, blank values are kept as empty
        strings; when false (the default) fields with blank values are
        dropped as if they were not present.

    strict_parsing: when true, a field without '=' raises ValueError;
        when false (the default) such fields are silently ignored.

    Fields are separated by '&' or ';'.  In names and values '+' becomes
    a space and % escapes are expanded with unquote().
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not (field or strict_parsing):
                continue
            name, sep, value = field.partition('=')
            if not sep:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign counts as a blank value.
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((unquote(name.replace('+', ' ')),
                               unquote(value.replace('+', ' '))))
    return result
398
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
407
# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Quoter instances keyed by their 'safe' bytes; see quote_from_bytes().
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from byte values (in range(0, 256)) to strings.

    A byte maps to itself as a character when it is in the safe set (the
    always-safe characters plus the ASCII bytes given as safe), and to
    its '%XX' escape otherwise.
    """
    # Computed entries are stored in the dict itself, so only the first
    # lookup of each byte value goes through __missing__.
    def __init__(self, safe):
        """safe: bytes object."""
        extra = (byte for byte in safe if byte < 128)
        self.safe = _ALWAYS_SAFE.union(extra)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % (dict(self),)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it and return it.
        if b in self.safe:
            res = chr(b)
        else:
            res = '%%%02X' % b
        self[b] = res
        return res
435
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL, so '/' is in the default safe set and is not
    encoded.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is not a str; doing so raises
    TypeError.

    For a str, encoding and errors say how to turn it into bytes, as
    accepted by the str.encode method.  By default, encoding='utf-8' and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    else:
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    return quote_from_bytes(string, safe)
477
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe defaults to ''.
    """
    # string may be a str or a bytes object.  When it holds no space, the
    # plain quote() result is already the right answer.
    if isinstance(string, str):
        spaceless = ' ' not in string
    elif isinstance(string, bytes):
        spaceless = b' ' not in string
    else:
        spaceless = False
    if spaceless:
        return quote(string, safe, encoding, errors)
    # Let spaces pass through quote() untouched, then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000494
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    Raises TypeError when bs is neither bytes nor bytearray.

    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        # First use of this safe set: build its quoter and remember it.
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join(map(quoter.__getitem__, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000512
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values that are bytes objects are quoted as they are; every
    other object is converted with str() before being quoted.

    Raises TypeError when query is neither a mapping nor a sequence whose
    first element is a tuple.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def encode(obj):
        # bytes go to quote_plus() untouched; str() on them would yield
        # their repr ("b'...'") instead of their content.
        if isinstance(obj, bytes):
            return quote_plus(obj)
        return quote_plus(str(obj))

    pairs = []
    for k, v in query:
        k = encode(k)
        if not doseq:
            pairs.append(k + '=' + encode(v))
        elif isinstance(v, str):
            pairs.append(k + '=' + quote_plus(v))
        elif isinstance(v, bytes):
            # A bytes value is one value, not a sequence of integers.
            pairs.append(k + '=' + quote_plus(v))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + encode(v))
            else:
                # loop over the sequence
                for elt in v:
                    pairs.append(k + '=' + encode(elt))
    return '&'.join(pairs)
568
569# Utilities to parse URLs (most of these return None for missing parts):
570# unwrap('<URL:type://host/path>') --> 'type://host/path'
571# splittype('type:opaquestring') --> 'type', 'opaquestring'
572# splithost('//host[:port]/path') --> 'host[:port]', '/path'
573# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
574# splitpasswd('user:passwd') -> 'user', 'passwd'
575# splitport('host:port') --> 'host', 'port'
576# splitquery('/path?query') --> '/path', 'query'
577# splittag('/path#tag') --> '/path', 'tag'
578# splitattr('/path;attr1=value1;attr2=value2;...') ->
579# '/path', ['attr1=value1', 'attr2=value2', ...]
580# splitvalue('attr=value') --> 'attr', 'value'
581# urllib.parse.unquote('abc%20def') -> 'abc def'
582# quote('abc def') -> 'abc%20def')
583
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Check that a str URL is pure ASCII and return it as a str; any other
    object is returned unchanged.  Raises UnicodeError for a str holding
    non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
596
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, then an enclosing '<...>' pair, then a
    leading 'URL:' prefix, trimming whitespace again after each step.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
604
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  Returns (None, url) when url
    does not start with a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match starts at 0 and ends just past the ':'.
    return found.group(1).lower(), url[found.end():]
618
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
630
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@' and both parts are passed through
    unquote().  Returns (None, host) when host contains no '@'.  The
    result is always a 2-tuple.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        user, hostport = match.group(1, 2)
        # Build a real tuple; map() would hand back a one-shot iterator
        # that cannot be indexed or unpacked twice.
        return unquote(user), unquote(hostport)
    return None, host
642
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when user
    contains no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
654
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by digits.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
667
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Returns (host, defport) when host contains no ':'; defport defaults
    to -1.  Otherwise the text after the last ':' is the port: it is
    returned as an int when it is a valid number, and as None when it is
    empty or not a number.
    """
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    hostname, digits = found.group(1, 2)
    nport = None
    if digits:
        try:
            nport = int(digits)
        except ValueError:
            pass
    return hostname, nport
689
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept in the module global.
        import re
        # Raw string: '\?' in a plain literal is an invalid escape sequence.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
701
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
713
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
719
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
731
# Built-in input for test(): the first non-blank line is the base URL; each
# following line is "<relative> = <expected result of joining with base>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""

def test():
    """Parse and join a list of URLs, printing each result.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base for all
    later joins.  A line of the form "<url> = <expected>" is checked
    against the join result and a mismatch is reported on stdout.
    """
    base = ''
    opened = None  # set only for a file this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Close only what was opened here; never sys.stdin.
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()