"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

import sys
import collections

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif '[' in netloc or ']' in netloc:
            raise ValueError("Invalid IPv6 hostname")
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            return int(port, 10)
        else:
            return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)
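
# Illustrative sketch only (made-up example URL): SplitResult and ParseResult
# are named tuples, so fields are available by name or index, and the
# ResultMixin properties pull the user, password, host and port apart.
#
#     >>> r = urlsplit('http://user:pw@www.example.com:80/index.html')
#     >>> r.netloc, r.path
#     ('user:pw@www.example.com:80', '/index.html')
#     >>> r.username, r.password, r.hostname, r.port
#     ('user', 'pw', 'www.example.com', 80)
#     >>> r.geturl()
#     'http://user:pw@www.example.com:80/index.html'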


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
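
# A rough usage sketch (the URL below is hypothetical):
#
#     >>> urlparse('http://a/p;x?q=1#f')
#     ParseResult(scheme='http', netloc='a', path='/p', params='x', query='q=1', fragment='f')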

def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    if '[' in url:     # check for invalid IPv6 URL
        if not ']' in url: raise ValueError("Invalid IPv6 URL")
    elif ']' in url:
        if not '[' in url: raise ValueError("Invalid IPv6 URL")
    for c in '/?#':                    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)    # find first of this delim
        if wdelim >= 0:                # if found
            delim = min(delim, wdelim) # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
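
# Illustrative sketch only (example URL is made up): urlsplit() leaves any
# params attached to the path and caches results keyed on its arguments.
#
#     >>> urlsplit('https://host/p;x?q=1#f')
#     SplitResult(scheme='https', netloc='host', path='/p;x', query='q=1', fragment='f')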

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = components
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string."""
    scheme, netloc, url, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
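
# Rough round-trip sketch (illustrative only): unsplitting the result of
# urlsplit() reproduces an equivalent URL, though redundant delimiters such
# as a '?' with an empty query may be dropped.
#
#     >>> urlunsplit(urlsplit('http://host/path?q=1#f'))
#     'http://host/path?q=1#f'
#     >>> urlunsplit(('http', 'host', 'path', '', ''))
#     'http://host/path'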

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
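
# Quick sketch of the intended behaviour (these pairs mirror the test table
# at the bottom of this module):
#
#     >>> urljoin('http://a/b/c/d', 'g')
#     'http://a/b/c/g'
#     >>> urljoin('http://a/b/c/d', '../../g')
#     'http://a/g'
#     >>> urljoin('http://a/b/c/d', '//g')
#     'http://g'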

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
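
# Sketch of expected use (example URL is hypothetical):
#
#     >>> urldefrag('http://host/doc.html#section-2')
#     ('http://host/doc.html', 'section-2')
#     >>> urldefrag('http://host/doc.html')
#     ('http://host/doc.html', '')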

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8.  This is only an issue if the string
    # contains unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = bytes([int(item[:2], 16)]) + item[2:]
        except ValueError:
            res[i] = b'%' + item
    return b''.join(res)

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None: encoding = 'utf-8'
    if errors is None: errors = 'replace'
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    # (list of single-byte bytes objects)
    pct_sequence = []
    res = string.split('%')
    for i in range(1, len(res)):
        item = res[i]
        try:
            if not item: raise ValueError
            pct_sequence.append(bytes.fromhex(item[:2]))
            rest = item[2:]
        except ValueError:
            rest = '%' + item
        if not rest:
            # This segment was just a single percent-encoded character.
            # May be part of a sequence of code units, so delay decoding.
            # (Stored in pct_sequence).
            res[i] = ''
        else:
            # Encountered non-percent-encoded characters.  Flush the current
            # pct_sequence.
            res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
            pct_sequence = []
    if pct_sequence:
        # Flush the final pct_sequence
        # res[-1] will always be empty if pct_sequence != []
        assert not res[-1], "string=%r, res=%r" % (string, res)
        res[-1] = b''.join(pct_sequence).decode(encoding, errors)
    return ''.join(res)
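
# Minimal sketch (assumes the default UTF-8 percent-encoding; the input is
# made up): consecutive escapes are collected and decoded as one sequence.
#
#     >>> unquote('El%20Ni%C3%B1o')
#     'El Niño'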

def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict
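
# Rough illustration (made-up query string); repeated names collect into
# lists, and dictionary display order may vary:
#
#     >>> parse_qs('a=1&a=2&b=3')
#     {'a': ['1', '2'], 'b': ['3']}
#     >>> parse_qs('a=1&b=')
#     {'a': ['1']}
#     >>> parse_qs('a=1&b=', keep_blank_values=True)
#     {'a': ['1'], 'b': ['']}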

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
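
# Brief sketch (example input only): both '&' and ';' separate fields, and
# pairs come back in input order.
#
#     >>> parse_qsl('a=1;a=2&name=El+Ni%C3%B1o')
#     [('a', '1'), ('a', '2'), ('name', 'El Niño')]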

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key is < 128 and
    in the "safe" set (either the specified safe set, or the default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss.  Store quoted string in cache and return.
        res = b in self.safe and chr(b) or ('%%%02X' % b)
        self[b] = res
        return res
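
# Internal helper sketch (illustrative; Quoter is private to this module):
# safe bytes map to themselves, everything else to a %XX escape, and each
# answer is memoised in the defaultdict.
#
#     >>> q = Quoter(b'/')
#     >>> q[ord('/')], q[ord(' ')], q[0xff]
#     ('/', '%20', '%FF')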

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
               "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding must
    not be specified if string is a str.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
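
# Short usage sketch (example strings are arbitrary): '/' stays literal by
# default, and non-ASCII text is UTF-8 encoded before being percent-escaped.
#
#     >>> quote('/El Niño/')
#     '/El%20Ni%C3%B1o/'
#     >>> quote('/El Niño/', safe='')
#     '%2FEl%20Ni%C3%B1o%2F'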

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
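
# Illustrative only (arbitrary form value): spaces become '+', while '=' and
# '&' are percent-escaped because safe defaults to ''.
#
#     >>> quote_plus('key=value & more')
#     'key%3Dvalue+%26+more'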

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    try:
        quoter = _safe_quoters[cachekey]
    except KeyError:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join([quoter[char] for char in bs])

def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or a dictionary into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
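
# Rough sketch (hypothetical form data).  A list of pairs keeps its order;
# with doseq=True each element of a sequence value becomes its own pair.
#
#     >>> urlencode([('a', 1), ('b', 'x y')])
#     'a=1&b=x+y'
#     >>> urlencode([('k', ['v1', 'v2'])], doseq=True)
#     'k=v1&k=v2'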

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
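
# Small sketch of the distinction (made-up host strings): splitport() keeps
# the port as a string, splitnport() converts it and supplies a default.
#
#     >>> splitport('example.com:8080')
#     ('example.com', '8080')
#     >>> splitnport('example.com')
#     ('example.com', -1)
#     >>> splitnport('example.com:http')
#     ('example.com', None)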

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

test_input = """
      http://a/b/c/d

      g:h      = <URL:g:h>
      http:g   = <URL:http://a/b/c/g>
      http:    = <URL:http://a/b/c/d>
      g        = <URL:http://a/b/c/g>
      ./g      = <URL:http://a/b/c/g>
      g/       = <URL:http://a/b/c/g/>
      /g       = <URL:http://a/g>
      //g      = <URL:http://g>
      ?y       = <URL:http://a/b/c/d?y>
      g?y      = <URL:http://a/b/c/g?y>
      g?y/./x  = <URL:http://a/b/c/g?y/./x>
      .        = <URL:http://a/b/c/>
      ./       = <URL:http://a/b/c/>
      ..       = <URL:http://a/b/>
      ../      = <URL:http://a/b/>
      ../g     = <URL:http://a/b/g>
      ../..    = <URL:http://a/>
      ../../g  = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g   = <URL:http://a/b/g>
      ./g/.    = <URL:http://a/b/c/g/>
      /./g     = <URL:http://a/./g>
      g/./h    = <URL:http://a/b/c/g/h>
      g/../h   = <URL:http://a/b/c/h>
      http:g   = <URL:http://a/b/c/g>
      http:    = <URL:http://a/b/c/d>
      http:?y  = <URL:http://a/b/c/d?y>
      http:g?y = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

def test():
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED', words[2], '!!!!!!!!!!')

if __name__ == '__main__':
    test()