blob: 4f48b258bb61423da1341768e4f6dd0eba568e1b [file] [log] [blame]
"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any changes to the urlparse
module should conform to it. The urlparse module is not entirely compliant
with it: de facto parsing scenarios are sometimes taken into account and, for
backward compatibility purposes, parsing behavior from the older RFCs is
retained. The test cases in test_urlparse.py provide a good indicator of
parsing behavior.
"""
28
Facundo Batista2ac5de22008-07-07 18:24:11 +000029import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000030import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000031
# Names exported by a star-import of this module.  ``urlencode`` is a public,
# documented function defined further down in this file and therefore belongs
# in the export list alongside the other quoting/parsing helpers.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
    "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
# Scheme classification tables.  The empty string '' stands for "no scheme
# given", i.e. the behaviour applied by default.  These are plain mutable
# lists, and their order is kept exactly as before.

# Schemes that support relative references (consulted by urljoin).
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs carry a "//netloc" part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs',
]

# Schemes that are not hierarchical.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes whose path may carry ";params" (consulted by urlparse).
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Schemes for which "?query" is split off (consulted by urlsplit).
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Schemes for which "#fragment" is split off (consulted by urlsplit).
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
55
# Every character that may appear in a URL scheme name: ASCII letters,
# digits and the punctuation "+", "-" and ".".
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)
61
# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied before the next entry is stored.
MAX_CACHE_SIZE = 20
_parse_cache = {}
64
def clear_cache():
    """Empty both module-level caches: parsed URLs and per-safe-set quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000069
70
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    Every property derives its value from ``self.netloc``, which the class
    mixing this in must provide.  The expected layout is
    ``[user[:password]@]host[:port]``, where host may be a bracketed IPv6
    literal such as ``[::1]``.
    """

    @property
    def username(self):
        """User name from the userinfo part, or None if there is no '@'."""
        netloc = self.netloc
        if "@" not in netloc:
            return None
        # The last '@' separates userinfo from the host.
        userinfo = netloc.rsplit("@", 1)[0]
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password from the userinfo part, or None if none is present."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name (IPv6 brackets stripped), or None if empty."""
        hostinfo = self.netloc.split('@')[-1]
        if '[' in hostinfo and ']' in hostinfo:
            # Bracketed IPv6 literal: keep what is between '[' and ']'.
            return hostinfo.split(']')[0][1:].lower()
        if ':' in hostinfo:
            return hostinfo.split(':')[0].lower()
        if hostinfo == '':
            return None
        return hostinfo.lower()

    @property
    def port(self):
        """Port as an int, or None when no port is given.

        An empty port ("host:") counts as absent and yields None rather
        than raising.  A non-numeric port still raises ValueError.
        """
        # Drop userinfo, then anything up to a closing IPv6 bracket, so that
        # the colons inside "[::1]" are not mistaken for the port separator.
        hostinfo = self.netloc.split('@')[-1].split(']')[-1]
        if ':' not in hostinfo:
            return None
        digits = hostinfo.split(':')[1]
        if not digits:
            return None
        return int(digits, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
114from collections import namedtuple
115
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """5-field URL split result, with the ResultMixin accessors added."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL string via urlunsplit."""
        return urlunsplit(self)
122
123
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """6-field URL parse result, with the ResultMixin accessors added."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL string via urlunparse."""
        return urlunparse(self)
130
131
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult (scheme, netloc, path, params, query, fragment).
    The components are not broken down further (netloc stays one string)
    and % escapes are not expanded.  Params are split off the path only for
    schemes listed in uses_params.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
145
def _splitparams(url):
    """Split ';params' off the last path segment of url.

    Return (path, params).  If the url contains a '/', only a ';' at or
    after the last '/' counts; when there is none the result is (url, '').
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        # No '/' at all: split at the first ';'.  Callers only get here
        # after checking that url contains one.
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
154
def _splitnetloc(url, start=0):
    """Return (netloc, rest) where netloc is url[start:] up to the first of
    '/', '?' or '#', and rest is everything from that delimiter on."""
    # Positions of each delimiter kind at or after start (only those found).
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    # With no delimiter at all, the netloc runs to the end of the string.
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
162
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult (scheme, netloc, path, query, fragment).
    The components are not broken down further (netloc stays one string)
    and % escapes are not expanded.  Results are memoized in _parse_cache.
    Raises ValueError("Invalid IPv6 URL") if the netloc contains only one
    of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing values
    # of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # The literal prefix "http" is the common case: it always has its query
    # split off (and its fragment, if allowed) without consulting the
    # uses_query / uses_fragment tables.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            is_http = True
            scheme, url = prefix.lower(), url[colon + 1:]
        elif all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come as a pair (IPv6 literal) or not at all.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and (is_http or scheme in uses_fragment) and '#' in url:
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
213
def urlunparse(components):
    """Combine a 6-sequence (scheme, netloc, path, params, query, fragment)
    back into a URL string.

    The result may differ slightly from the URL originally parsed while
    remaining equivalent, e.g. when the original had redundant delimiters
    such as a '?' followed by an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, with_params, query, fragment))
223
def urlunsplit(components):
    """Combine a 5-sequence (scheme, netloc, path, query, fragment) into a
    URL string.  Empty components and their delimiters are left out."""
    scheme, netloc, path, query, fragment = components
    # A "//" authority marker is written when there is a netloc, or when the
    # scheme is one that uses a netloc and the path does not already start
    # with "//".
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
236
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  url
    is also returned unchanged when its scheme differs from the base's or
    is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, _bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url

    if scheme in uses_netloc:
        if netloc:
            # url carries its own authority: nothing is taken from base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only the authority comes from base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        if params:
            # Params but no path: base path minus its last character.
            return urlunparse((scheme, netloc, bpath[:-1],
                               params, query, fragment))
        # Neither path nor params: inherit both from base, and the query
        # too unless url supplied one.
        return urlunparse((scheme, netloc, bpath,
                           bparams, query or bquery, fragment))

    # Relative path: replace the last segment of the base path with it.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly collapse the first "<name>/.." pair that is not in final
    # position, until no such pair is left.
    while True:
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and segments[idx - 1] not in ('', '..'):
                del segments[idx - 1:idx + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
293
def urldefrag(url):
    """Remove any existing fragment from url.

    Return a tuple (defragmented_url, fragment).  When url contains no '#'
    it is returned unchanged and the fragment is the empty string.
    """
    if '#' not in url:
        return url, ''
    parts = urlparse(url)
    # Rebuild from the first five components with an empty fragment.
    return urlunparse(parts[:5] + ('',)), parts[5]
307
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string may be a str or a bytes-like object.  A str is first encoded as
    UTF-8; this only matters if it contains unescaped non-ASCII characters,
    which URIs should not.  A '%' that is not followed by exactly two
    hexadecimal digits is left in the output untouched.
    """
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    out = [pieces[0]]
    for piece in pieces[1:]:
        code = piece[:2]
        # Check the digits explicitly: int(code, 16) on its own would also
        # accept one-digit, signed or whitespace-padded input such as
        # b'1', b'+1' or b' 1'.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(piece[2:])
        else:
            out.append(b'%')
            out.append(piece)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000323
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  Passing None for either selects its default.
    By default, percent-encoded sequences are decoded with UTF-8, and
    invalid sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is kept
    in the output as-is.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if '%' not in string:
        return string

    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent %xx escapes.  Decoding waits
    # until the run ends because one character may span several escapes.
    pending = bytearray()
    for piece in pieces[1:]:
        code = piece[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace and
        # would silently swallow an input such as '%  '.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            rest = piece[2:]
            if not rest:
                # Only an escape in this piece; the run may continue.
                continue
        else:
            rest = '%' + piece
        if pending:
            out.append(pending.decode(encoding, errors))
            pending = bytearray()
        out.append(rest)
    if pending:
        out.append(pending.decode(encoding, errors))
    return ''.join(out)
364
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed.

    keep_blank_values: if true, blank values in the query are kept as
        empty strings; if false (the default), fields with a blank value
        are dropped as though they were not present.

    strict_parsing: if false (the default), parsing errors are silently
        ignored; if true, they raise a ValueError.

    Values for a repeated name are collected in order of appearance.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
390
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed.  Fields may be separated by
        '&' or by ';'.

    keep_blank_values: if true, blank values in the query are kept as
        empty strings; if false (the default), fields with a blank value
        are dropped as though they were not present.

    strict_parsing: if false (the default), parsing errors are silently
        ignored; if true, they raise a ValueError.

    In names and values '+' is turned into a space before %xx escapes are
    expanded.  The pairs are returned in order of appearance.
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign only survives when
                # blank values are kept (its value is then '').
                if not keep_blank_values:
                    continue
            if raw_value or keep_blank_values:
                name = unquote(raw_name.replace('+', ' '))
                value = unquote(raw_value.replace('+', ' '))
                result.append((name, value))
    return result
430
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first replaced by spaces, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
439
# Byte values that are never percent-encoded, whatever 'safe' set is given.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# One Quoter per distinct 'safe' bytes value; emptied by clear_cache().
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte maps to its own character when it is in the safe set, and to its
    percent-encoded form otherwise.  The safe set is _ALWAYS_SAFE plus the
    ASCII bytes (< 128) of the 'safe' argument.
    """
    # Built on defaultdict so that each byte is computed once in
    # __missing__ and afterwards served by the plain dict lookup, without
    # calling back into Python code.

    def __init__(self, safe):
        """safe: bytes object."""
        ascii_only = (byte for byte in safe if byte < 128)
        self.safe = _ALWAYS_SAFE.union(ascii_only)

    def __repr__(self):
        # Without this override it would display as a defaultdict.
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it and return it.
        quoted = chr(b) if b in self.safe else '%%%02X' % b
        self[b] = quoted
        return quoted
467
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, quote() is meant for the path section of a URL, so it does
    not encode '/': the slashes already present in a path are normally
    there as delimiters.

    string and safe may be either str or bytes objects.  encoding and
    errors are only accepted when string is a str; giving either one
    together with a bytes string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
509
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be a str or a bytes object.  When it is known to contain no
    # space, plain quote() already gives the right answer.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)

    # Keep spaces unquoted for now (using a space of the same type as safe),
    # then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000526
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    bs must be bytes or bytearray, otherwise TypeError is raised.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe': convert to bytes, dropping non-ASCII characters.
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # in case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join([quoter[byte] for byte in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000544
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError if query is neither a mapping nor a non-string
    sequence.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, which is why
        # the first element is inspected.
        try:
            # len() fails on non-sequences, and a non-empty string fails the
            # tuple check.  Zero-length sequences of any type get through,
            # which also keeps empty inputs working.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    for key, value in query:
        prefix = quote_plus(str(key)) + '='
        if not doseq:
            pairs.append(prefix + quote_plus(str(value)))
        elif isinstance(value, str):
            pairs.append(prefix + quote_plus(value))
        else:
            try:
                # Anything that has a length is treated as a sequence.
                len(value)
            except TypeError:
                # Not a sequence: encode it as a single value.
                pairs.append(prefix + quote_plus(str(value)))
            else:
                # One "key=element" parameter per element.
                for elt in value:
                    pairs.append(prefix + quote_plus(str(elt)))
    return '&'.join(pairs)
600
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
615
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Check that a str URL is pure ASCII and return it (still as a str).
    Raises UnicodeError if it is not.  Non-str arguments are returned
    unchanged.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
628
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, then an enclosing '<...>' pair, then a
    leading 'URL:' marker; whitespace is stripped again after each step.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
636
_typeprog = None


def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  When url does not start with a
    'type:' prefix (one or more characters other than '/' and ':' followed
    by ':'), the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    return scheme.lower(), url[len(scheme) + 1:]
650
_hostprog = None


def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/' or '?'.  When url does not start
    with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
662
_userprog = None


def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@' and both parts are passed through
    unquote().  When host contains no '@', the result is (None, host).
    Either way the result is a 2-tuple.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    # Build a real tuple: a bare map() object could be consumed only once
    # and could not be indexed, unlike the tuple returned above.
    return tuple(map(unquote, found.group(1, 2)))
674
_passwdprog = None


def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'; the password may contain any
    character, including newlines.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
686
# splitport('host:port') --> 'host', 'port'
_portprog = None


def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and is only recognized when it is one
    or more digits after the last ':'.  Otherwise the result is
    (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
699
_nportprog = None


def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return (host, defport) if no ':' is found; defport defaults to -1.
    Return (host, port) with port as an int if a valid number follows the
    last ':'.
    Return (host, None) if there is a ':' but no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    try:
        # An empty port is handled like an unparsable one.
        nport = int(port) if port else None
    except ValueError:
        nport = None
    return host, nport
721
_queryprog = None


def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        # Raw string: '\?' in a normal string literal is an invalid escape
        # sequence.  The pattern seen by the re module is unchanged.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
733
_tagprog = None


def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
745
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
751
_valueprog = None


def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept in the module-level global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
763
# Default input for test(): the first non-blank line is the base URL; every
# following line reads "<relative-url> = <expected result of the join>".
# test() splits each line on whitespace, so the column alignment is cosmetic.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
797
def test():
    """Command-line self-test for urlparse() and urljoin().

    Input comes from the file named by the first command-line argument
    ('-' means standard input), or from the built-in test_input when no
    argument is given.  The first non-blank line supplies the base URL.
    For every line the parsed form and the joined result are printed;
    lines of the form "url = <URL:expected>" are also checked, and a
    mismatch is reported with an EXPECTED line.
    """
    args = sys.argv[1:]
    opened = None  # file object this function opened and must close
    if not args:
        from io import StringIO
        fp = StringIO(test_input)
    elif args[0] == '-':
        fp = sys.stdin
    else:
        fp = opened = open(args[0])
    try:
        base = ''
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # The first URL seen becomes the base for all later joins.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Only close what was opened here; never close sys.stdin.
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()