"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine,
June 1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter,
M. McCahill, December 1994.

RFC 3986 is considered the current standard, and any changes to the urlparse
module should conform to it.  The module is not yet entirely compliant with
RFC 3986: de facto parsing behavior is honored in some cases, and parsing
rules from the older RFCs are retained for backward compatibility.  The test
cases in test_urlparse.py provide a good indicator of parsing behavior.
"""

import sys
import collections

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            netloc = netloc.split(":", 1)[0]
        return netloc.lower() or None

    @property
    def port(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            port = netloc.split(":", 1)[1]
            return int(port, 10)
        return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
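
# A hedged usage sketch (not part of the original module).  urlparse() returns
# the 6-tuple described above; ';params' is only split out of the path for
# schemes listed in uses_params (output wrapped here for readability):
#   >>> urlparse('http://netloc/path;param?query=arg#frag')
#   ParseResult(scheme='http', netloc='netloc', path='/path', params='param',
#               query='query=arg', fragment='frag')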

def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
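
# A hedged usage sketch (not part of the original module).  urlsplit() is like
# urlparse() but never splits off ';params', so they remain in the path
# (output wrapped here for readability):
#   >>> urlsplit('http://netloc/path;param?query=arg#frag')
#   SplitResult(scheme='http', netloc='netloc', path='/path;param',
#               query='query=arg', fragment='frag')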

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = components
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(components):
    scheme, netloc, url, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
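
# A hedged round-trip sketch (not part of the original module): urlunsplit()
# and urlunparse() rebuild a URL from the tuples produced above.
#   >>> urlunsplit(('http', 'netloc', '/path', 'query=arg', 'frag'))
#   'http://netloc/path?query=arg#frag'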

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
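
# A hedged usage sketch (not part of the original module); these mirror cases
# from the RFC 1808 derived test_input table at the bottom of this module.
#   >>> urljoin('http://a/b/c/d', 'g')
#   'http://a/b/c/g'
#   >>> urljoin('http://a/b/c/d', '../g')
#   'http://a/b/g'
#   >>> urljoin('http://a/b/c/d', '//g')
#   'http://g'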

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
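
# A hedged usage sketch (not part of the original module):
#   >>> urldefrag('http://a/b/c/d#frag')
#   ('http://a/b/c/d', 'frag')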

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8.  This is only an issue if the string
    # contains unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    res[0] = res[0]
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = bytes([int(item[:2], 16)]) + item[2:]
        except ValueError:
            res[i] = b'%' + item
    return b''.join(res)
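
# A hedged usage sketch (not part of the original module): percent escapes
# decode to raw bytes, so multi-byte UTF-8 sequences stay undecoded here.
#   >>> unquote_to_bytes('a%C3%A9b')
#   b'a\xc3\xa9b'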

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None: encoding = 'utf-8'
    if errors is None: errors = 'replace'
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    # (list of single-byte bytes objects)
    pct_sequence = []
    res = string.split('%')
    for i in range(1, len(res)):
        item = res[i]
        try:
            if not item: raise ValueError
            pct_sequence.append(bytes.fromhex(item[:2]))
            rest = item[2:]
        except ValueError:
            rest = '%' + item
        if not rest:
            # This segment was just a single percent-encoded character.
            # May be part of a sequence of code units, so delay decoding.
            # (Stored in pct_sequence).
            res[i] = ''
        else:
            # Encountered non-percent-encoded characters.  Flush the current
            # pct_sequence.
            res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
            pct_sequence = []
    if pct_sequence:
        # Flush the final pct_sequence
        # res[-1] will always be empty if pct_sequence != []
        assert not res[-1], "string=%r, res=%r" % (string, res)
        res[-1] = b''.join(pct_sequence).decode(encoding, errors)
    return ''.join(res)
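
# A hedged usage sketch (not part of the original module): contiguous escapes
# are decoded together, so multi-byte UTF-8 sequences come out as a single
# character; undecodable bytes (e.g. unquote('%E9')) become U+FFFD under the
# default errors='replace'.
#   >>> unquote('a%C3%A9b')
#   'aéb'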

def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result
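
# A hedged usage sketch (not part of the original module): repeated names
# collect into one list; blank values are dropped unless keep_blank_values
# is true.
#   >>> parse_qs('a=1&a=2&b=')
#   {'a': ['1', '2']}
#   >>> sorted(parse_qs('a=1&a=2&b=', keep_blank_values=True).items())
#   [('a', ['1', '2']), ('b', [''])]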

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
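
# A hedged usage sketch (not part of the original module): both '&' and ';'
# are accepted as pair separators, and plus signs decode to spaces.
#   >>> parse_qsl('a=1&b=two+words;c=%7e')
#   [('a', '1'), ('b', 'two words'), ('c', '~')]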

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key is < 128 and
    in the "safe" set (either the specified safe set, or the default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss.  Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding must
    not be specified if string is a str.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
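
# A hedged usage sketch (not part of the original module): '/' is safe by
# default, and non-ASCII text is UTF-8 encoded before percent-encoding.
#   >>> quote('/El Niño/')
#   '/El%20Ni%C3%B1o/'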

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
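
# A hedged usage sketch (not part of the original module):
#   >>> quote_plus('a b&c')
#   'a+b%26c'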

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    try:
        quoter = _safe_quoters[cachekey]
    except KeyError:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join([quoter[char] for char in bs])

def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts, that type of behavior probably should be
            # preserved for consistency.
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
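
# A hedged usage sketch (not part of the original module): values are
# stringified and quoted with quote_plus(); doseq expands sequence values
# into one pair per element.
#   >>> urlencode([('a', 1), ('b', 'x y')])
#   'a=1&b=x+y'
#   >>> urlencode([('k', ['v1', 'v2'])], doseq=True)
#   'k=v1&k=v2'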

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    # Return a tuple (not a lazy map object) so both branches match the
    # documented 'user', 'host' return shape.
    if match: return tuple(map(unquote, match.group(1, 2)))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
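
# A hedged usage sketch (not part of the original module):
#   >>> splitnport('example.com:8080')
#   ('example.com', 8080)
#   >>> splitnport('example.com')
#   ('example.com', -1)
#   >>> splitnport('example.com:x')
#   ('example.com', None)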

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y        = <URL:http://a/b/c/d?y>
      http:g?y       = <URL:http://a/b/c/g?y>
      http:g?y/./x   = <URL:http://a/b/c/g?y/./x>
"""

def test():
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED', words[2], '!!!!!!!!!!')

if __name__ == '__main__':
    test()