blob: 8bba1500e35b76d97c8a0d32cba63366c9a698ae [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
11RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any changes to urlparse module
23should conform to this. urlparse module is not entirely compliant with this.
The de facto scenarios of parsing are sometimes considered, and for backward
compatibility purposes, older RFC uses of parsing are retained. The test cases
in test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000027"""
28
Facundo Batista2ac5de22008-07-07 18:24:11 +000029import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000030import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000031
# Names exported by a star-import of this module.  urlencode is a public,
# documented function defined below and therefore belongs in this list.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
# Scheme classification tables.  The empty string '' stands for "no scheme"
# and makes the corresponding rule apply by default.  These are plain lists
# so that callers can keep extending them at run time.

# Schemes whose URLs may be resolved relative to a base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes for which a network location part is written out.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs',
]

# Schemes without a hierarchical path.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap',
    'snews', 'sip', 'sips',
]

# Schemes whose path may carry ';params'.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]

# Schemes for which '?query' is split off.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]

# Schemes for which '#fragment' is split off.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results; the cache is wiped once it holds this
# many entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
64
def clear_cache():
    """Empty the module-level URL parse cache in place."""
    _parse_cache.clear()
68
69
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property is derived from ``self.netloc``, read as
    ``[user[:password]@]host[:port]``.  The host may be a bracketed IPv6
    literal such as ``[::1]``; the brackets are delimiters and are not part
    of the reported hostname.
    """

    @staticmethod
    def _split_userinfo(netloc):
        """Return (userinfo, hostport); userinfo is None when no '@' exists."""
        # The *last* '@' separates userinfo from the host part.
        userinfo, at, hostport = netloc.rpartition("@")
        if not at:
            return None, hostport
        return userinfo, hostport

    @staticmethod
    def _split_hostport(hostport):
        """Return (host, port) as strings; port is None when no ':' follows."""
        if hostport[:1] == "[":
            host, bracket, rest = hostport[1:].partition("]")
            if bracket:
                # Colons inside the brackets belong to the address; only a
                # colon after the closing bracket introduces the port.
                _, colon, port = rest.partition(":")
                return host, (port if colon else None)
            # No closing bracket: fall through to the plain host:port rule.
        host, colon, port = hostport.partition(":")
        return host, (port if colon else None)

    @property
    def username(self):
        """User name from the netloc, or None if there is no '@'."""
        userinfo = self._split_userinfo(self.netloc)[0]
        if userinfo is None:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password from the netloc, or None if no 'user:password@' is present."""
        userinfo = self._split_userinfo(self.netloc)[0]
        if userinfo is None:
            return None
        _, colon, password = userinfo.partition(":")
        return password if colon else None

    @property
    def hostname(self):
        """Lower-cased host name (IPv6 brackets stripped), or None if empty."""
        hostport = self._split_userinfo(self.netloc)[1]
        host = self._split_hostport(hostport)[0]
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when absent or empty.

        Raises ValueError if the port text is not a base-10 integer.
        """
        hostport = self._split_userinfo(self.netloc)[1]
        port = self._split_hostport(hostport)[1]
        if not port:
            # Covers both "host" and "host:" (empty port).
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111from collections import namedtuple
112
class SplitResult(
        namedtuple('SplitResult', 'scheme netloc path query fragment'),
        ResultMixin):
    """5-tuple result of urlsplit() with the ResultMixin accessors."""

    # No per-instance __dict__: instances stay plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunsplit()."""
        return urlunsplit(self)
119
120
class ParseResult(
        namedtuple('ParseResult', 'scheme netloc path params query fragment'),
        ResultMixin):
    """6-tuple result of urlparse() with the ResultMixin accessors."""

    # No per-instance __dict__: instances stay plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunparse()."""
        return urlunparse(self)
127
128
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).

    The heavy lifting is done by urlsplit(); this function additionally
    peels ';params' off the path for schemes listed in uses_params.
    Components are not broken up further (netloc stays a single string)
    and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
142
def _splitparams(url):
    """Split ';params' off a path and return (path, params).

    When the path contains a '/', only a ';' at or after the last '/' starts
    the parameters.  If no such ';' exists, the path is returned unchanged
    with empty params.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    # Checked for both branches: a -1 index must never be used for slicing,
    # otherwise the last character of the path would be chopped off.
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
151
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, scanning from index *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string if none of them occurs.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
159
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).

    Results are memoized in the module-level parse cache, keyed on the
    arguments and their types.  Components are not broken up further
    (netloc stays a single string) and % escapes are not expanded.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':  # optimize the common case
            scheme = prefix.lower()
            url = url[colon+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result
        # Only treat the prefix as a scheme if every character is legal.
        if all(ch in scheme_chars for ch in prefix):
            scheme = prefix.lower()
            url = url[colon+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
202
def urlunparse(components):
    """Combine a 6-item (scheme, netloc, path, params, query, fragment)
    sequence back into a URL string.

    The result may differ slightly from the URL that was parsed, but is
    equivalent: redundant delimiters (e.g. a '?' with an empty query) are
    not reproduced.
    """
    scheme, netloc, path, params, query, fragment = components
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
212
def urlunsplit(components):
    """Combine a 5-item (scheme, netloc, path, query, fragment) sequence
    into a URL string.

    A '//netloc' part is written when a netloc is given, or when the scheme
    is listed in uses_netloc and the path does not already start with '//'.
    Empty scheme, query and fragment are omitted together with their
    delimiters.
    """
    scheme, netloc, path, query, fragment = components
    result = path
    if netloc or (scheme and scheme in uses_netloc and result[:2] != '//'):
        # A non-empty path following an authority must be absolute.
        if result and result[:1] != '/':
            result = '/' + result
        result = '//' + (netloc or '') + result
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
225
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Returns *url* unchanged when *base* is empty, when the two schemes
    differ, or when the scheme is not listed in uses_relative; returns
    *base* when *url* is empty.  *allow_fragments* is passed through to
    urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The relative URL inherits the base scheme when it names none itself.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing is taken
            # from the base beyond the (already inherited) scheme.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it as is, on the base's netloc.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, then fill in params and query
        # from the base where the reference does not supply them.
        path = bpath
        if not params:
            params = bparams
        else:
            # NOTE(review): this drops the last character of the inherited
            # base path when the reference has its own params; the intent
            # is not evident from this code -- confirm before changing.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference, then resolve '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' becomes an empty segment (trailing slash).
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first interior "<name>/.." pair until no such
    # pair remains; the first and last segments are never the '..' examined.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in the final position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
282
def urldefrag(url):
    """Strip the fragment from *url*.

    Returns a (defragmented_url, fragment) tuple.  When *url* contains no
    '#', it is returned untouched together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    without_fragment = urlunparse((scheme, netloc, path, params, query, ''))
    return without_fragment, fragment
296
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes-like object.
    Only a '%' followed by exactly two hexadecimal digits is decoded; any
    other '%' is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Validate explicitly: int(x, 16) alone would also accept a single
        # digit, a sign or surrounding whitespace (e.g. '%a', '%+1').
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000312
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Only a '%' followed by exactly two hexadecimal digits counts as an
    escape; any other '%' is kept literally.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent %xx escapes.  They are decoded
    # together because one character may span several escapes.
    pending = bytearray()
    for item in pieces[1:]:
        code = item[:2]
        # Validate explicitly: bytes.fromhex() skips whitespace, so it would
        # swallow input such as '%  '.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            tail = item[2:]
        else:
            tail = '%' + item
        if tail:
            # Literal text ends the run of escapes: flush it first.
            if pending:
                out.append(bytes(pending).decode(encoding, errors))
                pending = bytearray()
            out.append(tail)
    if pending:
        # Flush a run that reaches the end of the string.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
353
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: if true, blank values are kept as empty strings;
        if false (the default), fields with blank values are dropped.

    strict_parsing: if true, parsing errors raise ValueError; if false
        (the default), they are silently ignored.

    Values appear in each list in the order parse_qsl() yields them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
379
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed; fields are separated by
        '&' or ';'.

    keep_blank_values: if true, blank values are kept as empty strings;
        if false (the default), fields with blank values are dropped.

    strict_parsing: if true, a field without '=' (including an empty
        field) raises ValueError; if false (the default), such fields are
        skipped or, with keep_blank_values, given an empty value.

    In names and values '+' becomes a space and %xx escapes are decoded.
    """
    fields = [part for chunk in qs.split('&') for part in chunk.split(';')]
    pairs = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            pairs.append((unquote(name.replace('+', ' ')),
                          unquote(value.replace('+', ' '))))
    return pairs
419
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every '+' into a space, as needed
    when decoding HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
428
# Byte values that are never percent-encoded: ASCII letters, digits and
# the three punctuation characters '_', '.' and '-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# Cache of Quoter instances, keyed by the bytes value of their 'safe' set.
_safe_quoters = {}
434
class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte below 128 that is in the safe set (the always-safe characters
    plus those passed to the constructor) maps to its own character;
    every other byte maps to its '%XX' percent-encoding.
    """
    # Results are stored in the dict itself, so a repeated lookup of the
    # same byte is an ordinary dict hit and never reaches __missing__.

    def __init__(self, safe):
        """safe: bytes object."""
        extra = {c for c in safe if c < 128}
        self.safe = _ALWAYS_SAFE | extra

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
456
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode *string*.  Each part of a URL (path, query, ...) has its
    own set of reserved characters; RFC 2396 lists

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    as reserved in some component, though not in all of them.  The default
    safe='/' targets the path component, where existing slashes act as
    separators and must be left alone.

    string and safe may be either str or bytes objects.  encoding and
    errors say how a str is turned into bytes (as for str.encode) and
    default to 'utf-8' and 'strict'; passing either of them together with
    a non-str *string* raises TypeError.
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    chosen_encoding = 'utf-8' if encoding is None else encoding
    chosen_errors = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(chosen_encoding, chosen_errors),
                            safe)
498
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe defaults to ''.
    """
    # Without spaces the plain quote() already gives the right answer.
    # string may be str or bytes; any other type takes the general path.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True
    if not may_have_space:
        return quote(string, safe, encoding, errors)
    # Let spaces pass through quote() untouched (matching the type of
    # safe), then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000515
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError if *bs* is neither bytes nor bytearray.
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter per distinct safe set, created on first use.
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join(quoter[byte] for byte in bs)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000533
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values that are bytes are quoted byte for byte; everything
    else is converted with str() first.  str and bytes values are always
    treated as single values, even when doseq is true.

    Raises TypeError if query is neither a mapping nor a sequence of
    tuples (a non-empty string, for instance).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def encode(obj):
        # bytes must not go through str(): that would quote their repr
        # ("b'...'") instead of their content.
        if isinstance(obj, bytes):
            return quote_plus(obj)
        return quote_plus(str(obj))

    pairs = []
    for k, v in query:
        k = encode(k)
        if not doseq:
            pairs.append(k + '=' + encode(v))
        elif isinstance(v, str):
            pairs.append(k + '=' + quote_plus(v))
        elif isinstance(v, bytes):
            # A bytes value is one value, not a sequence of integers.
            pairs.append(k + '=' + quote_plus(v))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + encode(v))
            else:
                # loop over the sequence
                for elt in v:
                    pairs.append(k + '=' + encode(elt))
    return '&'.join(pairs)
589
590# Utilities to parse URLs (most of these return None for missing parts):
591# unwrap('<URL:type://host/path>') --> 'type://host/path'
592# splittype('type:opaquestring') --> 'type', 'opaquestring'
593# splithost('//host[:port]/path') --> 'host[:port]', '/path'
594# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
595# splitpasswd('user:passwd') -> 'user', 'passwd'
596# splitport('host:port') --> 'host', 'port'
597# splitquery('/path?query') --> '/path', 'query'
598# splittag('/path#tag') --> '/path', 'tag'
599# splitattr('/path;attr1=value1;attr2=value2;...') ->
600# '/path', ['attr1=value1', 'attr2=value2', ...]
601# splitvalue('attr=value') --> 'attr', 'value'
602# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
604
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Check that a str URL is pure ASCII and return it (still as a str).
    Objects that are not str are returned untouched.  Raises UnicodeError
    if the URL contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
617
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str(); surrounding whitespace, one pair
    of enclosing angle brackets and a leading 'URL:' marker are removed.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
625
# Compiled lazily on the first call to splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  If the URL does not start with a
    'scheme:' prefix free of '/' and ':', (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match is anchored at 0, so its end is just past the colon.
    return found.group(1).lower(), url[found.end():]
639
# Compiled lazily on the first call to splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part runs from the leading '//' up to the next '/' or '?'.
    Without a leading '//', (None, url) is returned.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
651
# Compiled lazily on the first call to splituser().
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; both parts are passed through
    unquote().  Always returns a 2-tuple; without an '@' it is
    (None, host).
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Build a real tuple: a bare map() object is a one-shot iterator
        # that cannot be indexed, unlike the tuple of the other branch.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
663
# Compiled lazily on the first call to splitpasswd().
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password may contain further
    colons and newlines.  Without a ':', (user, None) is returned.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
675
# splitport('host:port') --> 'host', 'port'
# Compiled lazily on the first call to splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  If the argument does not
    end in ':' followed by at least one digit, (host, None) is returned.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
688
# Compiled lazily on the first call to splitnport().
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    Returns (host, defport) when the argument contains no ':' (defport
    defaults to -1), (host, int_port) when a valid number follows the last
    ':', and (host, None) when the text after that ':' is empty or not a
    number.
    """
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port_text = found.group(1, 2)
    number = None
    if port_text:
        try:
            number = int(port_text)
        except ValueError:
            # Not a number: report None.
            pass
    return host, number
710
# Compiled lazily on the first call to splitquery().
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?', (url, None) is
    returned.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' in a normal literal is an invalid escape
        # sequence.  The compiled pattern itself is unchanged.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
722
# Compiled lazily on the first call to splittag().
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#', (url, None) is
    returned.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
734
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when the URL contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
740
# Compiled lazily on the first call to splitvalue().
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without an '=', (attr, None) is
    returned.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
752
# Built-in data for test().  The first non-blank line is the base URL; each
# following line is "<relative-url> = <expected result of joining it to the
# base, wrapped as <URL:...>>".  test() splits lines on whitespace, so the
# amount of spacing inside this string is irrelevant.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
786
def test():
    """Command-line self-test for urlparse() and urljoin().

    Input lines come from the file named by the first command-line
    argument ('-' means standard input) or, without arguments, from the
    built-in test_input.  The first URL read becomes the base for all
    later joins.  For every line the parsed and the joined URL are printed;
    if the line has the form "url = expected" and the joined URL differs
    from the expectation, an EXPECTED line is printed as well.
    """
    base = ''
    opened = None  # file opened here, which therefore has to be closed here
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # sys.stdin and the in-memory buffer are left alone.
        if opened is not None:
            opened.close()
813
# Run the built-in self-test when this file is executed as a script.
if __name__ == '__main__':
    test()