blob: 27b732b78cda3c67acd7ecd01ac99643c96a8095 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00008RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
9Berners-Lee, R. Fielding, and L. Masinter, August 1998.
10
11RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
12
13RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
141995.
15
16RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
17McCahill, December 1994
18
RFC 3986 is considered the current standard and any changes to urlparse module
should conform to this.  urlparse module is not entirely compliant with this.
The de facto scenarios of parsing are sometimes considered and, for backward
compatibility purposes, older RFC uses of parsing are retained. The test cases
in test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000024"""
25
Facundo Batista2ac5de22008-07-07 18:24:11 +000026import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000027import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000028
# Names exported by a star-import of this module.  urlencode() is defined
# in this file without a leading underscore, so it is exported together
# with the quote*/unquote* family.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033
# A classification of schemes ('' means apply by default).
# Each list is consulted with a plain ``scheme in <list>`` test, so every
# entry must be the exact, whitespace-free scheme name.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
52
# The complete alphabet allowed in a scheme name: ASCII letters, decimal
# digits, and the three punctuation characters '+', '-' and '.'.
scheme_chars = ''.join(['abcdefghijklmnopqrstuvwxyz',
                        'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '0123456789',
                        '+-.'])
58
# Number of memoized results at which urlsplit() empties the cache.
MAX_CACHE_SIZE = 20
# Memo of split results, keyed by the arguments that produced them.
_parse_cache = dict()


def clear_cache():
    """Discard every memoized parse result (the dict is emptied in place)."""
    _parse_cache.clear()
65
66
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must expose a ``netloc`` string of the general form
    ``[user[:password]@]host[:port]``.  Each property below derives its
    value from that string on access.
    """

    def _hostinfo(self):
        """Return (host, port_text) taken from the part of netloc after '@'.

        A bracketed host such as ``[::1]:80`` is returned without the
        brackets, and only a ':' *after* the closing bracket introduces
        the port.  port_text is '' when no port is present.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            # IPv6-style literal: colons inside the brackets are part of
            # the host, not a port separator.
            host, _, trailer = bracketed.partition(']')
            port_text = trailer.partition(':')[2]
        else:
            host, _, port_text = hostinfo.partition(':')
        return host, port_text

    @property
    def username(self):
        """Text before the first ':' of the userinfo, or None without '@'."""
        userinfo, have_info, _ = self.netloc.rpartition('@')
        if not have_info:
            return None
        return userinfo.partition(':')[0]

    @property
    def password(self):
        """Text after the first ':' of the userinfo, or None if absent."""
        userinfo, have_info, _ = self.netloc.rpartition('@')
        if not have_info:
            return None
        _, have_password, password = userinfo.partition(':')
        return password if have_password else None

    @property
    def hostname(self):
        """Lower-cased host without userinfo, port or brackets, or None."""
        return self._hostinfo()[0].lower() or None

    @property
    def port(self):
        """Port as an int, or None when no (or an empty) port is given.

        Raises ValueError if the port text is not a base-10 integer.
        """
        port_text = self._hostinfo()[1]
        if not port_text:
            # Covers both "host" and "host:" (delimiter with nothing after).
            return None
        return int(port_text, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000107
108from collections import namedtuple
109
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """Five-field tuple (scheme, netloc, path, query, fragment)."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string with urlunsplit()."""
        return urlunsplit(self)
116
117
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """Six-field tuple (scheme, netloc, path, params, query, fragment)."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string with urlunparse()."""
        return urlunparse(self)
124
125
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult, i.e. the 6-tuple
    (scheme, netloc, path, params, query, fragment).  Components are not
    broken up any further (netloc stays a single string) and % escapes
    are left untouched.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Only schemes listed in uses_params carry ';params' on the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
139
def _splitparams(url):
    """Split *url* into (path, params) at a ';'.

    When the path contains a '/', only a ';' at or after the last '/'
    counts; if there is none the params part is ''.  Without any '/',
    the first ';' is used.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        cut = url.find(';', last_slash)
        if cut == -1:
            return url, ''
    else:
        cut = url.find(';')
    return url[:cut], url[cut + 1:]
148
def _splitnetloc(url, start=0):
    """Return (netloc, rest) where netloc begins at index *start*.

    The netloc runs up to the earliest '/', '?' or '#' found at or after
    *start*, or to the end of *url* when none of them occurs.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
156
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult, i.e. the 5-tuple
    (scheme, netloc, path, query, fragment).  Components are not broken
    up any further (netloc stays a single string) and % escapes are left
    untouched.  Results are memoized in the module-level parse cache.
    """
    allow_fragments = bool(allow_fragments)
    # Argument types are part of the key so equal-comparing values of
    # different types do not share a cache slot.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':  # optimize the common case
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = result
            return result
        # The text before the first ':' is a scheme only if every
        # character belongs to scheme_chars.
        if all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
199
def urlunparse(components):
    """Rebuild a URL string from a 6-item (scheme, netloc, path, params,
    query, fragment) iterable.

    Non-empty params are re-attached to the path with ';' and the rest of
    the work is delegated to urlunsplit().  The result may differ slightly
    from, while being equivalent to, the URL that was originally parsed
    when that URL had redundant delimiters, e.g. a ? with an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
209
210def urlunsplit(components):
Senthil Kumaran930049b2010-06-28 14:12:18 +0000211 """Combine the elements of a tuple as returned by urlsplit() into a
212 complete URL as a string. The data argument can be any five-item iterable.
213 This may result in a slightly different, but equivalent URL, if the URL that
214 was parsed originally had unnecessary delimiters (for example, a ? with an
215 empty query; the RFC states that these are equivalent)."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000216 scheme, netloc, url, query, fragment = components
217 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
218 if url and url[:1] != '/': url = '/' + url
219 url = '//' + (netloc or '') + url
220 if scheme:
221 url = scheme + ':' + url
222 if query:
223 url = url + '?' + query
224 if fragment:
225 url = url + '#' + fragment
226 return url
227
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty, the other one is returned unchanged.
    *allow_fragments* is passed through to urlparse() for both URLs.
    """
    # Nothing to resolve against, or nothing to resolve.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme serves as the default scheme of the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one without a relative form, means the
    # reference is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A reference with its own netloc only borrows the scheme.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, and fall back to the base's
        # params/query where the reference supplies none.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append
    # the reference's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment so the result keeps its
    # trailing '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first interior "<name>/.." pair; the scan
    # restarts after each deletion and stops when none is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in final position, which the loop above skips.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
284
def urldefrag(url):
    """Split off the fragment of *url*.

    Returns a (url_without_fragment, fragment) tuple.  When *url* holds
    no '#' it is returned unchanged together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
298
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object; a str is first encoded
    as UTF-8.  Only a '%' followed by exactly two hexadecimal digits is
    decoded; any other '%' is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # int() alone is too lenient: it would accept one-digit, signed
        # or space-padded text such as b'a', b'+1' or b'1 '.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            res.append(bytes([int(code, 16)]) + item[2:])
        else:
            res.append(b'%' + item)
    return b''.join(res)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000314
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Only a '%' followed by exactly two hexadecimal digits is an escape;
    any other '%' is kept literally.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent %xx escapes.  They are decoded
    # together because one character may span several escapes.
    pending = bytearray()
    for item in pieces[1:]:
        code = item[:2]
        # Checked by hand because bytes.fromhex() skips whitespace, so
        # '%' followed by two spaces would otherwise vanish.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the run: flush it, then emit the text.
            out.append(bytes(pending).decode(encoding, errors) + rest)
            pending = bytearray()
    if pending:
        # Flush the run that reaches the end of the input.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
355
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of
    its values, in order of appearance.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
381
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Fields are separated by '&' or ';'.  In each field '+' is turned into
    a space and %xx escapes are decoded with unquote().

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of 2-tuples in input order.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))
    result = []
    for field in fields:
        # Empty fields are skipped unless strict parsing should flag them.
        if not (field or strict_parsing):
            continue
        raw_name, have_eq, raw_value = field.partition('=')
        if not have_eq:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control-name with no equal sign only survives as a blank
            # value when blanks are kept.
            if not keep_blank_values:
                continue
        if raw_value or keep_blank_values:
            name = unquote(raw_name.replace('+', ' '))
            value = unquote(raw_value.replace('+', ' '))
            result.append((name, value))

    return result
421
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    The '+' replacement happens before the %xx escapes are decoded.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
430
# Byte values that are never percent-encoded: ASCII letters, digits and
# the characters '_', '.' and '-'.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Quoter instances, keyed by the bytes of their 'safe' set.
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Results are cached in the dict itself: only the first lookup of a
    # byte value reaches __missing__; later lookups are plain dict hits.

    def __init__(self, safe):
        """safe: bytes object."""
        extra_safe = frozenset(c for c in safe if c < 128)
        self.safe = _ALWAYS_SAFE | extra_safe

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
458
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and
    errors may only be given when string is a str; passing either one
    together with a non-str string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes-like: there is nothing to encode, so the text
        # encoding options make no sense here.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    encoded = string.encode('utf-8' if encoding is None else encoding,
                            'strict' if errors is None else errors)
    return quote_from_bytes(encoded, safe)
500
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # string may be a str or a bytes object.  Without any space in it,
    # plain quote() already produces the final answer.
    if isinstance(string, str):
        spaceless = ' ' not in string
    elif isinstance(string, bytes):
        spaceless = b' ' not in string
    else:
        spaceless = False
    if spaceless:
        return quote(string, safe, encoding, errors)
    # Let the space through quote() untouched, then turn it into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000517
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter per distinct 'safe' set, created on first use.
    if cachekey not in _safe_quoters:
        _safe_quoters[cachekey] = Quoter(safe)
    lookup = _safe_quoters[cachekey].__getitem__
    return ''.join(map(lookup, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000535
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be str or bytes; anything else is converted with
    str().  bytes are quoted with quote_plus(item, safe); everything else
    additionally gets the encoding and errors arguments.

    Raises TypeError when query is neither a mapping nor a sequence of
    tuples (a non-empty string, for example).
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so the first
        # element is checked for being a tuple.  Empty sequences of any
        # type pass, which keeps empty dicts and lists working.
        try:
            # non-sequence items fail on len(); non-empty strings fail
            # the tuple check
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as exc:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                exc.__traceback__)

    def encode(item):
        # bytes are quoted as they are; anything else goes through str().
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    pairs = []
    for key, value in query:
        key = encode(key)
        if not doseq:
            encoded_values = [encode(value)]
        elif isinstance(value, bytes):
            encoded_values = [quote_plus(value, safe)]
        elif isinstance(value, str):
            encoded_values = [quote_plus(value, safe, encoding, errors)]
        else:
            try:
                # Anything supporting len() is treated as a sequence.
                len(value)
            except TypeError:
                # not a sequence
                encoded_values = [encode(value)]
            else:
                # one output parameter per element
                encoded_values = [encode(elt) for elt in value]
        pairs.extend(key + '=' + item for item in encoded_values)
    return '&'.join(pairs)
613
614# Utilities to parse URLs (most of these return None for missing parts):
615# unwrap('<URL:type://host/path>') --> 'type://host/path'
616# splittype('type:opaquestring') --> 'type', 'opaquestring'
617# splithost('//host[:port]/path') --> 'host[:port]', '/path'
618# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
619# splitpasswd('user:passwd') -> 'user', 'passwd'
620# splitport('host:port') --> 'host', 'port'
621# splitquery('/path?query') --> '/path', 'query'
622# splittag('/path#tag') --> '/path', 'tag'
623# splitattr('/path;attr1=value1;attr2=value2;...') ->
624# '/path', ['attr1=value1', 'attr2=value2', ...]
625# splitvalue('attr=value') --> 'attr', 'value'
626# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
628
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is round-tripped through ASCII and returned as a str; it raises
    UnicodeError if it holds non-ASCII characters.  Any other object is
    returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
641
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped; a surrounding
    '<...>' pair and a leading 'URL:' are removed when present, stripping
    whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
649
# Compiled on the first splittype() call.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  Without a leading 'type:' (one
    or more characters other than '/' and ':' followed by ':'), the
    result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match starts at index 0, so end() is just past the ':'.
    return found.group(1).lower(), url[found.end():]
663
# Compiled on the first splithost() call.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/' or '?'.  If url does not match
    the '//host...' pattern, the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
675
# Compiled on the first splituser() call.
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@' and both parts are passed through
    unquote().  Without an '@' the result is (None, host).  The return
    value is a 2-tuple in both cases.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match is None:
        return None, host
    # map() yields a one-shot iterator; build a real tuple so both
    # branches return the same type.
    return tuple(map(unquote, match.group(1, 2)))
687
# Compiled on the first splitpasswd() call.
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
699
# splitport('host:port') --> 'host', 'port'
# Compiled on the first splitport() call.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string.  It is only split off when the text
    after the last ':' consists of one or more digits; otherwise the
    result is (host, None).
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
712
# Compiled on the first splitnport() call.
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port_text = found.group(1, 2)
    try:
        # An empty port counts as "not a valid number".
        nport = int(port_text) if port_text else None
    except ValueError:
        nport = None
    return host, nport
733 return host, defport
734
# Compiled on the first splitquery() call.
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' is not a valid escape in a normal str literal.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
746
# Compiled on the first splittag() call.
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
758
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
764
# Compiled on the first splitvalue() call.
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
776
# Default input for test().  The first non-blank line is the base URL.
# Every later line holds a reference to join against that base and,
# after '=', the expected result wrapped as <URL:...>.  test() splits
# each line on whitespace, so the column layout is not significant.
test_input = """
 http://a/b/c/d

 g:h = <URL:g:h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 g = <URL:http://a/b/c/g>
 ./g = <URL:http://a/b/c/g>
 g/ = <URL:http://a/b/c/g/>
 /g = <URL:http://a/g>
 //g = <URL:http://g>
 ?y = <URL:http://a/b/c/d?y>
 g?y = <URL:http://a/b/c/g?y>
 g?y/./x = <URL:http://a/b/c/g?y/./x>
 . = <URL:http://a/b/c/>
 ./ = <URL:http://a/b/c/>
 .. = <URL:http://a/b/>
 ../ = <URL:http://a/b/>
 ../g = <URL:http://a/b/g>
 ../.. = <URL:http://a/>
 ../../g = <URL:http://a/g>
 ../../../g = <URL:http://a/../g>
 ./../g = <URL:http://a/b/g>
 ./g/. = <URL:http://a/b/c/g/>
 /./g = <URL:http://a/./g>
 g/./h = <URL:http://a/b/c/g/h>
 g/../h = <URL:http://a/b/c/h>
 http:g = <URL:http://a/b/c/g>
 http: = <URL:http://a/b/c/d>
 http:?y = <URL:http://a/b/c/d?y>
 http:g?y = <URL:http://a/b/c/g?y>
 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
810
def test():
    """Parse and join every URL of a test table, printing the results.

    The table is read from the file named by sys.argv[1] ('-' means
    stdin) or, without arguments, from test_input.  The first URL becomes
    the base for all later ones.  For a line of the form
    ``url = <URL:expected>`` a mismatch is reported with an 'EXPECTED'
    line.
    """
    base = ''
    opened_here = False
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
            opened_here = True
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Close only a file this function opened; stdin is left alone.
        if opened_here:
            fp.close()

if __name__ == '__main__':
    test()