blob: b7890d84dd9ab97d8d2a6ad7b7a9348b93589e0e [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
Senthil Kumaran6ffdb6f2010-04-17 14:47:13 +00008RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
9Berners-Lee, R. Fielding, and L. Masinter, August 1998.
10
11RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
12
13RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
141995.
15
16RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
17McCahill, December 1994
18
19RFC 3986 is considered the current standard and any changes to urlparse module
20should conform to this. urlparse module is not entirely compliant with this.
The de facto scenarios of parsing are sometimes considered and, for backward
compatibility purposes, older RFC uses of parsing are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000024"""
25
Facundo Batista2ac5de22008-07-07 18:24:11 +000026import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000027import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000028
# Public names exported by "from <module> import *".  urlencode is a
# public function of this module and is therefore listed as well.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033
# A classification of schemes ('' means apply by default).
# Each list names the schemes for which the corresponding URL feature
# (relative references, a network location, ;params, ?query, #fragment)
# is honoured by the parsing and joining functions in this module.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# NOTE: the entry for git used to read ' git' (with a leading space), which
# could never match a parsed scheme.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
58
# Upper bound on the number of memoised urlsplit() results; once reached,
# the whole cache is discarded rather than evicting single entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Drop every memoised urlsplit() result from the module-level cache."""
    _parse_cache.clear()
65
66
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    The host class must provide a ``netloc`` string attribute of the form
    ``[user[:password]@]host[:port]``; every property below is derived from
    it on each access.
    """

    @property
    def _userinfo(self):
        """Return ``(username, password)``; either is None when absent."""
        # The last '@' separates userinfo from the host, so a password may
        # itself contain '@'.
        userinfo, have_info, _ = self.netloc.rpartition("@")
        if not have_info:
            return None, None
        username, have_password, password = userinfo.partition(":")
        if not have_password:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` as strings; port is None when empty."""
        _, _, hostinfo = self.netloc.rpartition("@")
        _, have_open_br, bracketed = hostinfo.partition("[")
        if have_open_br:
            # Bracketed IP literal such as "[::1]:80": the colons inside the
            # brackets belong to the address, not to the port separator.
            hostname, _, port = bracketed.partition("]")
            _, _, port = port.partition(":")
        else:
            hostname, _, port = hostinfo.partition(":")
        if not port:
            port = None
        return hostname, port

    @property
    def username(self):
        """User name from the netloc, or None if there is no userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password from the netloc, or None if none was given."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name (without brackets), or None if empty."""
        return self._hostinfo[0].lower() or None

    @property
    def port(self):
        """Port as an int, or None when missing or empty.

        Raises ValueError if a port is present but is not a decimal number.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000107
108from collections import namedtuple
109
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field result of urlsplit(), with the ResultMixin accessors."""

    # No per-instance __dict__: instances stay plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunsplit()."""
        return urlunsplit(self)
116
117
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field result of urlparse(), with the ResultMixin accessors."""

    # No per-instance __dict__: instances stay plain tuples.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string via urlunparse()."""
        return urlunparse(self)
124
125
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult 6-tuple
    (scheme, netloc, path, params, query, fragment).  The components are
    not broken up further (netloc stays a single string) and % escapes
    are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get the ";params" part split off.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
139
def _splitparams(url):
    """Split *url* at the first ';' found in its last path segment.

    Returns (path, params).  When the URL contains a '/' but its last
    segment has no ';', the URL is returned unchanged with empty params.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
148
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, where the netloc begins at *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none of them occurs.
    """
    positions = [url.find(ch, start) for ch in '/?#']
    found = [pos for pos in positions if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
156
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult 5-tuple (scheme, netloc, path, query, fragment).
    The components are not broken up further (netloc stays a single
    string) and % escapes are not expanded.  Results are memoised in the
    module-level parse cache.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    # Fast path for the common literal "http" prefix: the per-scheme
    # uses_fragment / uses_query lookups are skipped for it.
    is_plain_http = colon > 0 and url[:colon] == 'http'
    if is_plain_http:
        scheme = url[:colon].lower()
        url = url[colon + 1:]
    elif colon > 0 and all(ch in scheme_chars for ch in url[:colon]):
        # Everything before the first ':' is a syntactically valid scheme.
        scheme = url[:colon].lower()
        url = url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if (allow_fragments and '#' in url
            and (is_plain_http or scheme in uses_fragment)):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_plain_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
199
def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is a six-item iterable
    (scheme, netloc, path, params, query, fragment).  The result may be
    slightly different from, but equivalent to, the URL that was parsed if
    that URL had redundant delimiters, e.g. a ? with an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    # Re-attach the params to the path, then let urlunsplit do the rest.
    path_and_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_and_params, query, fragment))
209
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.

    *components* can be any five-item iterable
    (scheme, netloc, path, query, fragment).  The result may be slightly
    different from, but equivalent to, the URL that was parsed if that URL
    had unnecessary delimiters (for example, a ? with an empty query).
    """
    scheme, netloc, url, query, fragment = components
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        # A non-empty path must start with '/' once it follows a netloc.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
227
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *allow_fragments* is passed through to urlparse() for both URLs.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is used as the default scheme for the second URL.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # references, means url is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url carries its own network location: nothing is taken from
            # the base except the (already applied) default scheme.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, combined with the base netloc.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, and the base params/query when
        # url does not supply its own.
        path = bpath
        if not params:
            params = bparams
        else:
            # url has params of its own: drop the last character of the
            # inherited base path and return without inheriting the query.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the relative path, then collapse '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' becomes an empty segment (i.e. a trailing '/').
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove one "<name>/.." pair from the interior of the list
    # until a full scan finds none.  The first and last segments are not
    # examined as the '..' element here (i runs from 1 to n-1).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in final position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
284
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a (defragmented_url, fragment) tuple.  When *url* contains no
    '#', it is returned untouched together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
298
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object; a str is first encoded
    as UTF-8.  Only a '%' followed by exactly two hexadecimal digits is
    treated as an escape; any other '%' is copied through unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hex_digits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # int() alone is too lenient: it accepts a single digit, a sign or
        # surrounding blanks, so the two digits are validated explicitly.
        if len(code) == 2 and code[0] in hex_digits and code[1] in hex_digits:
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000314
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Only a '%' followed by exactly two hexadecimal digits is an escape;
    any other '%' is copied through unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hex_digits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent %xx escapes.  Decoding is delayed
    # until the run ends because one character may span several escapes.
    pending = bytearray()
    for item in pieces[1:]:
        code = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips whitespace,
        # which would make '%  ' vanish from the output.
        if len(code) == 2 and code[0] in hex_digits and code[1] in hex_digits:
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the current run of escapes: flush it.
            if pending:
                out.append(pending.decode(encoding, errors))
                pending = bytearray()
            out.append(rest)
    if pending:
        # Flush the final run of escapes.
        out.append(pending.decode(encoding, errors))
    return ''.join(out)
355
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping names to lists of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value drops them as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Values for a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
381
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value keeps them as blank strings; the default false
        value drops them as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  '+' is turned into a space and
    % escapes are expanded in both names and values.
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign only survives when
                # blank values are kept.
                if not keep_blank_values:
                    continue
                parts.append('')
            raw_name, raw_value = parts
            if len(raw_value) or keep_blank_values:
                name = unquote(raw_name.replace('+', ' '))
                value = unquote(raw_value.replace('+', ' '))
                result.append((name, value))
    return result
421
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first turned into spaces, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    with_spaces = string.replace('+', ' ')
    return unquote(with_spaces, encoding, errors)
430
# Byte values that are never percent-encoded, whatever 'safe' says.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Quoter instances keyed by their 'safe' bytes.
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    A byte maps to its own character when it is below 128 and in the safe
    set (the always-safe set plus the set given to the constructor), and
    to its '%XX' escape otherwise.
    """
    # Computed entries are stored in the mapping itself, so only the first
    # lookup of a byte value reaches __missing__.

    def __init__(self, safe):
        """safe: bytes object."""
        extra = (byte for byte in safe if byte < 128)
        self.safe = _ALWAYS_SAFE.union(extra)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
458
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, quote() is meant for the path section of a URL, so '/'
    is left alone: in a path the existing slashes are already being used
    as reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either one
    together with a bytes string raises TypeError.

    For a str, encoding and errors say how to turn it into bytes, as
    accepted by str.encode.  They default to 'utf-8' and 'strict'
    (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    else:
        codec = 'utf-8' if encoding is None else encoding
        policy = 'strict' if errors is None else errors
        string = string.encode(codec, policy)
    return quote_from_bytes(string, safe)
500
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be a str or bytes.  When it is one of those and holds no
    # space, plain quote() already gives the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)
    # Let spaces through unescaped, then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000517
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    Raises TypeError when bs is neither bytes nor bytearray.

    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter is kept per distinct 'safe' value.
    if cachekey in _safe_quoters:
        quoter = _safe_quoters[cachekey]
    else:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join([quoter[byte] for byte in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000535
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError when query is neither a mapping nor a sequence whose
    first element is a tuple.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so they have to
        # be rejected explicitly.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple check.  Empty sequences of any type are let through, as
            # empty dicts always were.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    encoded = []
    for key, value in query:
        key = quote_plus(str(key))
        if not doseq or isinstance(value, str):
            if not doseq:
                value = str(value)
            encoded.append(key + '=' + quote_plus(value))
            continue
        try:
            # Having a length is taken as the test for sequence-ness.
            len(value)
        except TypeError:
            # not a sequence
            encoded.append(key + '=' + quote_plus(str(value)))
        else:
            # one parameter per element of the sequence
            for element in value:
                encoded.append(key + '=' + quote_plus(str(element)))
    return '&'.join(encoded)
591
592# Utilities to parse URLs (most of these return None for missing parts):
593# unwrap('<URL:type://host/path>') --> 'type://host/path'
594# splittype('type:opaquestring') --> 'type', 'opaquestring'
595# splithost('//host[:port]/path') --> 'host[:port]', '/path'
596# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
597# splitpasswd('user:passwd') -> 'user', 'passwd'
598# splitport('host:port') --> 'host', 'port'
599# splitquery('/path?query') --> '/path', 'query'
600# splittag('/path#tag') --> '/path', 'tag'
601# splitattr('/path;attr1=value1;attr2=value2;...') ->
602# '/path', ['attr1=value1', 'attr2=value2', ...]
603# splitvalue('attr=value') --> 'attr', 'value'
604# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
606
def to_bytes(url):
    """to_bytes("URL") --> 'URL'.

    Despite the name, a str comes back as a str: it is only checked to be
    pure ASCII.  Anything that is not a str is returned unchanged.
    Raises UnicodeError if a str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
619
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped; an enclosing
    '<...>' pair and a leading 'URL:' marker are then removed if present.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
627
# Compiled on first use by splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is lower-cased.  Returns (None, url) when url does not start
    with a run of characters other than '/' and ':' followed by ':'.
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    remainder = url[len(scheme) + 1:]
    return scheme.lower(), remainder
641
# Compiled on first use by splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host ends at the first '/', '?' or '#'.  Returns (None, url) when
    url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        # '#' terminates the host as well, so a fragment is never taken for
        # part of the host.  DOTALL lets the remainder contain newlines
        # instead of making the whole match fail.
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url
653
# Compiled on first use by splituser().
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts have their % escapes expanded.  The split happens at the
    last '@'.  Returns (None, host) when there is no '@'.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Materialise the result: a bare map() object is a one-shot iterator
        # that cannot be indexed, unlike the tuple returned below.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
665
# Compiled on first use by splitpasswd().
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when there
    is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
677
# splitport('host:port') --> 'host', 'port'
# Compiled on first use by splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by one or more digits.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
690
# Compiled on first use by splitnport().
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return the given default port if no ':' is found; it defaults to -1.
    Return the numerical port if a valid number is found after the last ':'.
    Return None as the port if there is a ':' but no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.groups()
    try:
        # An empty port counts as invalid, just like a non-numeric one.
        nport = int(port) if port else None
    except ValueError:
        nport = None
    return host, nport
712
# Compiled on first use by splitquery().
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' is not a valid escape in an ordinary str literal.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
724
# Compiled on first use by splittag().
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
736
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
742
# Compiled on first use by splitvalue().
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
754
# Built-in fixture for test(): the first non-blank line is the base URL;
# every following line is "<relative-url> = <URL:expected-absolute-url>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""

def test():
    """Parse and join every URL of a test file, printing the results.

    Input comes from the file named by the first command-line argument
    ('-' means standard input) or, without arguments, from test_input.
    The first URL read becomes the base for all later joins.  Lines of the
    form "url = <URL:expected>" are checked against the join result and a
    mismatch is reported on standard output.
    """
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Release the file (or in-memory buffer) opened above, but leave
        # the process-wide standard input alone.
        if fp is not sys.stdin:
            fp.close()

if __name__ == '__main__':
    test()