blob: 1affc6930d9298f5bf0b32fea66f922998a199d6 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
3See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4UC Irvine, June 1995.
5"""
6
Facundo Batista2ac5de22008-07-07 18:24:11 +00007import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +00008import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +00009
# Public API of this module.  urlencode() is a documented public function
# defined below, so it is exported alongside the other entry points.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "urlencode"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000014
# Scheme classification tables.  The empty string '' stands for "no scheme
# given", i.e. the behaviour applied by default.  These stay plain lists so
# that callers can keep extending them in place.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
39
# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied wholesale before the next insert.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Empty the urlsplit() result cache in place."""
    _parse_cache.clear()
46
47
class ResultMixin(object):
    """Shared read-only accessors for the parsed result objects.

    Every property is derived from ``self.netloc``, which the class mixing
    this in must provide as a string of the form
    ``[user[:password]@]host[:port]``.
    """

    @property
    def username(self):
        """User name part of the netloc, or None if there is no '@'."""
        netloc = self.netloc
        if "@" not in netloc:
            return None
        # The *last* '@' separates userinfo from the host.
        userinfo = netloc.rsplit("@", 1)[0]
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password part of the netloc, or None if none was given."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when the netloc has no host.

        For a bracketed IPv6 literal the brackets are stripped.  Raises
        ValueError if only one of '[' / ']' is present.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif '[' in netloc or ']' in netloc:
            raise ValueError("Invalid IPv6 hostname")
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given.

        An empty port ("host:") is treated as "no port" instead of failing
        inside int().  A non-numeric port still raises ValueError.
        """
        # Drop userinfo, then anything up to the closing ']' of an IPv6
        # literal, so the only ':' left is the port separator.
        hostport = self.netloc.split('@')[-1].split(']')[-1]
        if ':' not in hostport:
            return None
        port = hostport.split(':')[1]
        if not port:
            return None
        return int(port, 10)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000092
93from collections import namedtuple
94
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    # 5-tuple result type built by urlsplit(); the netloc accessors come
    # from ResultMixin.

    __slots__ = ()

    def geturl(self):
        # Reassemble the components into a URL string.
        return urlunsplit(self)
101
102
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    # 6-tuple result type built by urlparse(); the netloc accessors come
    # from ResultMixin.

    __slots__ = ()

    def geturl(self):
        # Reassemble the components into a URL string.
        return urlunparse(self)
109
110
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    params = ''
    # Only schemes listed in uses_params get their ';params' split off.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return ParseResult(scheme, netloc, url, params, query, fragment)
124
def _splitparams(url):
    """Split ';params' off the last path segment.

    Returns (path, params).  When the path contains a '/', only a ';'
    found at or after the last '/' counts as the parameter separator.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        cut = url.find(';', last_slash)
        if cut < 0:
            return url, ''
    else:
        cut = url.find(';')
    return url[:cut], url[cut + 1:]
133
def _splitnetloc(url, start=0):
    """Split the network location off *url*.

    The netloc runs from index *start* up to the first '/', '?' or '#'
    (or to the end of the string).  Returns (netloc, rest).

    Raises ValueError("Invalid IPv6 URL") when the netloc has an unmatched
    '[' or ']'.  Only the netloc is inspected: square brackets appearing
    later in the path, query or fragment are not an IPv6 error.
    """
    delim = len(url)    # end of the netloc; defaults to end of string
    for c in '/?#':     # the order of the delimiters is NOT important
        wdelim = url.find(c, start)
        if wdelim >= 0:
            delim = min(delim, wdelim)  # keep the earliest delimiter
    netloc = url[start:delim]
    if ('[' in netloc) != (']' in netloc):
        raise ValueError("Invalid IPv6 URL")
    return netloc, url[delim:]
145
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # values of different types do not share a cache slot.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':  # optimize the common case
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = result
            return result
        # Text before the first ':' is a scheme only if every character
        # is a legal scheme character.
        if all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
188
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, path, params, query, fragment = components
    if params:
        # Fold the params back into the path so urlunsplit() can finish.
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
198
def urlunsplit(components):
    """Combine a (scheme, netloc, path, query, fragment) 5-tuple into a URL.

    Empty components are omitted together with their delimiter.
    """
    scheme, netloc, result, query, fragment = components
    wants_netloc = netloc or (scheme and scheme in uses_netloc
                              and result[:2] != '//')
    if wants_netloc:
        # A non-empty path following an authority must start with '/'.
        if result and result[:1] != '/':
            result = '/' + result
        result = '//' + (netloc or '') + result
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
211
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that has no relative form, means *url*
    # already stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty relative path: inherit the base path, then params and
        # query as far as the reference does not supply its own.
        path = bpath
        if params:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Merge: base path minus its last segment, followed by the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly collapse the first interior "<name>/.." pair until none
    # is left; the first and last segments are never the '..' examined.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
268
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, frag = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, frag
282
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  A '%' that is not
    followed by exactly two hexadecimal digits is left untouched.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Validate the digits explicitly: int(..., 16) on its own also
        # accepts one-digit, signed and whitespace-padded text.
        if (len(code) == 2
                and code[0] in hexdigits and code[1] in hexdigits):
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000298
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    untouched.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Byte values of the current run of adjacent %xx escapes.  Decoding is
    # delayed until the run ends because several escapes may together form
    # one multi-byte character.
    pending = []
    for item in pieces[1:]:
        code = item[:2]
        # Validate the digits explicitly; bytes.fromhex() skips whitespace
        # and would silently swallow text such as '%  '.
        if (len(code) == 2
                and code[0] in hexdigits and code[1] in hexdigits):
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the current run of escapes: flush it.
            if pending:
                out.append(bytes(pending).decode(encoding, errors))
                pending = []
            out.append(rest)
    if pending:
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
339
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

        Arguments:

        qs: URL-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            URL encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dict mapping each name to the list of its values, in
        order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
365
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were  not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    result = []
    # Fields may be separated by either '&' or ';'.
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                parts.append('')
            if len(parts[1]) or keep_blank_values:
                name = unquote(parts[0].replace('+', ' '))
                value = unquote(parts[1].replace('+', ' '))
                result.append((name, value))
    return result
405
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are turned into spaces *before* the %xx escapes are
    # expanded.
    return unquote(string.replace('+', ' '), encoding, errors)
414
# Byte values that are never percent-encoded, whatever 'safe' set is given.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Maps a 'safe' bytes value to the Quoter built for it.
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).

    def __init__(self, safe):
        """safe: bytes object."""
        # Only values below 128 from *safe* are honoured.
        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        quoted = chr(b) if b in self.safe else '%%%02X' % b
        self[b] = quoted
        return quoted
442
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be specified when string is a str; passing either
    together with a non-str string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes-like: there is nothing to encode, so the text
        # encoding arguments make no sense here.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    codec = 'utf-8' if encoding is None else encoding
    policy = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, policy), safe)
484
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    no_space_str = isinstance(string, str) and ' ' not in string
    no_space_bytes = isinstance(string, bytes) and b' ' not in string
    if no_space_str or no_space_bytes:
        return quote(string, safe, encoding, errors)
    # Keep spaces unescaped during quoting (the space must have the same
    # type as 'safe'), then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000501
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter is kept per distinct 'safe' set.
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join([quoter[char] for char in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000519
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    bytes keys and values are quoted as-is; they are neither converted
    with str() nor, when doseq is true, iterated byte by byte.

    Raises TypeError if query is neither a mapping nor a sequence of
    tuples (for example a non-empty string).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def encode_scalar(value):
        # bytes go to quote_plus() untouched; str(b'x') would give "b'x'".
        if isinstance(value, bytes):
            return quote_plus(value)
        return quote_plus(str(value))

    pairs = []
    for k, v in query:
        key = encode_scalar(k)
        if not doseq:
            pairs.append(key + '=' + encode_scalar(v))
        elif isinstance(v, (str, bytes)):
            # Strings are sequences too, but are encoded as one value.
            pairs.append(key + '=' + quote_plus(v))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + encode_scalar(v))
            else:
                # loop over the sequence
                for elt in v:
                    pairs.append(key + '=' + encode_scalar(elt))
    return '&'.join(pairs)
575
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
590
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str once it has been
    checked to be pure ASCII; any other object is returned unchanged.
    Raises UnicodeError if a str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
603
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    text = str(url).strip()
    # Remove one pair of enclosing angle brackets, then a 'URL:' prefix.
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
611
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    kind = found.group(1)
    # The scheme is lower-cased; the remainder is returned verbatim.
    return kind.lower(), url[len(kind) + 1:]
625
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
637
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote().  Without an '@' the result
    is (None, host).  A 2-tuple is returned in either case.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # Build a real tuple: a bare map() is a one-shot iterator that
        # cannot be indexed, measured or unpacked twice.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
649
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept in the module global.  re.S lets
        # '.' match newlines too.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
661
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        # No trailing ':<digits>': the whole argument is the host.
        return host, None
    return found.group(1, 2)
674
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    try:
        # An empty port is reported as None, like an unparsable one.
        nport = int(port) if port else None
    except ValueError:
        nport = None
    return host, nport
696
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept in the module global.
        import re
        # Raw string: '\?' is not a valid escape in a normal str literal.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
708
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
720
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # Everything before the first ';' is the path; the rest are attributes.
    path, *attrs = url.split(';')
    return path, attrs
726
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept in the module global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
738
# Self-test data.  The first non-blank line is the base URL; every further
# line reads "<relative-url> = <expected result of joining it to the base>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""


def _run_test_lines(lines):
    """Parse and join every URL in *lines*, printing the results.

    The first URL seen becomes the base for all later joins.  A line of
    the form "url = expected" is flagged when the join result differs.
    """
    base = ''
    for line in lines:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        joined = urljoin(base, url)
        if not base:
            base = joined
        wrapped = '<URL:%s>' % joined
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED', words[2], '!!!!!!!!!!')


def test():
    """Run the self-test.

    Test lines come from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.
    """
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            _run_test_lines(sys.stdin)
        else:
            # Close the file even if parsing or joining raises.
            with open(fn) as fp:
                _run_test_lines(fp)
    else:
        from io import StringIO
        _run_test_lines(StringIO(test_input))


if __name__ == '__main__':
    test()