blob: 5fd038e1698c4569b5f5f4f9b5e555e81c93716c [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
3See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4UC Irvine, June 1995.
5"""
6
Facundo Batista2ac5de22008-07-07 18:24:11 +00007import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +00008import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +00009
# Names exported by ``from <module> import *``.  ``urlencode`` is a public
# function defined in this module and is therefore listed with the others.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
    "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
    "urlencode",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000014
# Scheme classification tables.  The empty string stands for "no scheme
# given" and therefore marks the default behaviour.  The tables are plain
# lists so that callers may register additional schemes at run time.
uses_relative = ('ftp http gopher nntp imap wais file https shttp mms '
                 'prospero rtsp rtspu').split() + ['', 'sftp']
uses_netloc = ('ftp http gopher nntp telnet imap wais file mms https shttp '
               'snews prospero rtsp rtspu rsync').split() + [
                   '', 'svn', 'svn+ssh', 'sftp']
non_hierarchical = ('gopher hdl mailto news telnet wais imap snews '
                    'sip sips').split()
uses_params = ('ftp hdl prospero http imap https shttp rtsp rtspu '
               'sip sips mms').split() + ['', 'sftp']
uses_query = ('http wais imap https shttp mms gopher rtsp rtspu '
              'sip sips').split() + ['']
uses_fragment = ('ftp hdl http gopher news nntp wais https shttp snews '
                 'file prospero').split() + ['']

# Every character that may appear in a scheme name.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz' +
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
                '0123456789' +
                '+-.')
39
# Upper bound on the number of memoised urlsplit() results; once reached,
# urlsplit() empties the cache before storing a new entry.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Empty the module-level parse cache in place."""
    _parse_cache.clear()
46
47
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    The host class must provide a ``netloc`` string attribute of the form
    ``[user[:password]@]host[:port]``.  Every property returns None when
    the corresponding piece is absent.
    """

    def _userinfo(self):
        """Return the text before the last '@' of netloc, or None."""
        userinfo, at, _ = self.netloc.rpartition("@")
        return userinfo if at else None

    def _hostport(self):
        """Return the part of netloc that follows the last '@'."""
        return self.netloc.rpartition("@")[2]

    @property
    def username(self):
        """User name from the userinfo part, or None without an '@'."""
        userinfo = self._userinfo()
        if userinfo is None:
            return None
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """Password from the userinfo part, or None if there is none."""
        userinfo = self._userinfo()
        if userinfo is None:
            return None
        _, colon, secret = userinfo.partition(":")
        return secret if colon else None

    @property
    def hostname(self):
        """Lower-cased host name, or None when it is empty.

        A bracketed IP literal such as ``[::1]:80`` yields ``'::1'``; the
        colons inside the brackets are not mistaken for a port separator.
        """
        hostport = self._hostport()
        if hostport.startswith("[") and "]" in hostport:
            host = hostport[1:].partition("]")[0]
        else:
            host = hostport.partition(":")[0]
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when no port (or an empty one) is given.

        Raises ValueError when the port text is not a decimal number.
        """
        hostport = self._hostport()
        if hostport.startswith("[") and "]" in hostport:
            # Only look for the port after the closing bracket of an
            # IP literal.
            hostport = hostport.partition("]")[2]
        _, colon, port = hostport.partition(":")
        if not colon or not port:
            # "host" and "host:" both mean "no explicit port".
            return None
        return int(port, 10)
88
89from collections import namedtuple
90
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """5-tuple of URL components with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunsplit()."""
        return urlunsplit(self)
97
98
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """6-tuple of URL components with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string via urlunparse()."""
        return urlunparse(self)
105
106
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult (scheme, netloc, path, params, query, fragment).
    The components are not broken up any further (netloc stays a single
    string) and % escapes are not expanded.  Params are only separated
    from the path for schemes listed in uses_params.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
120
121def _splitparams(url):
122 if '/' in url:
123 i = url.find(';', url.rfind('/'))
124 if i < 0:
125 return url, ''
126 else:
127 i = url.find(';')
128 return url[:i], url[i+1:]
129
130def _splitnetloc(url, start=0):
131 delim = len(url) # position of end of domain part of url, default is end
132 for c in '/?#': # look for delimiters; the order is NOT important
133 wdelim = url.find(c, start) # find first of this delim
134 if wdelim >= 0: # if found
135 delim = min(delim, wdelim) # use earliest delim position
136 return url[start:delim], url[delim:] # return (domain, rest)
137
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult (scheme, netloc, path, query, fragment).  The
    components are not broken up any further (netloc stays a single
    string) and % escapes are not expanded.  Results are memoised in the
    module-level parse cache.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: start over once the cache is full.
        clear_cache()

    netloc = query = fragment = ''
    # A literal 'http' prefix is the common case; it bypasses both the
    # character scan and the scheme classification tables below.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        if is_http or all(ch in scheme_chars for ch in prefix):
            scheme = prefix.lower()
            url = url[colon + 1:]

    if (is_http or scheme in uses_netloc) and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if (allow_fragments and (is_http or scheme in uses_fragment)
            and '#' in url):
        url, _, fragment = url.partition('#')
    if (is_http or scheme in uses_query) and '?' in url:
        url, _, query = url.partition('?')

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
180
def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is a 6-item iterable (scheme, netloc, path, params,
    query, fragment).  The result may differ slightly from, but is
    equivalent to, the URL originally parsed if that URL had redundant
    delimiters, e.g. a '?' with an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    # Non-empty params are folded back into the path before delegating.
    path_with_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
190
def urlunsplit(components):
    """Combine (scheme, netloc, path, query, fragment) into a URL string.

    Empty components are omitted together with their delimiters.  A
    '//' authority prefix is emitted when netloc is non-empty, or when
    the scheme is listed in uses_netloc and the path does not already
    start with '//'.
    """
    scheme, netloc, path, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and path[:2] != '//'):
        if path and path[:1] != '/':
            # A relative path must be anchored before the authority is
            # prepended.
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
203
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Returns *url* unchanged when *base* is empty, when the schemes
    differ, or when the scheme is not listed in uses_relative; returns
    *base* when *url* is empty.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    def rebuild():
        # Reads the enclosing variables at call time, i.e. after any of
        # the reassignments below.
        return urlunparse((scheme, netloc, path, params, query, fragment))

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return rebuild()
        netloc = bnetloc
    if path[:1] == '/':
        return rebuild()
    if not path:
        # Empty relative path: inherit the base path, and the base
        # params/query where the reference supplies none.
        path = bpath
        if params:
            path = path[:-1]
            return rebuild()
        params = bparams
        if not query:
            query = bquery
        return rebuild()

    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly collapse the first interior "<name>/.." pair until no
    # such pair is left.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    path = '/'.join(segments)
    return rebuild()
260
def urldefrag(url):
    """Remove any existing fragment from *url*.

    Returns a tuple (defragmented URL, fragment).  When the URL contains
    no '#', it is returned unchanged and the fragment is the empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
274
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  A str is encoded as
    UTF-8 first; this only matters if it contains unescaped non-ASCII
    characters, which URIs should not.

    A '%' is treated as an escape only when it is followed by exactly two
    hexadecimal digits; any other '%' is copied to the output unchanged.
    """
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    chunks = string.split(b'%')
    out = [chunks[0]]
    for item in chunks[1:]:
        pair = item[:2]
        # Validate the digits explicitly: int() would also accept a lone
        # digit, surrounding whitespace or a sign ('%1', '% 1', '%+1').
        if (len(pair) == 2 and pair[0] in hexdigits
                and pair[1] in hexdigits):
            out.append(bytes([int(pair.decode('ascii'), 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000290
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a
    placeholder character.

    A '%' is treated as an escape only when it is followed by exactly two
    hexadecimal digits; any other '%' is copied to the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent escapes.  Decoding is delayed
    # until the run ends because one character may span several escapes.
    pending = bytearray()
    for item in pieces[1:]:
        pair = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips
        # whitespace, so it would treat '%  ' as an empty escape.
        if (len(pair) == 2 and pair[0] in hexdigits
                and pair[1] in hexdigits):
            pending.append(int(pair, 16))
            tail = item[2:]
        else:
            tail = '%' + item
        if tail:
            # Literal text ends the run: flush it, then emit the text.
            out.append(bytes(pending).decode(encoding, errors) + tail)
            pending = bytearray()
    if pending:
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
331
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to the list of its values, in the
    order in which they appear in the query.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
357
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs.  Fields are separated by '&'
    or ';'; '+' is turned into a space before %-unquoting.
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not (field or strict_parsing):
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign counts as a blank
                # value, which is kept only on request.
                if not keep_blank_values:
                    continue
            if raw_value or keep_blank_values:
                name = unquote(raw_name.replace('+', ' '))
                value = unquote(raw_value.replace('+', ' '))
                result.append((name, value))
    return result
397
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required
    for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The substitution happens before unquoting, so an escaped plus sign
    # ('%2B') survives as a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
406
# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# Cache of Quoter instances, keyed by their 'safe' bytes.
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128,
    and in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).

    def __init__(self, safe):
        """safe: bytes object."""
        extra = frozenset(code for code in safe if code < 128)
        self.safe = _ALWAYS_SAFE | extra

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter {!r}>".format(dict(self))

    def __missing__(self, b):
        # Handle a cache miss.  Store quoted string in cache and return.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
434
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either one
    together with a non-str string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    data = string.encode('utf-8' if encoding is None else encoding,
                         'strict' if errors is None else errors)
    return quote_from_bytes(data, safe)
476
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  It also does not have safe default to '/'.

    string and safe may be either str or bytes objects.  encoding and
    errors are passed on to quote() whether or not the string contains
    a space.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # Temporarily treat the space as safe so that quote() leaves it alone;
    # it is turned into '+' afterwards.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000493
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    Raises TypeError when bs is neither bytes nor bytearray.

    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII
        # chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter is kept per distinct 'safe' set.
    quoter = _safe_quoters.get(cachekey)
    if quoter is None:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join(map(quoter.__getitem__, bs))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000511
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError when query is neither a mapping nor a sequence whose
    first element is a tuple.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items do not work with len(), and non-empty
            # strings fail the tuple check.  Zero-length sequences of all
            # types get through, which keeps empty containers working.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    expand = bool(doseq)
    pairs = []
    for k, v in query:
        key = quote_plus(str(k))
        if not expand:
            pairs.append(key + '=' + quote_plus(str(v)))
        elif isinstance(v, str):
            pairs.append(key + '=' + quote_plus(v))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_plus(str(v)))
            else:
                # one parameter per element of the sequence
                for elt in v:
                    pairs.append(key + '=' + quote_plus(str(elt)))
    return '&'.join(pairs)
567
568# Utilities to parse URLs (most of these return None for missing parts):
569# unwrap('<URL:type://host/path>') --> 'type://host/path'
570# splittype('type:opaquestring') --> 'type', 'opaquestring'
571# splithost('//host[:port]/path') --> 'host[:port]', '/path'
572# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
573# splitpasswd('user:passwd') -> 'user', 'passwd'
574# splitport('host:port') --> 'host', 'port'
575# splitquery('/path?query') --> '/path', 'query'
576# splittag('/path#tag') --> '/path', 'tag'
577# splitattr('/path;attr1=value1;attr2=value2;...') ->
578# '/path', ['attr1=value1', 'attr2=value2', ...]
579# splitvalue('attr=value') --> 'attr', 'value'
580# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
582
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is checked for being pure ASCII and returned as a str;
    UnicodeError is raised when it is not.  Any other object is returned
    unchanged.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
595
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are stripped, in that order.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
603
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case; it is None when the URL does not
    start with a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match starts at index 0 and ends just past the ':'.
    return found.group(1).lower(), url[found.end():]
617
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host is None when the URL does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
629
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@' and both parts are %-unquoted.
    Without an '@' the result is (None, host).  A 2-tuple is returned in
    both cases.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # map() is lazy; build a real tuple so that the result can be
        # indexed and compared like the no-match result below.
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
641
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; the password is None when there
    is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
653
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  It is None, and the host
    is returned unchanged, unless the text ends in ':' followed by one or
    more digits.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
666
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after the last ':'.
    Return None as the port if there is a ':' but no valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port_text = found.group(1, 2)
    if port_text:
        try:
            nport = int(port_text)
        except ValueError:
            nport = None
    else:
        # A bare trailing ':' carries no digits at all.
        nport = None
    return host, nport
688
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; the query is None when the URL
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use.
        import re
        # Raw string: '\?' in a normal string literal is an invalid
        # escape sequence.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
700
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; the tag is None when the URL
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
712
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when the URL contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
718
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; the value is None when there is
    no '='.
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
730
# Input for test().  The first non-blank line is the base URL; every
# following line reads "<relative URL> = <expected joined URL>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

def test():
    """Print the parse and join results for a list of URLs.

    Lines are read from the file named by the first command-line argument
    ('-' means standard input), or from test_input when no argument is
    given.  The first URL becomes the base for all later joins.  A line of
    the form "url = expected" is flagged when the join result differs.
    """
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Release the file or buffer opened above, but never close the
        # process-wide standard input.
        if fp is not sys.stdin:
            fp.close()

if __name__ == '__main__':
    test()