Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: 00f0e5bec49ee18beb5e0d094ec5d787af21ec09 [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
				8	RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
				9	B. Carpenter and L. Masinter, December 1999.
				10
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	11	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				13
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	14	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	15
				16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				17	1995.
				18
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	19	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	20	McCahill, December 1994
				21
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	22	RFC 3986 is considered the current standard and any future changes to
				23	the urlparse module should conform with it. The urlparse module is
				24	currently not entirely compliant with this RFC due to de facto
				25	scenarios for parsing, and for backward compatibility purposes, some
				26	parsing quirks from older RFCs are retained. The test cases in
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	27	test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	28	"""
				29
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	30	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	31	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	32
# Public API of this module.  "urlencode" is defined below and is part of the
# public interface, so it is exported along with the other helpers.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default).
# These are plain lists (not sets) so that callers can keep extending them.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache and clears the caches once
# the cache holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
				65
				66	def clear_cache():
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	67	"""Clear the parse cache and the quoters cache."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	68	_parse_cache.clear()
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	69	_safe_quoters.clear()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	70
				71
				72	class ResultMixin(object):
				73	"""Shared methods for the parsed result objects."""
				74
				75	@property
				76	def username(self):
				77	netloc = self.netloc
				78	if "@" in netloc:
				79	userinfo = netloc.rsplit("@", 1)[0]
				80	if ":" in userinfo:
				81	userinfo = userinfo.split(":", 1)[0]
				82	return userinfo
				83	return None
				84
				85	@property
				86	def password(self):
				87	netloc = self.netloc
				88	if "@" in netloc:
				89	userinfo = netloc.rsplit("@", 1)[0]
				90	if ":" in userinfo:
				91	return userinfo.split(":", 1)[1]
				92	return None
				93
				94	@property
				95	def hostname(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	96	netloc = self.netloc.split('@')[-1]
				97	if '[' in netloc and ']' in netloc:
				98	return netloc.split(']')[0][1:].lower()
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	99	elif ':' in netloc:
				100	return netloc.split(':')[0].lower()
				101	elif netloc == '':
				102	return None
				103	else:
				104	return netloc.lower()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	105
				106	@property
				107	def port(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	108	netloc = self.netloc.split('@')[-1].split(']')[-1]
				109	if ':' in netloc:
				110	port = netloc.split(':')[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	111	return int(port, 10)
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	112	else:
				113	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	114
				115	from collections import namedtuple
				116
				117	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				118
				119	__slots__ = ()
				120
				121	def geturl(self):
				122	return urlunsplit(self)
				123
				124
				125	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				126
				127	__slots__ = ()
				128
				129	def geturl(self):
				130	return urlunparse(self)
				131
				132
				133	def urlparse(url, scheme='', allow_fragments=True):
				134	"""Parse a URL into 6 components:
				135	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				136	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				137	Note that we don't break the components up in smaller bits
				138	(e.g. netloc is a single string) and we don't expand % escapes."""
				139	tuple = urlsplit(url, scheme, allow_fragments)
				140	scheme, netloc, url, query, fragment = tuple
				141	if scheme in uses_params and ';' in url:
				142	url, params = _splitparams(url)
				143	else:
				144	params = ''
				145	return ParseResult(scheme, netloc, url, params, query, fragment)
				146
				147	def _splitparams(url):
				148	if '/' in url:
				149	i = url.find(';', url.rfind('/'))
				150	if i < 0:
				151	return url, ''
				152	else:
				153	i = url.find(';')
				154	return url[:i], url[i+1:]
				155
				156	def _splitnetloc(url, start=0):
				157	delim = len(url) # position of end of domain part of url, default is end
				158	for c in '/?#': # look for delimiters; the order is NOT important
				159	wdelim = url.find(c, start) # find first of this delim
				160	if wdelim >= 0: # if found
				161	delim = min(delim, wdelim) # use earliest delim position
				162	return url[start:delim], url[delim:] # return (domain, rest)
				163
				164	def urlsplit(url, scheme='', allow_fragments=True):
				165	"""Parse a URL into 5 components:
				166	<scheme>://<netloc>/<path>?<query>#<fragment>
				167	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				168	Note that we don't break the components up in smaller bits
				169	(e.g. netloc is a single string) and we don't expand % escapes."""
				170	allow_fragments = bool(allow_fragments)
				171	key = url, scheme, allow_fragments, type(url), type(scheme)
				172	cached = _parse_cache.get(key, None)
				173	if cached:
				174	return cached
				175	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				176	clear_cache()
				177	netloc = query = fragment = ''
				178	i = url.find(':')
				179	if i > 0:
				180	if url[:i] == 'http': # optimize the common case
				181	scheme = url[:i].lower()
				182	url = url[i+1:]
				183	if url[:2] == '//':
				184	netloc, url = _splitnetloc(url, 2)
Senthil Kumaran	7a1e09f	2010-04-22 12:19:46 +0000	[diff] [blame]	185	if (('[' in netloc and ']' not in netloc) or
				186	(']' in netloc and '[' not in netloc)):
				187	raise ValueError("Invalid IPv6 URL")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	188	if allow_fragments and '#' in url:
				189	url, fragment = url.split('#', 1)
				190	if '?' in url:
				191	url, query = url.split('?', 1)
				192	v = SplitResult(scheme, netloc, url, query, fragment)
				193	_parse_cache[key] = v
				194	return v
Senthil Kumaran	84c7d9f	2010-08-04 04:50:44 +0000	[diff] [blame]	195	if url.endswith(':') or not url[i+1].isdigit():
				196	for c in url[:i]:
				197	if c not in scheme_chars:
				198	break
				199	else:
				200	scheme, url = url[:i].lower(), url[i+1:]
Senthil Kumaran	6be85c5	2010-02-19 07:42:50 +0000	[diff] [blame]	201	if url[:2] == '//':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	202	netloc, url = _splitnetloc(url, 2)
Senthil Kumaran	7a1e09f	2010-04-22 12:19:46 +0000	[diff] [blame]	203	if (('[' in netloc and ']' not in netloc) or
				204	(']' in netloc and '[' not in netloc)):
				205	raise ValueError("Invalid IPv6 URL")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	206	if allow_fragments and scheme in uses_fragment and '#' in url:
				207	url, fragment = url.split('#', 1)
				208	if scheme in uses_query and '?' in url:
				209	url, query = url.split('?', 1)
				210	v = SplitResult(scheme, netloc, url, query, fragment)
				211	_parse_cache[key] = v
				212	return v
				213
				214	def urlunparse(components):
				215	"""Put a parsed URL back together again. This may result in a
				216	slightly different, but equivalent URL, if the URL that was parsed
				217	originally had redundant delimiters, e.g. a ? with an empty query
				218	(the draft states that these are equivalent)."""
				219	scheme, netloc, url, params, query, fragment = components
				220	if params:
				221	url = "%s;%s" % (url, params)
				222	return urlunsplit((scheme, netloc, url, query, fragment))
				223
				224	def urlunsplit(components):
Senthil Kumaran	8749a63	2010-06-28 14:08:00 +0000	[diff] [blame]	225	"""Combine the elements of a tuple as returned by urlsplit() into a
				226	complete URL as a string. The data argument can be any five-item iterable.
				227	This may result in a slightly different, but equivalent URL, if the URL that
				228	was parsed originally had unnecessary delimiters (for example, a ? with an
				229	empty query; the RFC states that these are equivalent)."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	230	scheme, netloc, url, query, fragment = components
				231	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				232	if url and url[:1] != '/': url = '/' + url
				233	url = '//' + (netloc or '') + url
				234	if scheme:
				235	url = scheme + ':' + url
				236	if query:
				237	url = url + '?' + query
				238	if fragment:
				239	url = url + '#' + fragment
				240	return url
				241
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  *url*
    is also returned unchanged when its scheme differs from the base's or is
    not listed in uses_relative.  allow_fragments is forwarded to urlparse()
    for both arguments."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the (possibly relative) url.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url carries its own netloc: nothing is taken from the base
            # beyond the scheme.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it as is, with the netloc resolved above.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: fall back to the base path, and to the base params
        # and base query when url supplies none of its own.
        path = bpath
        if not params:
            params = bparams
        else:
            # url has its own params: drop the final character of the
            # base path and return right away (the base query is not used).
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the relative path, then collapse '.' and '..' below.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete one "<name>/.." pair, restarting the scan after each
    # deletion; the inner loop's else-clause ends the outer loop once a full
    # scan removes nothing.  The first and last segments are never examined
    # as the '..' here (i runs from 1 to len(segments) - 2).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in the final position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
				298
				299	def urldefrag(url):
				300	"""Removes any existing fragment from URL.
				301
				302	Returns a tuple of the defragmented URL and the fragment. If
				303	the URL contained no fragments, the second element is the
				304	empty string.
				305	"""
				306	if '#' in url:
				307	s, n, p, a, q, frag = urlparse(url)
				308	defrag = urlunparse((s, n, p, a, q, ''))
				309	return defrag, frag
				310	else:
				311	return url, ''
				312
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string may be a str or a bytes object.  Each '%' followed by exactly two
    hexadecimal digits is replaced by the corresponding byte; any other '%'
    is left in the output untouched.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    if string == b'':
        return b''
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    # Collect the parts and join once instead of repeated concatenation.
    out = [pieces[0]]
    for piece in pieces[1:]:
        code = piece[:2]
        # Require exactly two hex digits.  int(code, 16) on its own would
        # also accept a single digit, a sign or surrounding whitespace
        # (e.g. '%a' or '% 1') and decode them to bogus bytes.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(piece[2:])
        else:
            out.append(b'%')
            out.append(piece)
    return b''.join(out)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	331
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left in
    the result unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    pieces = string.split('%')
    if len(pieces) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    out = [pieces[0]]
    # pending: contiguous run of percent-encoded bytes not yet decoded.  They
    # are decoded together because one character may span several escapes.
    pending = bytearray()
    for piece in pieces[1:]:
        code = piece[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace, so
        # it would turn '%  ' into nothing and silently drop it.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            tail = piece[2:]
            if not tail:
                # This segment was just a single percent-encoded byte.
                # It may be part of a longer sequence, so delay decoding.
                continue
        else:
            tail = '%' + piece
        # Encountered non-percent-encoded characters. Flush the pending bytes.
        if pending:
            out.append(pending.decode(encoding, errors))
            pending = bytearray()
        out.append(tail)
    if pending:
        # Flush the final run of percent-encoded bytes.
        out.append(pending.decode(encoding, errors))
    return ''.join(out)
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	375
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to the list of its values, in the
    order parse_qsl() produced them.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
				401
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    # Both '&' and ';' separate fields.
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, sep, value = field.partition('=')
        if not sep:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name with no equal sign is only kept (with an
            # empty value) when blank values are requested.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))

    return result
				441
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are converted first, so an escaped '%2B' still decodes to '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
				450
				451	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				452	b'abcdefghijklmnopqrstuvwxyz'
				453	b'0123456789'
				454	b'_.-')
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	455	_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
				456	_safe_quoters = {}
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	457
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict base doubles as the cache: once a key has been stored,
    # looking it up again does not call Python code at all.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
				479
				480	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	481	"""quote('abc def') -> 'abc%20def'
				482
				483	Each part of a URL, e.g. the path info, the query, etc., has a
				484	different set of reserved characters that must be quoted.
				485
				486	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				487	the following reserved characters.
				488
				489	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				490	"$" \| ","
				491
				492	Each of these characters is reserved in some component of a URL,
				493	but not necessarily in all of them.
				494
				495	By default, the quote function is intended for quoting the path
				496	section of a URL. Thus, it will not encode '/'. This character
				497	is reserved, but in typical usage the quote function is being
				498	called on a path where the existing slash characters are used as
				499	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	500
				501	string and safe may be either str or bytes objects. encoding must
				502	not be specified if string is a str.
				503
				504	The optional encoding and errors parameters specify how to deal with
				505	non-ASCII characters, as accepted by the str.encode method.
				506	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				507	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	508	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	509	if isinstance(string, str):
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	510	if not string:
				511	return string
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	512	if encoding is None:
				513	encoding = 'utf-8'
				514	if errors is None:
				515	errors = 'strict'
				516	string = string.encode(encoding, errors)
				517	else:
				518	if encoding is not None:
				519	raise TypeError("quote() doesn't support 'encoding' for bytes")
				520	if errors is not None:
				521	raise TypeError("quote() doesn't support 'errors' for bytes")
				522	return quote_from_bytes(string, safe)
				523
				524	def quote_plus(string, safe='', encoding=None, errors=None):
				525	"""Like quote(), but also replace ' ' with '+', as required for quoting
				526	HTML form values. Plus signs in the original string are escaped unless
				527	they are included in safe. It also does not have safe default to '/'.
				528	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	529	# Check if ' ' in string, where string may either be a str or bytes. If
				530	# there are no spaces, the regular quote will produce the right answer.
				531	if ((isinstance(string, str) and ' ' not in string) or
				532	(isinstance(string, bytes) and b' ' not in string)):
				533	return quote(string, safe, encoding, errors)
				534	if isinstance(safe, str):
				535	space = ' '
				536	else:
				537	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	538	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	539	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	540
				541	def quote_from_bytes(bs, safe='/'):
				542	"""Like quote(), but accepts a bytes object rather than a str, and does
				543	not perform string-to-bytes encoding. It always returns an ASCII string.
				544	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				545	"""
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	546	if not isinstance(bs, (bytes, bytearray)):
				547	raise TypeError("quote_from_bytes() expected bytes")
				548	if not bs:
				549	return ''
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	550	if isinstance(safe, str):
				551	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				552	safe = safe.encode('ascii', 'ignore')
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	553	else:
				554	safe = bytes([c for c in safe if c < 128])
				555	if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
				556	return bs.decode()
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	557	try:
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	558	quoter = _safe_quoters[safe]
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	559	except KeyError:
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	560	_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
				561	return ''.join([quoter(char) for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	562
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be bytes or any other object.  Bytes are passed to
    quote_plus() with safe only; everything else is converted with str() and
    passed to quote_plus() together with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple.
    """

    def encode_part(part):
        # Bytes are quoted as they are; anything else goes through str().
        if isinstance(part, bytes):
            return quote_plus(part, safe)
        return quote_plus(str(part), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    pairs = []
    for k, v in query:
        key = encode_part(k)
        if not doseq:
            pairs.append(key + '=' + encode_part(v))
        elif isinstance(v, bytes):
            pairs.append(key + '=' + quote_plus(v, safe))
        elif isinstance(v, str):
            pairs.append(key + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + encode_part(v))
            else:
                # one "key=element" parameter per element of the sequence
                for elt in v:
                    pairs.append(key + '=' + encode_part(elt))
    return '&'.join(pairs)
				640
				641	# Utilities to parse URLs (most of these return None for missing parts):
				642	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				643	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				644	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				645	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				646	# splitpasswd('user:passwd') -> 'user', 'passwd'
				647	# splitport('host:port') --> 'host', 'port'
				648	# splitquery('/path?query') --> '/path', 'query'
				649	# splittag('/path#tag') --> '/path', 'tag'
				650	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				651	# '/path', ['attr1=value1', 'attr2=value2', ...]
				652	# splitvalue('attr=value') --> 'attr', 'value'
				653	# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
				655
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is round-tripped through the ASCII codec to verify that
    it contains only ASCII characters; the (equal) str is returned.  Any
    argument that is not a str is returned untouched.

    Raises UnicodeError if a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				668
				669	def unwrap(url):
				670	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				671	url = str(url).strip()
				672	if url[:1] == '<' and url[-1:] == '>':
				673	url = url[1:-1].strip()
				674	if url[:4] == 'URL:': url = url[4:].strip()
				675	return url
				676
				677	_typeprog = None
				678	def splittype(url):
				679	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				680	global _typeprog
				681	if _typeprog is None:
				682	import re
				683	_typeprog = re.compile('^([^/:]+):')
				684
				685	match = _typeprog.match(url)
				686	if match:
				687	scheme = match.group(1)
				688	return scheme.lower(), url[len(scheme) + 1:]
				689	return None, url
				690
				691	_hostprog = None
				692	def splithost(url):
				693	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				694	global _hostprog
				695	if _hostprog is None:
				696	import re
				697	_hostprog = re.compile('^//([^/?])(.)$')
				698
				699	match = _hostprog.match(url)
				700	if match: return match.group(1, 2)
				701	return None, url
				702
				703	_userprog = None
				704	def splituser(host):
				705	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				706	global _userprog
				707	if _userprog is None:
				708	import re
				709	_userprog = re.compile('^(.)@(.)$')
				710
				711	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	712	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	713	return None, host
				714
				715	_passwdprog = None
				716	def splitpasswd(user):
				717	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				718	global _passwdprog
				719	if _passwdprog is None:
				720	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	721	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	722
				723	match = _passwdprog.match(user)
				724	if match: return match.group(1, 2)
				725	return user, None
				726
# splitport('host:port') --> 'host', 'port'
				728	_portprog = None
				729	def splitport(host):
				730	"""splitport('host:port') --> 'host', 'port'."""
				731	global _portprog
				732	if _portprog is None:
				733	import re
				734	_portprog = re.compile('^(.*):([0-9]+)$')
				735
				736	match = _portprog.match(host)
				737	if match: return match.group(1, 2)
				738	return host, None
				739
				740	_nportprog = None
				741	def splitnport(host, defport=-1):
				742	"""Split host and port, returning numeric port.
				743	Return given default port if no ':' found; defaults to -1.
				744	Return numerical port if a valid number are found after ':'.
				745	Return None if ':' but not a valid number."""
				746	global _nportprog
				747	if _nportprog is None:
				748	import re
				749	_nportprog = re.compile('^(.):(.)$')
				750
				751	match = _nportprog.match(host)
				752	if match:
				753	host, port = match.group(1, 2)
				754	try:
				755	if not port: raise ValueError("no digits")
				756	nport = int(port)
				757	except ValueError:
				758	nport = None
				759	return host, nport
				760	return host, defport
				761
				762	_queryprog = None
				763	def splitquery(url):
				764	"""splitquery('/path?query') --> '/path', 'query'."""
				765	global _queryprog
				766	if _queryprog is None:
				767	import re
				768	_queryprog = re.compile('^(.)\?([^?])$')
				769
				770	match = _queryprog.match(url)
				771	if match: return match.group(1, 2)
				772	return url, None
				773
				774	_tagprog = None
				775	def splittag(url):
				776	"""splittag('/path#tag') --> '/path', 'tag'."""
				777	global _tagprog
				778	if _tagprog is None:
				779	import re
				780	_tagprog = re.compile('^(.)#([^#])$')
				781
				782	match = _tagprog.match(url)
				783	if match: return match.group(1, 2)
				784	return url, None
				785
				786	def splitattr(url):
				787	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				788	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				789	words = url.split(';')
				790	return words[0], words[1:]
				791
				792	_valueprog = None
				793	def splitvalue(attr):
				794	"""splitvalue('attr=value') --> 'attr', 'value'."""
				795	global _valueprog
				796	if _valueprog is None:
				797	import re
				798	_valueprog = re.compile('^([^=])=(.)$')
				799
				800	match = _valueprog.match(attr)
				801	if match: return match.group(1, 2)
				802	return attr, None