Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: 38efd502ab17ca4cee815137b126b60eb6a4ecae [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
				8	RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
				9	and L.Masinter, December 1999.
				10
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	11	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				13
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	14	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	15
				16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				17	1995.
				18
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	19	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	20	McCahill, December 1994
				21
Benjamin Peterson	d7c3ed5	2010-06-27 22:32:30 +0000	[diff] [blame]	22	RFC 3986 is considered the current standard and any future changes to
				23	urlparse module should conform with it. The urlparse module is
				24	currently not entirely compliant with this RFC due to defacto
				25	scenarios for parsing, and for backward compatibility purposes, some
				26	parsing quirks from older RFCs are retained. The testcases in
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	27	test_urlparse.py provides a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	28	"""
				29
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	30	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	31	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	32
# Public names exported by a star-import of this module.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "urlencode", "parse_qs",
    "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	37
				38	# A classification of schemes ('' means apply by default)
				39	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				40	'wais', 'file', 'https', 'shttp', 'mms',
				41	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				42	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				43	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				44	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	45	'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	46	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				47	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				48	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				49	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				50	'mms', '', 'sftp']
				51	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				52	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				53	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				54	'nntp', 'wais', 'https', 'shttp', 'snews',
				55	'file', 'prospero', '']
				56
				57	# Characters valid in scheme names
				58	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				59	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				60	'0123456789'
				61	'+-.')
				62
				63	MAX_CACHE_SIZE = 20
				64	_parse_cache = {}
				65
				66	def clear_cache():
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	67	"""Clear the parse cache and the quoters cache."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	68	_parse_cache.clear()
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	69	_safe_quoters.clear()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	70
				71
				72	class ResultMixin(object):
				73	"""Shared methods for the parsed result objects."""
				74
				75	@property
				76	def username(self):
				77	netloc = self.netloc
				78	if "@" in netloc:
				79	userinfo = netloc.rsplit("@", 1)[0]
				80	if ":" in userinfo:
				81	userinfo = userinfo.split(":", 1)[0]
				82	return userinfo
				83	return None
				84
				85	@property
				86	def password(self):
				87	netloc = self.netloc
				88	if "@" in netloc:
				89	userinfo = netloc.rsplit("@", 1)[0]
				90	if ":" in userinfo:
				91	return userinfo.split(":", 1)[1]
				92	return None
				93
				94	@property
				95	def hostname(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	96	netloc = self.netloc.split('@')[-1]
				97	if '[' in netloc and ']' in netloc:
				98	return netloc.split(']')[0][1:].lower()
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	99	elif ':' in netloc:
				100	return netloc.split(':')[0].lower()
				101	elif netloc == '':
				102	return None
				103	else:
				104	return netloc.lower()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	105
				106	@property
				107	def port(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	108	netloc = self.netloc.split('@')[-1].split(']')[-1]
				109	if ':' in netloc:
				110	port = netloc.split(':')[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	111	return int(port, 10)
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	112	else:
				113	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	114
				115	from collections import namedtuple
				116
				117	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				118
				119	__slots__ = ()
				120
				121	def geturl(self):
				122	return urlunsplit(self)
				123
				124
				125	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				126
				127	__slots__ = ()
				128
				129	def geturl(self):
				130	return urlunparse(self)
				131
				132
				133	def urlparse(url, scheme='', allow_fragments=True):
				134	"""Parse a URL into 6 components:
				135	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				136	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				137	Note that we don't break the components up in smaller bits
				138	(e.g. netloc is a single string) and we don't expand % escapes."""
				139	tuple = urlsplit(url, scheme, allow_fragments)
				140	scheme, netloc, url, query, fragment = tuple
				141	if scheme in uses_params and ';' in url:
				142	url, params = _splitparams(url)
				143	else:
				144	params = ''
				145	return ParseResult(scheme, netloc, url, params, query, fragment)
				146
				147	def _splitparams(url):
				148	if '/' in url:
				149	i = url.find(';', url.rfind('/'))
				150	if i < 0:
				151	return url, ''
				152	else:
				153	i = url.find(';')
				154	return url[:i], url[i+1:]
				155
				156	def _splitnetloc(url, start=0):
				157	delim = len(url) # position of end of domain part of url, default is end
				158	for c in '/?#': # look for delimiters; the order is NOT important
				159	wdelim = url.find(c, start) # find first of this delim
				160	if wdelim >= 0: # if found
				161	delim = min(delim, wdelim) # use earliest delim position
				162	return url[start:delim], url[delim:] # return (domain, rest)
				163
				164	def urlsplit(url, scheme='', allow_fragments=True):
				165	"""Parse a URL into 5 components:
				166	<scheme>://<netloc>/<path>?<query>#<fragment>
				167	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				168	Note that we don't break the components up in smaller bits
				169	(e.g. netloc is a single string) and we don't expand % escapes."""
				170	allow_fragments = bool(allow_fragments)
				171	key = url, scheme, allow_fragments, type(url), type(scheme)
				172	cached = _parse_cache.get(key, None)
				173	if cached:
				174	return cached
				175	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				176	clear_cache()
				177	netloc = query = fragment = ''
				178	i = url.find(':')
				179	if i > 0:
				180	if url[:i] == 'http': # optimize the common case
				181	scheme = url[:i].lower()
				182	url = url[i+1:]
				183	if url[:2] == '//':
				184	netloc, url = _splitnetloc(url, 2)
Senthil Kumaran	7a1e09f	2010-04-22 12:19:46 +0000	[diff] [blame]	185	if (('[' in netloc and ']' not in netloc) or
				186	(']' in netloc and '[' not in netloc)):
				187	raise ValueError("Invalid IPv6 URL")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	188	if allow_fragments and '#' in url:
				189	url, fragment = url.split('#', 1)
				190	if '?' in url:
				191	url, query = url.split('?', 1)
				192	v = SplitResult(scheme, netloc, url, query, fragment)
				193	_parse_cache[key] = v
				194	return v
Senthil Kumaran	84c7d9f	2010-08-04 04:50:44 +0000	[diff] [blame]	195	if url.endswith(':') or not url[i+1].isdigit():
				196	for c in url[:i]:
				197	if c not in scheme_chars:
				198	break
				199	else:
				200	scheme, url = url[:i].lower(), url[i+1:]
Senthil Kumaran	6be85c5	2010-02-19 07:42:50 +0000	[diff] [blame]	201	if url[:2] == '//':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	202	netloc, url = _splitnetloc(url, 2)
Senthil Kumaran	7a1e09f	2010-04-22 12:19:46 +0000	[diff] [blame]	203	if (('[' in netloc and ']' not in netloc) or
				204	(']' in netloc and '[' not in netloc)):
				205	raise ValueError("Invalid IPv6 URL")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	206	if allow_fragments and scheme in uses_fragment and '#' in url:
				207	url, fragment = url.split('#', 1)
				208	if scheme in uses_query and '?' in url:
				209	url, query = url.split('?', 1)
				210	v = SplitResult(scheme, netloc, url, query, fragment)
				211	_parse_cache[key] = v
				212	return v
				213
				214	def urlunparse(components):
				215	"""Put a parsed URL back together again. This may result in a
				216	slightly different, but equivalent URL, if the URL that was parsed
				217	originally had redundant delimiters, e.g. a ? with an empty query
				218	(the draft states that these are equivalent)."""
				219	scheme, netloc, url, params, query, fragment = components
				220	if params:
				221	url = "%s;%s" % (url, params)
				222	return urlunsplit((scheme, netloc, url, query, fragment))
				223
				224	def urlunsplit(components):
Senthil Kumaran	8749a63	2010-06-28 14:08:00 +0000	[diff] [blame]	225	"""Combine the elements of a tuple as returned by urlsplit() into a
				226	complete URL as a string. The data argument can be any five-item iterable.
				227	This may result in a slightly different, but equivalent URL, if the URL that
				228	was parsed originally had unnecessary delimiters (for example, a ? with an
				229	empty query; the RFC states that these are equivalent)."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	230	scheme, netloc, url, query, fragment = components
				231	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				232	if url and url[:1] != '/': url = '/' + url
				233	url = '//' + (netloc or '') + url
				234	if scheme:
				235	url = scheme + ':' + url
				236	if query:
				237	url = url + '?' + query
				238	if fragment:
				239	url = url + '#' + fragment
				240	return url
				241
				242	def urljoin(base, url, allow_fragments=True):
				243	"""Join a base URL and a possibly relative URL to form an absolute
				244	interpretation of the latter."""
				245	if not base:
				246	return url
				247	if not url:
				248	return base
				249	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
				250	urlparse(base, '', allow_fragments)
				251	scheme, netloc, path, params, query, fragment = \
				252	urlparse(url, bscheme, allow_fragments)
				253	if scheme != bscheme or scheme not in uses_relative:
				254	return url
				255	if scheme in uses_netloc:
				256	if netloc:
				257	return urlunparse((scheme, netloc, path,
				258	params, query, fragment))
				259	netloc = bnetloc
				260	if path[:1] == '/':
				261	return urlunparse((scheme, netloc, path,
				262	params, query, fragment))
Facundo Batista	23e3856	2008-08-14 16:55:14 +0000	[diff] [blame]	263	if not path:
				264	path = bpath
				265	if not params:
				266	params = bparams
				267	else:
				268	path = path[:-1]
				269	return urlunparse((scheme, netloc, path,
				270	params, query, fragment))
				271	if not query:
				272	query = bquery
				273	return urlunparse((scheme, netloc, path,
				274	params, query, fragment))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	275	segments = bpath.split('/')[:-1] + path.split('/')
				276	# XXX The stuff below is bogus in various ways...
				277	if segments[-1] == '.':
				278	segments[-1] = ''
				279	while '.' in segments:
				280	segments.remove('.')
				281	while 1:
				282	i = 1
				283	n = len(segments) - 1
				284	while i < n:
				285	if (segments[i] == '..'
				286	and segments[i-1] not in ('', '..')):
				287	del segments[i-1:i+1]
				288	break
				289	i = i+1
				290	else:
				291	break
				292	if segments == ['', '..']:
				293	segments[-1] = ''
				294	elif len(segments) >= 2 and segments[-1] == '..':
				295	segments[-2:] = ['']
				296	return urlunparse((scheme, netloc, '/'.join(segments),
				297	params, query, fragment))
				298
				299	def urldefrag(url):
				300	"""Removes any existing fragment from URL.
				301
				302	Returns a tuple of the defragmented URL and the fragment. If
				303	the URL contained no fragments, the second element is the
				304	empty string.
				305	"""
				306	if '#' in url:
				307	s, n, p, a, q, frag = urlparse(url)
				308	defrag = urlunparse((s, n, p, a, q, ''))
				309	return defrag, frag
				310	else:
				311	return url, ''
				312
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes object; a str is encoded as UTF-8
    first.  A '%' that is not followed by exactly two hexadecimal digits
    is not an escape and is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Touch .split so that a falsy argument which is not string-like
        # still fails with AttributeError instead of being accepted.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    chunks = string.split(b'%')
    if len(chunks) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    out = [chunks[0]]
    for chunk in chunks[1:]:
        code = chunk[:2]
        # int(code, 16) alone is too lenient: it also accepts a single
        # digit, a leading sign or surrounding whitespace.  Check both
        # characters explicitly so only real %XX escapes are decoded.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(chunk[2:])
        else:
            out.append(b'%')
            out.append(chunk)
    # A single join avoids repeated copying from += on a bytes object.
    return b''.join(out)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	333
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a placeholder
    character.  Passing None for either selects that default.

    A '%' that is not followed by exactly two hexadecimal digits is not an
    escape and is copied to the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    chunks = string.split('%')
    if len(chunks) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    out = [chunks[0]]
    # Byte values of a run of adjacent escapes.  They are decoded together
    # because one character may be spread over several %XX escapes.
    pending = []
    for chunk in chunks[1:]:
        code = chunk[:2]
        # Validate the digits explicitly: bytes.fromhex() ignores
        # whitespace, so it would silently swallow text such as "% ".
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            rest = chunk[2:]
            if not rest:
                # Nothing but an escape here; the run may continue.
                continue
        else:
            rest = '%' + chunk
        # Literal text ends the current run of escapes: flush it first.
        if pending:
            out.append(bytes(pending).decode(encoding, errors))
            pending = []
        out.append(rest)
    if pending:
        # Flush a run that reaches the end of the string.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	377
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value keeps them as blank strings.  The default false
        value drops them, as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Values for a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
				403
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value keeps them as blank strings.  The default false value
        drops them, as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both names and values '+' is
    turned into a space before % escapes are decoded.
    """
    fields = []
    for amp_part in qs.split('&'):
        fields.extend(amp_part.split(';'))
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name with no equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))
    return result
				443
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required
    for unquoting HTML form values.

    The plus signs are replaced before the % escapes are decoded.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				452
				453	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				454	b'abcdefghijklmnopqrstuvwxyz'
				455	b'0123456789'
				456	b'_.-')
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	457	_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
				458	_safe_quoters = {}
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	459
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	460	class Quoter(collections.defaultdict):
				461	"""A mapping from bytes (in range(0,256)) to strings.
				462
				463	String values are percent-encoded byte values, unless the key < 128, and
				464	in the "safe" set (either the specified safe set, or default set).
				465	"""
				466	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				467	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	468	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	469	"""safe: bytes object."""
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	470	self.safe = _ALWAYS_SAFE.union(safe)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	471
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	472	def __repr__(self):
				473	# Without this, will just display as a defaultdict
				474	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	475
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	476	def __missing__(self, b):
				477	# Handle a cache miss. Store quoted string in cache and return.
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	478	res = chr(b) if b in self.safe else '%{:02X}'.format(b)
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	479	self[b] = res
				480	return res
				481
				482	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	483	"""quote('abc def') -> 'abc%20def'
				484
				485	Each part of a URL, e.g. the path info, the query, etc., has a
				486	different set of reserved characters that must be quoted.
				487
				488	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				489	the following reserved characters.
				490
				491	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				492	"$" \| ","
				493
				494	Each of these characters is reserved in some component of a URL,
				495	but not necessarily in all of them.
				496
				497	By default, the quote function is intended for quoting the path
				498	section of a URL. Thus, it will not encode '/'. This character
				499	is reserved, but in typical usage the quote function is being
				500	called on a path where the existing slash characters are used as
				501	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	502
				503	string and safe may be either str or bytes objects. encoding must
				504	not be specified if string is a str.
				505
				506	The optional encoding and errors parameters specify how to deal with
				507	non-ASCII characters, as accepted by the str.encode method.
				508	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				509	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	510	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	511	if isinstance(string, str):
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	512	if not string:
				513	return string
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	514	if encoding is None:
				515	encoding = 'utf-8'
				516	if errors is None:
				517	errors = 'strict'
				518	string = string.encode(encoding, errors)
				519	else:
				520	if encoding is not None:
				521	raise TypeError("quote() doesn't support 'encoding' for bytes")
				522	if errors is not None:
				523	raise TypeError("quote() doesn't support 'errors' for bytes")
				524	return quote_from_bytes(string, safe)
				525
				526	def quote_plus(string, safe='', encoding=None, errors=None):
				527	"""Like quote(), but also replace ' ' with '+', as required for quoting
				528	HTML form values. Plus signs in the original string are escaped unless
				529	they are included in safe. It also does not have safe default to '/'.
				530	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	531	# Check if ' ' in string, where string may either be a str or bytes. If
				532	# there are no spaces, the regular quote will produce the right answer.
				533	if ((isinstance(string, str) and ' ' not in string) or
				534	(isinstance(string, bytes) and b' ' not in string)):
				535	return quote(string, safe, encoding, errors)
				536	if isinstance(safe, str):
				537	space = ' '
				538	else:
				539	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	540	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	541	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	542
				543	def quote_from_bytes(bs, safe='/'):
				544	"""Like quote(), but accepts a bytes object rather than a str, and does
				545	not perform string-to-bytes encoding. It always returns an ASCII string.
				546	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				547	"""
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	548	if not isinstance(bs, (bytes, bytearray)):
				549	raise TypeError("quote_from_bytes() expected bytes")
				550	if not bs:
				551	return ''
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	552	if isinstance(safe, str):
				553	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				554	safe = safe.encode('ascii', 'ignore')
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	555	else:
				556	safe = bytes([c for c in safe if c < 128])
				557	if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
				558	return bs.decode()
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	559	try:
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	560	quoter = _safe_quoters[safe]
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	561	except KeyError:
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	562	_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
				563	return ''.join([quoter(char) for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	564
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be either str or bytes.  bytes are passed to
    quote_plus with safe only; anything else is converted with str() and
    passed to quote_plus together with safe, encoding and errors.

    Raises TypeError when query is neither a mapping nor a sequence whose
    first element is a tuple.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    def _quoted(item):
        # bytes are quoted as they are; anything else goes through str().
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    fields = []
    for k, v in query:
        key = _quoted(k)
        if not doseq or isinstance(v, bytes):
            fields.append(key + '=' + _quoted(v))
        elif isinstance(v, str):
            fields.append(key + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                fields.append(key + '=' + _quoted(v))
            else:
                # one "key=element" field per element of the sequence
                for elt in v:
                    fields.append(key + '=' + _quoted(elt))
    return '&'.join(fields)
				642
				643	# Utilities to parse URLs (most of these return None for missing parts):
				644	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				645	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				646	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				647	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				648	# splitpasswd('user:passwd') -> 'user', 'passwd'
				649	# splitport('host:port') --> 'host', 'port'
				650	# splitquery('/path?query') --> '/path', 'query'
				651	# splittag('/path#tag') --> '/path', 'tag'
				652	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				653	# '/path', ['attr1=value1', 'attr2=value2', ...]
				654	# splitvalue('attr=value') --> 'attr', 'value'
				655	# urllib.parse.unquote('abc%20def') -> 'abc def'
				656	# quote('abc def') -> 'abc%20def'
				657
def to_bytes(url):
    """Check that a str URL is pure ASCII and hand it back.

    Despite the name, a str argument is returned as a str (it is
    round-tripped through an ASCII encode/decode purely as a validity
    check).  Any non-str argument is returned untouched.

    Raises UnicodeError if a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				670
				671	def unwrap(url):
				672	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				673	url = str(url).strip()
				674	if url[:1] == '<' and url[-1:] == '>':
				675	url = url[1:-1].strip()
				676	if url[:4] == 'URL:': url = url[4:].strip()
				677	return url
				678
				679	_typeprog = None
				680	def splittype(url):
				681	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				682	global _typeprog
				683	if _typeprog is None:
				684	import re
				685	_typeprog = re.compile('^([^/:]+):')
				686
				687	match = _typeprog.match(url)
				688	if match:
				689	scheme = match.group(1)
				690	return scheme.lower(), url[len(scheme) + 1:]
				691	return None, url
				692
				693	_hostprog = None
				694	def splithost(url):
				695	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				696	global _hostprog
				697	if _hostprog is None:
				698	import re
				699	_hostprog = re.compile('^//([^/?])(.)$')
				700
				701	match = _hostprog.match(url)
				702	if match: return match.group(1, 2)
				703	return None, url
				704
				705	_userprog = None
				706	def splituser(host):
				707	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				708	global _userprog
				709	if _userprog is None:
				710	import re
				711	_userprog = re.compile('^(.)@(.)$')
				712
				713	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	714	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	715	return None, host
				716
				717	_passwdprog = None
				718	def splitpasswd(user):
				719	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				720	global _passwdprog
				721	if _passwdprog is None:
				722	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	723	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	724
				725	match = _passwdprog.match(user)
				726	if match: return match.group(1, 2)
				727	return user, None
				728
				729	# splitport('host:port') --> 'host', 'port'
				730	_portprog = None
				731	def splitport(host):
				732	"""splitport('host:port') --> 'host', 'port'."""
				733	global _portprog
				734	if _portprog is None:
				735	import re
				736	_portprog = re.compile('^(.*):([0-9]+)$')
				737
				738	match = _portprog.match(host)
				739	if match: return match.group(1, 2)
				740	return host, None
				741
				742	_nportprog = None
				743	def splitnport(host, defport=-1):
				744	"""Split host and port, returning numeric port.
				745	Return given default port if no ':' found; defaults to -1.
				746	Return numerical port if a valid number are found after ':'.
				747	Return None if ':' but not a valid number."""
				748	global _nportprog
				749	if _nportprog is None:
				750	import re
				751	_nportprog = re.compile('^(.):(.)$')
				752
				753	match = _nportprog.match(host)
				754	if match:
				755	host, port = match.group(1, 2)
				756	try:
				757	if not port: raise ValueError("no digits")
				758	nport = int(port)
				759	except ValueError:
				760	nport = None
				761	return host, nport
				762	return host, defport
				763
				764	_queryprog = None
				765	def splitquery(url):
				766	"""splitquery('/path?query') --> '/path', 'query'."""
				767	global _queryprog
				768	if _queryprog is None:
				769	import re
				770	_queryprog = re.compile('^(.)\?([^?])$')
				771
				772	match = _queryprog.match(url)
				773	if match: return match.group(1, 2)
				774	return url, None
				775
				776	_tagprog = None
				777	def splittag(url):
				778	"""splittag('/path#tag') --> '/path', 'tag'."""
				779	global _tagprog
				780	if _tagprog is None:
				781	import re
				782	_tagprog = re.compile('^(.)#([^#])$')
				783
				784	match = _tagprog.match(url)
				785	if match: return match.group(1, 2)
				786	return url, None
				787
				788	def splitattr(url):
				789	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				790	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				791	words = url.split(';')
				792	return words[0], words[1:]
				793
				794	_valueprog = None
				795	def splitvalue(attr):
				796	"""splitvalue('attr=value') --> 'attr', 'value'."""
				797	global _valueprog
				798	if _valueprog is None:
				799	import re
				800	_valueprog = re.compile('^([^=])=(.)$')
				801
				802	match = _valueprog.match(attr)
				803	if match: return match.group(1, 2)
				804	return attr, None