Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: e1afe528d1f5f5394a9e5a425d4bf2e22ffb6552 [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	8	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	9	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				10
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	11	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	12
				13	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				14	1995.
				15
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	16	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	17	McCahill, December 1994
				18
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	19	RFC 3986 is considered the current standard and any future changes to
				20	urlparse module should conform with it. The urlparse module is
				21	currently not entirely compliant with this RFC due to defacto
				22	scenarios for parsing, and for backward compatibility purposes, some
				23	parsing quirks from older RFCs are retained. The testcases in
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	24	test_urlparse.py provides a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	25	"""
				26
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	27	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	28	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	29
# Names exported by ``from urllib.parse import *``.  urlencode() is a
# public, documented-by-docstring function of this module, so it is
# exported together with the other quoting helpers.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	34
				35	# A classification of schemes ('' means apply by default)
				36	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				37	'wais', 'file', 'https', 'shttp', 'mms',
				38	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				39	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				40	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				41	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Senthil Kumaran	d4cd188	2010-05-13 03:43:13 +0000	[diff] [blame]	42	'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh']
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	43	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				44	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				45	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				46	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				47	'mms', '', 'sftp']
				48	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				49	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				50	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				51	'nntp', 'wais', 'https', 'shttp', 'snews',
				52	'file', 'prospero', '']
				53
				54	# Characters valid in scheme names
				55	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				56	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				57	'0123456789'
				58	'+-.')
				59
				60	MAX_CACHE_SIZE = 20
				61	_parse_cache = {}
				62
				63	def clear_cache():
				64	"""Clear the parse cache."""
				65	_parse_cache.clear()
				66
				67
				68	class ResultMixin(object):
				69	"""Shared methods for the parsed result objects."""
				70
				71	@property
				72	def username(self):
				73	netloc = self.netloc
				74	if "@" in netloc:
				75	userinfo = netloc.rsplit("@", 1)[0]
				76	if ":" in userinfo:
				77	userinfo = userinfo.split(":", 1)[0]
				78	return userinfo
				79	return None
				80
				81	@property
				82	def password(self):
				83	netloc = self.netloc
				84	if "@" in netloc:
				85	userinfo = netloc.rsplit("@", 1)[0]
				86	if ":" in userinfo:
				87	return userinfo.split(":", 1)[1]
				88	return None
				89
				90	@property
				91	def hostname(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	92	netloc = self.netloc
				93	if "@" in netloc:
				94	netloc = netloc.rsplit("@", 1)[1]
				95	if ":" in netloc:
				96	netloc = netloc.split(":", 1)[0]
				97	return netloc.lower() or None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	98
				99	@property
				100	def port(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	101	netloc = self.netloc
				102	if "@" in netloc:
				103	netloc = netloc.rsplit("@", 1)[1]
				104	if ":" in netloc:
				105	port = netloc.split(":", 1)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	106	return int(port, 10)
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	107	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	108
				109	from collections import namedtuple
				110
				111	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				112
				113	__slots__ = ()
				114
				115	def geturl(self):
				116	return urlunsplit(self)
				117
				118
				119	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				120
				121	__slots__ = ()
				122
				123	def geturl(self):
				124	return urlunparse(self)
				125
				126
				127	def urlparse(url, scheme='', allow_fragments=True):
				128	"""Parse a URL into 6 components:
				129	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				130	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				131	Note that we don't break the components up in smaller bits
				132	(e.g. netloc is a single string) and we don't expand % escapes."""
				133	tuple = urlsplit(url, scheme, allow_fragments)
				134	scheme, netloc, url, query, fragment = tuple
				135	if scheme in uses_params and ';' in url:
				136	url, params = _splitparams(url)
				137	else:
				138	params = ''
				139	return ParseResult(scheme, netloc, url, params, query, fragment)
				140
				141	def _splitparams(url):
				142	if '/' in url:
				143	i = url.find(';', url.rfind('/'))
				144	if i < 0:
				145	return url, ''
				146	else:
				147	i = url.find(';')
				148	return url[:i], url[i+1:]
				149
				150	def _splitnetloc(url, start=0):
				151	delim = len(url) # position of end of domain part of url, default is end
				152	for c in '/?#': # look for delimiters; the order is NOT important
				153	wdelim = url.find(c, start) # find first of this delim
				154	if wdelim >= 0: # if found
				155	delim = min(delim, wdelim) # use earliest delim position
				156	return url[start:delim], url[delim:] # return (domain, rest)
				157
				158	def urlsplit(url, scheme='', allow_fragments=True):
				159	"""Parse a URL into 5 components:
				160	<scheme>://<netloc>/<path>?<query>#<fragment>
				161	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				162	Note that we don't break the components up in smaller bits
				163	(e.g. netloc is a single string) and we don't expand % escapes."""
				164	allow_fragments = bool(allow_fragments)
				165	key = url, scheme, allow_fragments, type(url), type(scheme)
				166	cached = _parse_cache.get(key, None)
				167	if cached:
				168	return cached
				169	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				170	clear_cache()
				171	netloc = query = fragment = ''
				172	i = url.find(':')
				173	if i > 0:
				174	if url[:i] == 'http': # optimize the common case
				175	scheme = url[:i].lower()
				176	url = url[i+1:]
				177	if url[:2] == '//':
				178	netloc, url = _splitnetloc(url, 2)
				179	if allow_fragments and '#' in url:
				180	url, fragment = url.split('#', 1)
				181	if '?' in url:
				182	url, query = url.split('?', 1)
				183	v = SplitResult(scheme, netloc, url, query, fragment)
				184	_parse_cache[key] = v
				185	return v
Senthil Kumaran	8801f7a	2010-08-04 04:53:07 +0000	[diff] [blame]	186	if url.endswith(':') or not url[i+1].isdigit():
				187	for c in url[:i]:
				188	if c not in scheme_chars:
				189	break
				190	else:
				191	scheme, url = url[:i].lower(), url[i+1:]
Senthil Kumaran	a8dbb24	2010-02-19 07:45:03 +0000	[diff] [blame]	192	if url[:2] == '//':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	193	netloc, url = _splitnetloc(url, 2)
				194	if allow_fragments and scheme in uses_fragment and '#' in url:
				195	url, fragment = url.split('#', 1)
				196	if scheme in uses_query and '?' in url:
				197	url, query = url.split('?', 1)
				198	v = SplitResult(scheme, netloc, url, query, fragment)
				199	_parse_cache[key] = v
				200	return v
				201
				202	def urlunparse(components):
				203	"""Put a parsed URL back together again. This may result in a
				204	slightly different, but equivalent URL, if the URL that was parsed
				205	originally had redundant delimiters, e.g. a ? with an empty query
				206	(the draft states that these are equivalent)."""
				207	scheme, netloc, url, params, query, fragment = components
				208	if params:
				209	url = "%s;%s" % (url, params)
				210	return urlunsplit((scheme, netloc, url, query, fragment))
				211
				212	def urlunsplit(components):
Senthil Kumaran	930049b	2010-06-28 14:12:18 +0000	[diff] [blame]	213	"""Combine the elements of a tuple as returned by urlsplit() into a
				214	complete URL as a string. The data argument can be any five-item iterable.
				215	This may result in a slightly different, but equivalent URL, if the URL that
				216	was parsed originally had unnecessary delimiters (for example, a ? with an
				217	empty query; the RFC states that these are equivalent)."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	218	scheme, netloc, url, query, fragment = components
				219	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				220	if url and url[:1] != '/': url = '/' + url
				221	url = '//' + (netloc or '') + url
				222	if scheme:
				223	url = scheme + ':' + url
				224	if query:
				225	url = url + '?' + query
				226	if fragment:
				227	url = url + '#' + fragment
				228	return url
				229
				230	def urljoin(base, url, allow_fragments=True):
				231	"""Join a base URL and a possibly relative URL to form an absolute
				232	interpretation of the latter."""
				233	if not base:
				234	return url
				235	if not url:
				236	return base
				237	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
				238	urlparse(base, '', allow_fragments)
				239	scheme, netloc, path, params, query, fragment = \
				240	urlparse(url, bscheme, allow_fragments)
				241	if scheme != bscheme or scheme not in uses_relative:
				242	return url
				243	if scheme in uses_netloc:
				244	if netloc:
				245	return urlunparse((scheme, netloc, path,
				246	params, query, fragment))
				247	netloc = bnetloc
				248	if path[:1] == '/':
				249	return urlunparse((scheme, netloc, path,
				250	params, query, fragment))
Facundo Batista	23e3856	2008-08-14 16:55:14 +0000	[diff] [blame]	251	if not path:
				252	path = bpath
				253	if not params:
				254	params = bparams
				255	else:
				256	path = path[:-1]
				257	return urlunparse((scheme, netloc, path,
				258	params, query, fragment))
				259	if not query:
				260	query = bquery
				261	return urlunparse((scheme, netloc, path,
				262	params, query, fragment))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	263	segments = bpath.split('/')[:-1] + path.split('/')
				264	# XXX The stuff below is bogus in various ways...
				265	if segments[-1] == '.':
				266	segments[-1] = ''
				267	while '.' in segments:
				268	segments.remove('.')
				269	while 1:
				270	i = 1
				271	n = len(segments) - 1
				272	while i < n:
				273	if (segments[i] == '..'
				274	and segments[i-1] not in ('', '..')):
				275	del segments[i-1:i+1]
				276	break
				277	i = i+1
				278	else:
				279	break
				280	if segments == ['', '..']:
				281	segments[-1] = ''
				282	elif len(segments) >= 2 and segments[-1] == '..':
				283	segments[-2:] = ['']
				284	return urlunparse((scheme, netloc, '/'.join(segments),
				285	params, query, fragment))
				286
				287	def urldefrag(url):
				288	"""Removes any existing fragment from URL.
				289
				290	Returns a tuple of the defragmented URL and the fragment. If
				291	the URL contained no fragments, the second element is the
				292	empty string.
				293	"""
				294	if '#' in url:
				295	s, n, p, a, q, frag = urlparse(url)
				296	defrag = urlunparse((s, n, p, a, q, ''))
				297	return defrag, frag
				298	else:
				299	return url, ''
				300
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes-like
    object.  A '%' that is not followed by exactly two hexadecimal
    digits is left in the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    chunks = string.split(b'%')
    out = [chunks[0]]
    for item in chunks[1:]:
        code = item[:2]
        # Check both digits explicitly: int(x, 16) on its own would also
        # accept one-digit, signed or whitespace-padded input.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	316
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    chunks = string.split('%')
    out = [chunks[0]]
    # Bytes of the current run of adjacent %xx escapes.  Decoding is
    # delayed until the run ends because one character may span several
    # escapes (e.g. multi-byte UTF-8).
    pending = bytearray()
    for item in chunks[1:]:
        code = item[:2]
        # Check both digits explicitly: bytes.fromhex() skips whitespace
        # and would silently swallow input such as '%  '.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the run: flush what was collected.
            if pending:
                out.append(bytes(pending).decode(encoding, errors))
                pending = bytearray()
            out.append(rest)
    if pending:
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
				357
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values,
    in the order produced by parse_qsl().
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
				383
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs in the order they appear.
    """
    # Fields may be separated by either '&' or ';'.
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))
    result = []
    for field in fields:
        if not (field or strict_parsing):
            continue
        name, sep, value = field.partition('=')
        if not sep:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))
    return result
				423
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are replaced first, so an escaped '%2B' still decodes
    # to a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
				432
				433	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				434	b'abcdefghijklmnopqrstuvwxyz'
				435	b'0123456789'
				436	b'_.-')
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	437	_safe_quoters= {}
				438
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict base doubles as the cache: only the first lookup
    # of a byte value reaches __missing__.

    def __init__(self, safe):
        """safe: bytes object."""
        # Non-ASCII members of 'safe' are dropped.
        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it and return it.
        quoted = chr(b) if b in self.safe else '%%%02X' % b
        self[b] = quoted
        return quoted
				460
				461	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	462	"""quote('abc def') -> 'abc%20def'
				463
				464	Each part of a URL, e.g. the path info, the query, etc., has a
				465	different set of reserved characters that must be quoted.
				466
				467	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				468	the following reserved characters.
				469
				470	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				471	"$" \| ","
				472
				473	Each of these characters is reserved in some component of a URL,
				474	but not necessarily in all of them.
				475
				476	By default, the quote function is intended for quoting the path
				477	section of a URL. Thus, it will not encode '/'. This character
				478	is reserved, but in typical usage the quote function is being
				479	called on a path where the existing slash characters are used as
				480	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	481
				482	string and safe may be either str or bytes objects. encoding must
				483	not be specified if string is a str.
				484
				485	The optional encoding and errors parameters specify how to deal with
				486	non-ASCII characters, as accepted by the str.encode method.
				487	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				488	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	489	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	490	if isinstance(string, str):
				491	if encoding is None:
				492	encoding = 'utf-8'
				493	if errors is None:
				494	errors = 'strict'
				495	string = string.encode(encoding, errors)
				496	else:
				497	if encoding is not None:
				498	raise TypeError("quote() doesn't support 'encoding' for bytes")
				499	if errors is not None:
				500	raise TypeError("quote() doesn't support 'errors' for bytes")
				501	return quote_from_bytes(string, safe)
				502
				503	def quote_plus(string, safe='', encoding=None, errors=None):
				504	"""Like quote(), but also replace ' ' with '+', as required for quoting
				505	HTML form values. Plus signs in the original string are escaped unless
				506	they are included in safe. It also does not have safe default to '/'.
				507	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	508	# Check if ' ' in string, where string may either be a str or bytes. If
				509	# there are no spaces, the regular quote will produce the right answer.
				510	if ((isinstance(string, str) and ' ' not in string) or
				511	(isinstance(string, bytes) and b' ' not in string)):
				512	return quote(string, safe, encoding, errors)
				513	if isinstance(safe, str):
				514	space = ' '
				515	else:
				516	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	517	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	518	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	519
				520	def quote_from_bytes(bs, safe='/'):
				521	"""Like quote(), but accepts a bytes object rather than a str, and does
				522	not perform string-to-bytes encoding. It always returns an ASCII string.
				523	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				524	"""
				525	if isinstance(safe, str):
				526	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				527	safe = safe.encode('ascii', 'ignore')
				528	cachekey = bytes(safe) # In case it was a bytearray
				529	if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
				530	raise TypeError("quote_from_bytes() expected a bytes")
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	531	try:
				532	quoter = _safe_quoters[cachekey]
				533	except KeyError:
				534	quoter = Quoter(safe)
				535	_safe_quoters[cachekey] = quoter
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	536	return ''.join([quoter[char] for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	537
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be str or bytes; any other object is converted
    with str() first.  The safe, encoding and errors parameters are
    passed on to quote_plus() (encoding and errors only for non-bytes
    items).

    Raises TypeError if query is neither a mapping nor a sequence of
    tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    def _quote_item(item):
        # bytes are quoted as they are; anything else goes through str().
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    pairs = []
    for k, v in query:
        key = _quote_item(k)
        if not doseq:
            values = [_quote_item(v)]
        elif isinstance(v, bytes):
            values = [quote_plus(v, safe)]
        elif isinstance(v, str):
            values = [quote_plus(v, safe, encoding, errors)]
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                values = [quote_plus(str(v), safe, encoding, errors)]
            else:
                # one 'key=value' pair per element of the sequence
                values = [_quote_item(elt) for elt in v]
        for value in values:
            pairs.append(key + '=' + value)
    return '&'.join(pairs)
				615
				616	# Utilities to parse URLs (most of these return None for missing parts):
				617	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				618	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				619	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				620	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				621	# splitpasswd('user:passwd') -> 'user', 'passwd'
				622	# splitport('host:port') --> 'host', 'port'
				623	# splitquery('/path?query') --> '/path', 'query'
				624	# splittag('/path#tag') --> '/path', 'tag'
				625	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				626	# '/path', ['attr1=value1', 'attr2=value2', ...]
				627	# splitvalue('attr=value') --> 'attr', 'value'
				628	# urllib.parse.unquote('abc%20def') -> 'abc def'
				629	# quote('abc def') -> 'abc%20def')
				630
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    checked for being pure ASCII.  Non-str arguments are returned
    unchanged.  Raises UnicodeError for a str with non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				643
				644	def unwrap(url):
				645	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				646	url = str(url).strip()
				647	if url[:1] == '<' and url[-1:] == '>':
				648	url = url[1:-1].strip()
				649	if url[:4] == 'URL:': url = url[4:].strip()
				650	return url
				651
				652	_typeprog = None
				653	def splittype(url):
				654	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				655	global _typeprog
				656	if _typeprog is None:
				657	import re
				658	_typeprog = re.compile('^([^/:]+):')
				659
				660	match = _typeprog.match(url)
				661	if match:
				662	scheme = match.group(1)
				663	return scheme.lower(), url[len(scheme) + 1:]
				664	return None, url
				665
				666	_hostprog = None
				667	def splithost(url):
				668	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				669	global _hostprog
				670	if _hostprog is None:
				671	import re
				672	_hostprog = re.compile('^//([^/?])(.)$')
				673
				674	match = _hostprog.match(url)
				675	if match: return match.group(1, 2)
				676	return None, url
				677
				678	_userprog = None
				679	def splituser(host):
				680	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				681	global _userprog
				682	if _userprog is None:
				683	import re
				684	_userprog = re.compile('^(.)@(.)$')
				685
				686	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	687	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	688	return None, host
				689
				690	_passwdprog = None
				691	def splitpasswd(user):
				692	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				693	global _passwdprog
				694	if _passwdprog is None:
				695	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	696	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	697
				698	match = _passwdprog.match(user)
				699	if match: return match.group(1, 2)
				700	return user, None
				701
				702	# splittag('/path#tag') --> '/path', 'tag'
				703	_portprog = None
				704	def splitport(host):
				705	"""splitport('host:port') --> 'host', 'port'."""
				706	global _portprog
				707	if _portprog is None:
				708	import re
				709	_portprog = re.compile('^(.*):([0-9]+)$')
				710
				711	match = _portprog.match(host)
				712	if match: return match.group(1, 2)
				713	return host, None
				714
				715	_nportprog = None
				716	def splitnport(host, defport=-1):
				717	"""Split host and port, returning numeric port.
				718	Return given default port if no ':' found; defaults to -1.
				719	Return numerical port if a valid number are found after ':'.
				720	Return None if ':' but not a valid number."""
				721	global _nportprog
				722	if _nportprog is None:
				723	import re
				724	_nportprog = re.compile('^(.):(.)$')
				725
				726	match = _nportprog.match(host)
				727	if match:
				728	host, port = match.group(1, 2)
				729	try:
				730	if not port: raise ValueError("no digits")
				731	nport = int(port)
				732	except ValueError:
				733	nport = None
				734	return host, nport
				735	return host, defport
				736
				737	_queryprog = None
				738	def splitquery(url):
				739	"""splitquery('/path?query') --> '/path', 'query'."""
				740	global _queryprog
				741	if _queryprog is None:
				742	import re
				743	_queryprog = re.compile('^(.)\?([^?])$')
				744
				745	match = _queryprog.match(url)
				746	if match: return match.group(1, 2)
				747	return url, None
				748
				749	_tagprog = None
				750	def splittag(url):
				751	"""splittag('/path#tag') --> '/path', 'tag'."""
				752	global _tagprog
				753	if _tagprog is None:
				754	import re
				755	_tagprog = re.compile('^(.)#([^#])$')
				756
				757	match = _tagprog.match(url)
				758	if match: return match.group(1, 2)
				759	return url, None
				760
				761	def splitattr(url):
				762	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				763	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				764	words = url.split(';')
				765	return words[0], words[1:]
				766
				767	_valueprog = None
				768	def splitvalue(attr):
				769	"""splitvalue('attr=value') --> 'attr', 'value'."""
				770	global _valueprog
				771	if _valueprog is None:
				772	import re
				773	_valueprog = re.compile('^([^=])=(.)$')
				774
				775	match = _valueprog.match(attr)
				776	if match: return match.group(1, 2)
				777	return attr, None