Blame - Lib/urllib/parse.py - platform/external/python/cpython2

blob: 1f54ac6f99a91db3530f5e96d186661a23d329fe [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	8	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	9	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				10
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	11	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	12
				13	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				14	1995.
				15
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	16	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	17	McCahill, December 1994
				18
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	19	RFC 3986 is considered the current standard and any future changes to
				20	urlparse module should conform with it. The urlparse module is
				21	currently not entirely compliant with this RFC due to defacto
				22	scenarios for parsing, and for backward compatibility purposes, some
				23	parsing quirks from older RFCs are retained. The testcases in
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	24	test_urlparse.py provides a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	25	"""
				26
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	27	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	28	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	29
# Names exported by ``from urllib.parse import *``.  urlencode() is part of
# the public API of this module and is therefore listed alongside the other
# public functions.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	34
				35	# A classification of schemes ('' means apply by default)
				36	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				37	'wais', 'file', 'https', 'shttp', 'mms',
				38	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				39	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				40	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				41	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Senthil Kumaran	d4cd188	2010-05-13 03:43:13 +0000	[diff] [blame]	42	'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh']
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	43	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				44	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				45	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				46	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				47	'mms', '', 'sftp']
				48	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				49	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				50	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				51	'nntp', 'wais', 'https', 'shttp', 'snews',
				52	'file', 'prospero', '']
				53
				54	# Characters valid in scheme names
				55	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				56	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				57	'0123456789'
				58	'+-.')
				59
				60	MAX_CACHE_SIZE = 20
				61	_parse_cache = {}
				62
				63	def clear_cache():
				64	"""Clear the parse cache."""
				65	_parse_cache.clear()
				66
				67
				68	class ResultMixin(object):
				69	"""Shared methods for the parsed result objects."""
				70
				71	@property
				72	def username(self):
				73	netloc = self.netloc
				74	if "@" in netloc:
				75	userinfo = netloc.rsplit("@", 1)[0]
				76	if ":" in userinfo:
				77	userinfo = userinfo.split(":", 1)[0]
				78	return userinfo
				79	return None
				80
				81	@property
				82	def password(self):
				83	netloc = self.netloc
				84	if "@" in netloc:
				85	userinfo = netloc.rsplit("@", 1)[0]
				86	if ":" in userinfo:
				87	return userinfo.split(":", 1)[1]
				88	return None
				89
				90	@property
				91	def hostname(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	92	netloc = self.netloc
				93	if "@" in netloc:
				94	netloc = netloc.rsplit("@", 1)[1]
				95	if ":" in netloc:
				96	netloc = netloc.split(":", 1)[0]
				97	return netloc.lower() or None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	98
				99	@property
				100	def port(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	101	netloc = self.netloc
				102	if "@" in netloc:
				103	netloc = netloc.rsplit("@", 1)[1]
				104	if ":" in netloc:
				105	port = netloc.split(":", 1)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	106	return int(port, 10)
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	107	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	108
				109	from collections import namedtuple
				110
				111	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				112
				113	__slots__ = ()
				114
				115	def geturl(self):
				116	return urlunsplit(self)
				117
				118
				119	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				120
				121	__slots__ = ()
				122
				123	def geturl(self):
				124	return urlunparse(self)
				125
				126
				127	def urlparse(url, scheme='', allow_fragments=True):
				128	"""Parse a URL into 6 components:
				129	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				130	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				131	Note that we don't break the components up in smaller bits
				132	(e.g. netloc is a single string) and we don't expand % escapes."""
				133	tuple = urlsplit(url, scheme, allow_fragments)
				134	scheme, netloc, url, query, fragment = tuple
				135	if scheme in uses_params and ';' in url:
				136	url, params = _splitparams(url)
				137	else:
				138	params = ''
				139	return ParseResult(scheme, netloc, url, params, query, fragment)
				140
				141	def _splitparams(url):
				142	if '/' in url:
				143	i = url.find(';', url.rfind('/'))
				144	if i < 0:
				145	return url, ''
				146	else:
				147	i = url.find(';')
				148	return url[:i], url[i+1:]
				149
				150	def _splitnetloc(url, start=0):
				151	delim = len(url) # position of end of domain part of url, default is end
				152	for c in '/?#': # look for delimiters; the order is NOT important
				153	wdelim = url.find(c, start) # find first of this delim
				154	if wdelim >= 0: # if found
				155	delim = min(delim, wdelim) # use earliest delim position
				156	return url[start:delim], url[delim:] # return (domain, rest)
				157
				158	def urlsplit(url, scheme='', allow_fragments=True):
				159	"""Parse a URL into 5 components:
				160	<scheme>://<netloc>/<path>?<query>#<fragment>
				161	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				162	Note that we don't break the components up in smaller bits
				163	(e.g. netloc is a single string) and we don't expand % escapes."""
				164	allow_fragments = bool(allow_fragments)
				165	key = url, scheme, allow_fragments, type(url), type(scheme)
				166	cached = _parse_cache.get(key, None)
				167	if cached:
				168	return cached
				169	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				170	clear_cache()
				171	netloc = query = fragment = ''
				172	i = url.find(':')
				173	if i > 0:
				174	if url[:i] == 'http': # optimize the common case
				175	scheme = url[:i].lower()
				176	url = url[i+1:]
				177	if url[:2] == '//':
				178	netloc, url = _splitnetloc(url, 2)
				179	if allow_fragments and '#' in url:
				180	url, fragment = url.split('#', 1)
				181	if '?' in url:
				182	url, query = url.split('?', 1)
				183	v = SplitResult(scheme, netloc, url, query, fragment)
				184	_parse_cache[key] = v
				185	return v
				186	for c in url[:i]:
				187	if c not in scheme_chars:
				188	break
				189	else:
				190	scheme, url = url[:i].lower(), url[i+1:]
Senthil Kumaran	a8dbb24	2010-02-19 07:45:03 +0000	[diff] [blame]	191	if url[:2] == '//':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	192	netloc, url = _splitnetloc(url, 2)
				193	if allow_fragments and scheme in uses_fragment and '#' in url:
				194	url, fragment = url.split('#', 1)
				195	if scheme in uses_query and '?' in url:
				196	url, query = url.split('?', 1)
				197	v = SplitResult(scheme, netloc, url, query, fragment)
				198	_parse_cache[key] = v
				199	return v
				200
				201	def urlunparse(components):
				202	"""Put a parsed URL back together again. This may result in a
				203	slightly different, but equivalent URL, if the URL that was parsed
				204	originally had redundant delimiters, e.g. a ? with an empty query
				205	(the draft states that these are equivalent)."""
				206	scheme, netloc, url, params, query, fragment = components
				207	if params:
				208	url = "%s;%s" % (url, params)
				209	return urlunsplit((scheme, netloc, url, query, fragment))
				210
				211	def urlunsplit(components):
Senthil Kumaran	930049b	2010-06-28 14:12:18 +0000	[diff] [blame]	212	"""Combine the elements of a tuple as returned by urlsplit() into a
				213	complete URL as a string. The data argument can be any five-item iterable.
				214	This may result in a slightly different, but equivalent URL, if the URL that
				215	was parsed originally had unnecessary delimiters (for example, a ? with an
				216	empty query; the RFC states that these are equivalent)."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	217	scheme, netloc, url, query, fragment = components
				218	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				219	if url and url[:1] != '/': url = '/' + url
				220	url = '//' + (netloc or '') + url
				221	if scheme:
				222	url = scheme + ':' + url
				223	if query:
				224	url = url + '?' + query
				225	if fragment:
				226	url = url + '#' + fragment
				227	return url
				228
				229	def urljoin(base, url, allow_fragments=True):
				230	"""Join a base URL and a possibly relative URL to form an absolute
				231	interpretation of the latter."""
				232	if not base:
				233	return url
				234	if not url:
				235	return base
				236	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
				237	urlparse(base, '', allow_fragments)
				238	scheme, netloc, path, params, query, fragment = \
				239	urlparse(url, bscheme, allow_fragments)
				240	if scheme != bscheme or scheme not in uses_relative:
				241	return url
				242	if scheme in uses_netloc:
				243	if netloc:
				244	return urlunparse((scheme, netloc, path,
				245	params, query, fragment))
				246	netloc = bnetloc
				247	if path[:1] == '/':
				248	return urlunparse((scheme, netloc, path,
				249	params, query, fragment))
Facundo Batista	23e3856	2008-08-14 16:55:14 +0000	[diff] [blame]	250	if not path:
				251	path = bpath
				252	if not params:
				253	params = bparams
				254	else:
				255	path = path[:-1]
				256	return urlunparse((scheme, netloc, path,
				257	params, query, fragment))
				258	if not query:
				259	query = bquery
				260	return urlunparse((scheme, netloc, path,
				261	params, query, fragment))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	262	segments = bpath.split('/')[:-1] + path.split('/')
				263	# XXX The stuff below is bogus in various ways...
				264	if segments[-1] == '.':
				265	segments[-1] = ''
				266	while '.' in segments:
				267	segments.remove('.')
				268	while 1:
				269	i = 1
				270	n = len(segments) - 1
				271	while i < n:
				272	if (segments[i] == '..'
				273	and segments[i-1] not in ('', '..')):
				274	del segments[i-1:i+1]
				275	break
				276	i = i+1
				277	else:
				278	break
				279	if segments == ['', '..']:
				280	segments[-1] = ''
				281	elif len(segments) >= 2 and segments[-1] == '..':
				282	segments[-2:] = ['']
				283	return urlunparse((scheme, netloc, '/'.join(segments),
				284	params, query, fragment))
				285
				286	def urldefrag(url):
				287	"""Removes any existing fragment from URL.
				288
				289	Returns a tuple of the defragmented URL and the fragment. If
				290	the URL contained no fragments, the second element is the
				291	empty string.
				292	"""
				293	if '#' in url:
				294	s, n, p, a, q, frag = urlparse(url)
				295	defrag = urlunparse((s, n, p, a, q, ''))
				296	return defrag, frag
				297	else:
				298	return url, ''
				299
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object.  Each '%' followed by
    exactly two hexadecimal digits is replaced by the corresponding byte;
    any other '%' is left in the output unchanged.  Always returns bytes.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    head, *escapes = string.split(b'%')
    pieces = [head]
    for item in escapes:
        code = item[:2]
        # Check the two digits explicitly: int() alone would also accept a
        # single digit, a sign or surrounding whitespace ("%a", "%+1").
        if (len(code) == 2
                and code[0] in hexdigits and code[1] in hexdigits):
            pieces.append(bytes([int(bytes(code), 16)]))
            pieces.append(item[2:])
        else:
            pieces.append(b'%')
            pieces.append(item)
    return b''.join(pieces)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	315
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a placeholder
    character.  A '%' that is not followed by two hexadecimal digits is
    kept literally.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = frozenset('0123456789ABCDEFabcdef')
    head, *escapes = string.split('%')
    pieces = [head]
    # Bytes of a run of adjacent %xx escapes.  They are decoded together
    # because a single character may be spread over several escapes.
    pending = bytearray()
    for item in escapes:
        code = item[:2]
        # Validate the digits explicitly; bytes.fromhex() would silently
        # skip whitespace and so swallow input such as "%  ".
        if (len(code) == 2
                and code[0] in hexdigits and code[1] in hexdigits):
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the current run of escapes: flush it.
            pieces.append(bytes(pending).decode(encoding, errors))
            pieces.append(rest)
            pending = bytearray()
    if pending:
        # The string ended with a run of escapes.
        pieces.append(bytes(pending).decode(encoding, errors))
    return ''.join(pieces)
				356
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in URL encoded
        queries should be treated as blank strings.  A true value keeps
        them as blank strings; the default false value drops them as if
        they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true, errors
        raise a ValueError exception.

    Values for a repeated name are collected in order of appearance.
    """
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        parsed.setdefault(name, []).append(value)
    return parsed
				382
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in URL encoded
        queries should be treated as blank strings.  A true value keeps
        them as blank strings; the default false value drops them as if
        they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true, errors
        raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both name and value, '+' is
    turned into a space and % escapes are expanded with unquote().
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control-name with no equal sign counts as a blank value.
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                name = unquote(name.replace('+', ' '))
                value = unquote(value.replace('+', ' '))
                result.append((name, value))
    return result
				422
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required
    for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Plus signs are translated first, so an escaped "%2B" still decodes
    # to a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
				431
				432	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				433	b'abcdefghijklmnopqrstuvwxyz'
				434	b'0123456789'
				435	b'_.-')
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	436	_safe_quoters= {}
				437
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	438	class Quoter(collections.defaultdict):
				439	"""A mapping from bytes (in range(0,256)) to strings.
				440
				441	String values are percent-encoded byte values, unless the key < 128, and
				442	in the "safe" set (either the specified safe set, or default set).
				443	"""
				444	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				445	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	446	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	447	"""safe: bytes object."""
				448	self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	449
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	450	def __repr__(self):
				451	# Without this, will just display as a defaultdict
				452	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	453
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	454	def __missing__(self, b):
				455	# Handle a cache miss. Store quoted string in cache and return.
				456	res = b in self.safe and chr(b) or ('%%%02X' % b)
				457	self[b] = res
				458	return res
				459
				460	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	461	"""quote('abc def') -> 'abc%20def'
				462
				463	Each part of a URL, e.g. the path info, the query, etc., has a
				464	different set of reserved characters that must be quoted.
				465
				466	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				467	the following reserved characters.
				468
				469	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				470	"$" \| ","
				471
				472	Each of these characters is reserved in some component of a URL,
				473	but not necessarily in all of them.
				474
				475	By default, the quote function is intended for quoting the path
				476	section of a URL. Thus, it will not encode '/'. This character
				477	is reserved, but in typical usage the quote function is being
				478	called on a path where the existing slash characters are used as
				479	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	480
				481	string and safe may be either str or bytes objects. encoding must
				482	not be specified if string is a str.
				483
				484	The optional encoding and errors parameters specify how to deal with
				485	non-ASCII characters, as accepted by the str.encode method.
				486	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				487	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	488	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	489	if isinstance(string, str):
				490	if encoding is None:
				491	encoding = 'utf-8'
				492	if errors is None:
				493	errors = 'strict'
				494	string = string.encode(encoding, errors)
				495	else:
				496	if encoding is not None:
				497	raise TypeError("quote() doesn't support 'encoding' for bytes")
				498	if errors is not None:
				499	raise TypeError("quote() doesn't support 'errors' for bytes")
				500	return quote_from_bytes(string, safe)
				501
				502	def quote_plus(string, safe='', encoding=None, errors=None):
				503	"""Like quote(), but also replace ' ' with '+', as required for quoting
				504	HTML form values. Plus signs in the original string are escaped unless
				505	they are included in safe. It also does not have safe default to '/'.
				506	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	507	# Check if ' ' in string, where string may either be a str or bytes. If
				508	# there are no spaces, the regular quote will produce the right answer.
				509	if ((isinstance(string, str) and ' ' not in string) or
				510	(isinstance(string, bytes) and b' ' not in string)):
				511	return quote(string, safe, encoding, errors)
				512	if isinstance(safe, str):
				513	space = ' '
				514	else:
				515	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	516	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	517	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	518
				519	def quote_from_bytes(bs, safe='/'):
				520	"""Like quote(), but accepts a bytes object rather than a str, and does
				521	not perform string-to-bytes encoding. It always returns an ASCII string.
				522	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				523	"""
				524	if isinstance(safe, str):
				525	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				526	safe = safe.encode('ascii', 'ignore')
				527	cachekey = bytes(safe) # In case it was a bytearray
				528	if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
				529	raise TypeError("quote_from_bytes() expected a bytes")
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	530	try:
				531	quoter = _safe_quoters[cachekey]
				532	except KeyError:
				533	quoter = Quoter(safe)
				534	_safe_quoters[cachekey] = quoter
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	535	return ''.join([quoter[char] for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	536
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be bytes or any other object.  Bytes are passed to
    quote_plus() with safe only; everything else is converted with str()
    and passed to quote_plus() together with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a sequence of tuples.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequences fail in len(); non-empty strings fail the
            # tuple check.  Zero-length sequences of any type pass.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            traceback = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(traceback)

    def encode_part(part):
        # Quote one key, value or sequence element.
        if isinstance(part, bytes):
            return quote_plus(part, safe)
        return quote_plus(str(part), safe, encoding, errors)

    expand_sequences = bool(doseq)
    pairs = []
    for key, value in query:
        key = encode_part(key)
        if not expand_sequences:
            pairs.append(key + '=' + encode_part(value))
        elif isinstance(value, bytes):
            pairs.append(key + '=' + quote_plus(value, safe))
        elif isinstance(value, str):
            pairs.append(key + '=' + quote_plus(value, safe,
                                                encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + encode_part(value))
            else:
                # one parameter per element of the sequence
                for element in value:
                    pairs.append(key + '=' + encode_part(element))
    return '&'.join(pairs)
				614
				615	# Utilities to parse URLs (most of these return None for missing parts):
				616	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				617	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				618	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				619	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				620	# splitpasswd('user:passwd') -> 'user', 'passwd'
				621	# splitport('host:port') --> 'host', 'port'
				622	# splitquery('/path?query') --> '/path', 'query'
				623	# splittag('/path#tag') --> '/path', 'tag'
				624	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				625	# '/path', ['attr1=value1', 'attr2=value2', ...]
				626	# splitvalue('attr=value') --> 'attr', 'value'
				627	# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
				629
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite its name, this returns a str: a str argument is only checked
    for being pure ASCII, and any other object is returned unchanged.
    Raises UnicodeError if the str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				642
				643	def unwrap(url):
				644	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				645	url = str(url).strip()
				646	if url[:1] == '<' and url[-1:] == '>':
				647	url = url[1:-1].strip()
				648	if url[:4] == 'URL:': url = url[4:].strip()
				649	return url
				650
				651	_typeprog = None
				652	def splittype(url):
				653	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				654	global _typeprog
				655	if _typeprog is None:
				656	import re
				657	_typeprog = re.compile('^([^/:]+):')
				658
				659	match = _typeprog.match(url)
				660	if match:
				661	scheme = match.group(1)
				662	return scheme.lower(), url[len(scheme) + 1:]
				663	return None, url
				664
				665	_hostprog = None
				666	def splithost(url):
				667	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				668	global _hostprog
				669	if _hostprog is None:
				670	import re
				671	_hostprog = re.compile('^//([^/?])(.)$')
				672
				673	match = _hostprog.match(url)
				674	if match: return match.group(1, 2)
				675	return None, url
				676
				677	_userprog = None
				678	def splituser(host):
				679	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				680	global _userprog
				681	if _userprog is None:
				682	import re
				683	_userprog = re.compile('^(.)@(.)$')
				684
				685	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	686	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	687	return None, host
				688
				689	_passwdprog = None
				690	def splitpasswd(user):
				691	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				692	global _passwdprog
				693	if _passwdprog is None:
				694	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	695	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	696
				697	match = _passwdprog.match(user)
				698	if match: return match.group(1, 2)
				699	return user, None
				700
# splitport('host:port') --> 'host', 'port'
				702	_portprog = None
				703	def splitport(host):
				704	"""splitport('host:port') --> 'host', 'port'."""
				705	global _portprog
				706	if _portprog is None:
				707	import re
				708	_portprog = re.compile('^(.*):([0-9]+)$')
				709
				710	match = _portprog.match(host)
				711	if match: return match.group(1, 2)
				712	return host, None
				713
				714	_nportprog = None
				715	def splitnport(host, defport=-1):
				716	"""Split host and port, returning numeric port.
				717	Return given default port if no ':' found; defaults to -1.
				718	Return numerical port if a valid number are found after ':'.
				719	Return None if ':' but not a valid number."""
				720	global _nportprog
				721	if _nportprog is None:
				722	import re
				723	_nportprog = re.compile('^(.):(.)$')
				724
				725	match = _nportprog.match(host)
				726	if match:
				727	host, port = match.group(1, 2)
				728	try:
				729	if not port: raise ValueError("no digits")
				730	nport = int(port)
				731	except ValueError:
				732	nport = None
				733	return host, nport
				734	return host, defport
				735
				736	_queryprog = None
				737	def splitquery(url):
				738	"""splitquery('/path?query') --> '/path', 'query'."""
				739	global _queryprog
				740	if _queryprog is None:
				741	import re
				742	_queryprog = re.compile('^(.)\?([^?])$')
				743
				744	match = _queryprog.match(url)
				745	if match: return match.group(1, 2)
				746	return url, None
				747
				748	_tagprog = None
				749	def splittag(url):
				750	"""splittag('/path#tag') --> '/path', 'tag'."""
				751	global _tagprog
				752	if _tagprog is None:
				753	import re
				754	_tagprog = re.compile('^(.)#([^#])$')
				755
				756	match = _tagprog.match(url)
				757	if match: return match.group(1, 2)
				758	return url, None
				759
				760	def splitattr(url):
				761	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				762	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				763	words = url.split(';')
				764	return words[0], words[1:]
				765
				766	_valueprog = None
				767	def splitvalue(attr):
				768	"""splitvalue('attr=value') --> 'attr', 'value'."""
				769	global _valueprog
				770	if _valueprog is None:
				771	import re
				772	_valueprog = re.compile('^([^=])=(.)$')
				773
				774	match = _valueprog.match(attr)
				775	if match: return match.group(1, 2)
				776	return attr, None