"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine,
June 1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter,
M. McCahill, December 1994.

RFC 3986 is considered the current standard, and any changes to the urlparse
module should conform to it.  The module is not yet entirely compliant with
RFC 3986: de facto parsing behavior is honored in some cases, and parsing
rules from the older RFCs are retained for backward compatibility.  The test
cases in test_urlparse.py provide a good indicator of parsing behavior.
"""

import sys
import collections

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            netloc = netloc.split(":", 1)[0]
        return netloc.lower() or None

    @property
    def port(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            port = netloc.split(":", 1)[1]
            return int(port, 10)
        return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
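
# A hedged usage sketch (not part of the original module).  urlparse() returns
# the 6-tuple described above; ';params' is only split out of the path for
# schemes listed in uses_params (output wrapped here for readability):
#   >>> urlparse('http://netloc/path;param?query=arg#frag')
#   ParseResult(scheme='http', netloc='netloc', path='/path', params='param',
#               query='query=arg', fragment='frag')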

def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
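
# A hedged usage sketch (not part of the original module).  urlsplit() is like
# urlparse() but never splits off ';params', so they remain in the path
# (output wrapped here for readability):
#   >>> urlsplit('http://netloc/path;param?query=arg#frag')
#   SplitResult(scheme='http', netloc='netloc', path='/path;param',
#               query='query=arg', fragment='frag')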

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = components
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(components):
    scheme, netloc, url, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
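
# A hedged round-trip sketch (not part of the original module): urlunsplit()
# and urlunparse() rebuild a URL from the tuples produced above.
#   >>> urlunsplit(('http', 'netloc', '/path', 'query=arg', 'frag'))
#   'http://netloc/path?query=arg#frag'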

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
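
# A hedged usage sketch (not part of the original module); these mirror cases
# from the RFC 1808 derived test_input table at the bottom of this module.
#   >>> urljoin('http://a/b/c/d', 'g')
#   'http://a/b/c/g'
#   >>> urljoin('http://a/b/c/d', '../g')
#   'http://a/b/g'
#   >>> urljoin('http://a/b/c/d', '//g')
#   'http://g'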

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
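
# A hedged usage sketch (not part of the original module):
#   >>> urldefrag('http://a/b/c/d#frag')
#   ('http://a/b/c/d', 'frag')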

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8.  This is only an issue if the string
    # contains unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    res[0] = res[0]
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = bytes([int(item[:2], 16)]) + item[2:]
        except ValueError:
            res[i] = b'%' + item
    return b''.join(res)
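
# A hedged usage sketch (not part of the original module): percent escapes
# decode to raw bytes, so multi-byte UTF-8 sequences stay undecoded here.
#   >>> unquote_to_bytes('a%C3%A9b')
#   b'a\xc3\xa9b'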

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None: encoding = 'utf-8'
    if errors is None: errors = 'replace'
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    # (list of single-byte bytes objects)
    pct_sequence = []
    res = string.split('%')
    for i in range(1, len(res)):
        item = res[i]
        try:
            if not item: raise ValueError
            pct_sequence.append(bytes.fromhex(item[:2]))
            rest = item[2:]
        except ValueError:
            rest = '%' + item
        if not rest:
            # This segment was just a single percent-encoded character.
            # May be part of a sequence of code units, so delay decoding.
            # (Stored in pct_sequence).
            res[i] = ''
        else:
            # Encountered non-percent-encoded characters.  Flush the current
            # pct_sequence.
            res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
            pct_sequence = []
    if pct_sequence:
        # Flush the final pct_sequence
        # res[-1] will always be empty if pct_sequence != []
        assert not res[-1], "string=%r, res=%r" % (string, res)
        res[-1] = b''.join(pct_sequence).decode(encoding, errors)
    return ''.join(res)
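
# A hedged usage sketch (not part of the original module): contiguous escapes
# are decoded together, so multi-byte UTF-8 sequences come out as a single
# character; undecodable bytes (e.g. unquote('%E9')) become U+FFFD under the
# default errors='replace'.
#   >>> unquote('a%C3%A9b')
#   'aéb'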

def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result
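
# A hedged usage sketch (not part of the original module): repeated names
# collect into one list; blank values are dropped unless keep_blank_values
# is true.
#   >>> parse_qs('a=1&a=2&b=')
#   {'a': ['1', '2']}
#   >>> sorted(parse_qs('a=1&a=2&b=', keep_blank_values=True).items())
#   [('a', ['1', '2']), ('b', [''])]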

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
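
# A hedged usage sketch (not part of the original module): both '&' and ';'
# are accepted as pair separators, and plus signs decode to spaces.
#   >>> parse_qsl('a=1&b=two+words;c=%7e')
#   [('a', '1'), ('b', 'two words'), ('c', '~')]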

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key is < 128 and
    in the "safe" set (either the specified safe set, or the default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss.  Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding must
    not be specified if string is a str.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
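
# A hedged usage sketch (not part of the original module): '/' is safe by
# default, and non-ASCII text is UTF-8 encoded before percent-encoding.
#   >>> quote('/El Niño/')
#   '/El%20Ni%C3%B1o/'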

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
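
# A hedged usage sketch (not part of the original module):
#   >>> quote_plus('a b&c')
#   'a+b%26c'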

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    try:
        quoter = _safe_quoters[cachekey]
    except KeyError:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join([quoter[char] for char in bs])

def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts, that type of behavior probably should be
            # preserved for consistency.
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
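
# A hedged usage sketch (not part of the original module): values are
# stringified and quoted with quote_plus(); doseq expands sequence values
# into one pair per element.
#   >>> urlencode([('a', 1), ('b', 'x y')])
#   'a=1&b=x+y'
#   >>> urlencode([('k', ['v1', 'v2'])], doseq=True)
#   'k=v1&k=v2'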

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    # Return a tuple (not a lazy map object) so both branches match the
    # documented 'user', 'host' return shape.
    if match: return tuple(map(unquote, match.group(1, 2)))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
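
# A hedged usage sketch (not part of the original module):
#   >>> splitnport('example.com:8080')
#   ('example.com', 8080)
#   >>> splitnport('example.com')
#   ('example.com', -1)
#   >>> splitnport('example.com:x')
#   ('example.com', None)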

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y        = <URL:http://a/b/c/d?y>
      http:g?y       = <URL:http://a/b/c/g?y>
      http:g?y/./x   = <URL:http://a/b/c/g?y/./x>
"""

def test():
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED', words[2], '!!!!!!!!!!')

if __name__ == '__main__':
    test()