Blame - Lib/urllib/parse.py - platform/external/python/cpython2

blob: 8bba1500e35b76d97c8a0d32cba63366c9a698ae [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame^]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
				8	RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
				9	and L. Masinter, December 1999.
				10
				11	RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
				12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				13
				14	RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
				15
				16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				17	1995.
				18
				19	RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
				20	McCahill, December 1994
				21
				22	RFC 3986 is considered the current standard and any changes to urlparse module
				23	should conform to this. urlparse module is not entirely compliant with this.
				24	The de facto scenarios of parsing are sometimes considered, and for backward
				25	compatibility purposes, older RFC uses of parsing are retained. The test cases in
				26	test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	27	"""
				28
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	29	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	30	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	31
# Names made available by ``from <this module> import *``.
# NOTE(review): urlencode() is defined further down in this file but is not
# listed here -- confirm whether that omission is intentional.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	36
				37	# A classification of schemes ('' means apply by default)
				38	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				39	'wais', 'file', 'https', 'shttp', 'mms',
				40	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				41	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				42	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				43	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	44	'svn', 'svn+ssh', 'sftp','nfs']
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	45	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				46	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				47	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				48	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				49	'mms', '', 'sftp']
				50	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				51	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				52	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				53	'nntp', 'wais', 'https', 'shttp', 'snews',
				54	'file', 'prospero', '']
				55
				56	# Characters valid in scheme names
				57	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				58	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				59	'0123456789'
				60	'+-.')
				61
				62	MAX_CACHE_SIZE = 20
				63	_parse_cache = {}
				64
				65	def clear_cache():
				66	"""Clear the parse cache."""
				67	_parse_cache.clear()
				68
				69
				70	class ResultMixin(object):
				71	"""Shared methods for the parsed result objects."""
				72
				73	@property
				74	def username(self):
				75	netloc = self.netloc
				76	if "@" in netloc:
				77	userinfo = netloc.rsplit("@", 1)[0]
				78	if ":" in userinfo:
				79	userinfo = userinfo.split(":", 1)[0]
				80	return userinfo
				81	return None
				82
				83	@property
				84	def password(self):
				85	netloc = self.netloc
				86	if "@" in netloc:
				87	userinfo = netloc.rsplit("@", 1)[0]
				88	if ":" in userinfo:
				89	return userinfo.split(":", 1)[1]
				90	return None
				91
				92	@property
				93	def hostname(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	94	netloc = self.netloc
				95	if "@" in netloc:
				96	netloc = netloc.rsplit("@", 1)[1]
				97	if ":" in netloc:
				98	netloc = netloc.split(":", 1)[0]
				99	return netloc.lower() or None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	100
				101	@property
				102	def port(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	103	netloc = self.netloc
				104	if "@" in netloc:
				105	netloc = netloc.rsplit("@", 1)[1]
				106	if ":" in netloc:
				107	port = netloc.split(":", 1)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	108	return int(port, 10)
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	109	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	110
				111	from collections import namedtuple
				112
				113	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				114
				115	__slots__ = ()
				116
				117	def geturl(self):
				118	return urlunsplit(self)
				119
				120
				121	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				122
				123	__slots__ = ()
				124
				125	def geturl(self):
				126	return urlunparse(self)
				127
				128
				129	def urlparse(url, scheme='', allow_fragments=True):
				130	"""Parse a URL into 6 components:
				131	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				132	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				133	Note that we don't break the components up in smaller bits
				134	(e.g. netloc is a single string) and we don't expand % escapes."""
				135	tuple = urlsplit(url, scheme, allow_fragments)
				136	scheme, netloc, url, query, fragment = tuple
				137	if scheme in uses_params and ';' in url:
				138	url, params = _splitparams(url)
				139	else:
				140	params = ''
				141	return ParseResult(scheme, netloc, url, params, query, fragment)
				142
				143	def _splitparams(url):
				144	if '/' in url:
				145	i = url.find(';', url.rfind('/'))
				146	if i < 0:
				147	return url, ''
				148	else:
				149	i = url.find(';')
				150	return url[:i], url[i+1:]
				151
				152	def _splitnetloc(url, start=0):
				153	delim = len(url) # position of end of domain part of url, default is end
				154	for c in '/?#': # look for delimiters; the order is NOT important
				155	wdelim = url.find(c, start) # find first of this delim
				156	if wdelim >= 0: # if found
				157	delim = min(delim, wdelim) # use earliest delim position
				158	return url[start:delim], url[delim:] # return (domain, rest)
				159
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the value used when *url* does not start with a scheme of
    its own.  With a false *allow_fragments*, '#' is left in place instead
    of starting a fragment.  Results are memoised in _parse_cache."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key as well as their values.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    # i > 0: there is a ':' and at least one character in front of it.
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # Fast path: fragment and query are split without consulting
            # the uses_fragment / uses_query tables.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # The text before ':' is a scheme only if every character is in
        # scheme_chars; the else clause runs when the loop never breaks.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    # A leading '//' introduces the netloc, whatever the scheme is.
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    # Fragment first, then query, each only for schemes that use them.
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
				202
				203	def urlunparse(components):
				204	"""Put a parsed URL back together again. This may result in a
				205	slightly different, but equivalent URL, if the URL that was parsed
				206	originally had redundant delimiters, e.g. a ? with an empty query
				207	(the draft states that these are equivalent)."""
				208	scheme, netloc, url, params, query, fragment = components
				209	if params:
				210	url = "%s;%s" % (url, params)
				211	return urlunsplit((scheme, netloc, url, query, fragment))
				212
				213	def urlunsplit(components):
				214	scheme, netloc, url, query, fragment = components
				215	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				216	if url and url[:1] != '/': url = '/' + url
				217	url = '//' + (netloc or '') + url
				218	if scheme:
				219	url = scheme + ':' + url
				220	if query:
				221	url = url + '?' + query
				222	if fragment:
				223	url = url + '#' + fragment
				224	return url
				225
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *allow_fragments* is passed through to urlparse() for both URLs."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for a url that has no scheme itself.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that is not in uses_relative: url is
    # returned exactly as given.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # url has its own netloc: nothing of the base beyond the scheme
        # is used.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # A path starting with '/' replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: take over the base path and, when url carries no
        # params of its own, the base params too.
        path = bpath
        if not params:
            params = bparams
        else:
            # url has its own params: the inherited path loses its final
            # character and the result is returned with url's own query.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: everything of the base path up to its last '/'
    # followed by the segments of the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty final segment; all other '.'
    # segments are dropped.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete a "name/.." pair, restarting the scan after every
    # deletion.  The scan skips the first and the last segment, and a '..'
    # is kept when the segment in front of it is '' or '..'.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' left in final position is handled separately.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
				282
				283	def urldefrag(url):
				284	"""Removes any existing fragment from URL.
				285
				286	Returns a tuple of the defragmented URL and the fragment. If
				287	the URL contained no fragments, the second element is the
				288	empty string.
				289	"""
				290	if '#' in url:
				291	s, n, p, a, q, frag = urlparse(url)
				292	defrag = urlunparse((s, n, p, a, q, ''))
				293	return defrag, frag
				294	else:
				295	return url, ''
				296
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object; the result is bytes.
    Only a '%' followed by exactly two hexadecimal digits is decoded; any
    other '%' is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    res = string.split(b'%')
    # res[0] is the text in front of the first '%' and is kept as it is.
    for i in range(1, len(res)):
        item = res[i]
        code = item[:2]
        # Check the digits explicitly: int() on its own would also accept
        # a single digit, a sign or surrounding blanks ('a', '+1', ' 1').
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            res[i] = bytes([int(code, 16)]) + item[2:]
        else:
            res[i] = b'%' + item
    return b''.join(res)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	312
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is copied
    to the result unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    # Text in front of the first '%' is taken over verbatim.
    out = [pieces[0]]
    # Byte values of the current run of adjacent %xx escapes.  They are
    # decoded together because one character may span several escapes.
    pending = []
    for item in pieces[1:]:
        code = item[:2]
        # Validate the digits explicitly; bytes.fromhex() would skip
        # whitespace and so swallow input such as '% ' or '%  x'.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pending.append(int(code, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the run of escapes: decode it, then
            # append the text that followed.
            if pending:
                out.append(bytes(pending).decode(encoding, errors))
                pending = []
            out.append(rest)
    if pending:
        # The string ended in the middle of a run of escapes.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
				353
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, fields with an empty value are kept and
        reported as empty strings; when false (the default) such fields
        are left out as if they had not been sent.

    strict_parsing: when true, a malformed field raises ValueError; when
        false (the default) malformed fields are silently skipped.

    Values of a name that occurs several times are collected in order of
    appearance.
    """
    collected = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        collected.setdefault(name, []).append(value)
    return collected
				379
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, fields with an empty value are kept and
        reported as empty strings; when false (the default) such fields
        are left out as if they had not been sent.

    strict_parsing: when true, a field without '=' raises ValueError; when
        false (the default) such fields are skipped, or kept with an empty
        value if keep_blank_values is true.

    Fields are separated by '&' or ';'.  In names and values '+' is turned
    into a space before % escapes are decoded.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without an equal sign.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))
    return result
				419
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Same as unquote(), except that every '+' is first turned into a
    space, which is what decoding HTML form values requires.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				428
				429	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				430	b'abcdefghijklmnopqrstuvwxyz'
				431	b'0123456789'
				432	b'_.-')
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	433	_safe_quoters= {}
				434
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	435	class Quoter(collections.defaultdict):
				436	"""A mapping from bytes (in range(0,256)) to strings.
				437
				438	String values are percent-encoded byte values, unless the key < 128, and
				439	in the "safe" set (either the specified safe set, or default set).
				440	"""
				441	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				442	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	443	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	444	"""safe: bytes object."""
				445	self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	446
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	447	def __repr__(self):
				448	# Without this, will just display as a defaultdict
				449	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	450
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	451	def __missing__(self, b):
				452	# Handle a cache miss. Store quoted string in cache and return.
				453	res = b in self.safe and chr(b) or ('%%%02X' % b)
				454	self[b] = res
				455	return res
				456
				457	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	458	"""quote('abc def') -> 'abc%20def'
				459
				460	Each part of a URL, e.g. the path info, the query, etc., has a
				461	different set of reserved characters that must be quoted.
				462
				463	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				464	the following reserved characters.
				465
				466	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				467	"$" \| ","
				468
				469	Each of these characters is reserved in some component of a URL,
				470	but not necessarily in all of them.
				471
				472	By default, the quote function is intended for quoting the path
				473	section of a URL. Thus, it will not encode '/'. This character
				474	is reserved, but in typical usage the quote function is being
				475	called on a path where the existing slash characters are used as
				476	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	477
				478	string and safe may be either str or bytes objects. encoding must
				479	not be specified if string is a str.
				480
				481	The optional encoding and errors parameters specify how to deal with
				482	non-ASCII characters, as accepted by the str.encode method.
				483	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				484	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	485	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	486	if isinstance(string, str):
				487	if encoding is None:
				488	encoding = 'utf-8'
				489	if errors is None:
				490	errors = 'strict'
				491	string = string.encode(encoding, errors)
				492	else:
				493	if encoding is not None:
				494	raise TypeError("quote() doesn't support 'encoding' for bytes")
				495	if errors is not None:
				496	raise TypeError("quote() doesn't support 'errors' for bytes")
				497	return quote_from_bytes(string, safe)
				498
				499	def quote_plus(string, safe='', encoding=None, errors=None):
				500	"""Like quote(), but also replace ' ' with '+', as required for quoting
				501	HTML form values. Plus signs in the original string are escaped unless
				502	they are included in safe. It also does not have safe default to '/'.
				503	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	504	# Check if ' ' in string, where string may either be a str or bytes. If
				505	# there are no spaces, the regular quote will produce the right answer.
				506	if ((isinstance(string, str) and ' ' not in string) or
				507	(isinstance(string, bytes) and b' ' not in string)):
				508	return quote(string, safe, encoding, errors)
				509	if isinstance(safe, str):
				510	space = ' '
				511	else:
				512	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	513	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	514	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	515
				516	def quote_from_bytes(bs, safe='/'):
				517	"""Like quote(), but accepts a bytes object rather than a str, and does
				518	not perform string-to-bytes encoding. It always returns an ASCII string.
				519	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				520	"""
				521	if isinstance(safe, str):
				522	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				523	safe = safe.encode('ascii', 'ignore')
				524	cachekey = bytes(safe) # In case it was a bytearray
				525	if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
				526	raise TypeError("quote_from_bytes() expected a bytes")
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	527	try:
				528	quoter = _safe_quoters[cachekey]
				529	except KeyError:
				530	quoter = Quoter(safe)
				531	_safe_quoters[cachekey] = quoter
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	532	return ''.join([quoter[char] for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	533
def urlencode(query, doseq=False):
    """Encode a mapping or a sequence of two-element tuples as a URL query
    string.

    With a true doseq, a value that is itself a sequence yields one
    'name=element' parameter per element.

    For a sequence of two-element tuples the parameters appear in the
    output in the same order as in the input.

    Raises TypeError when query is neither a mapping nor a suitable
    sequence.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, which is why
        # the first element is inspected.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple check on its first element.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Empty sequences of any type pass this test.  That is kept on
            # purpose, for consistency with empty dicts being accepted.
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    encoded = []
    if not doseq:
        for key, value in query:
            encoded.append(quote_plus(str(key)) + '=' +
                           quote_plus(str(value)))
        return '&'.join(encoded)

    for key, value in query:
        key = quote_plus(str(key))
        if isinstance(value, str):
            encoded.append(key + '=' + quote_plus(value))
            continue
        try:
            # Having a length is what counts as being a sequence here.
            len(value)
        except TypeError:
            # Not a sequence: encode its string form.
            encoded.append(key + '=' + quote_plus(str(value)))
        else:
            for elt in value:
                encoded.append(key + '=' + quote_plus(str(elt)))
    return '&'.join(encoded)
				589
				590	# Utilities to parse URLs (most of these return None for missing parts):
				591	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				592	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				593	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				594	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				595	# splitpasswd('user:passwd') -> 'user', 'passwd'
				596	# splitport('host:port') --> 'host', 'port'
				597	# splitquery('/path?query') --> '/path', 'query'
				598	# splittag('/path#tag') --> '/path', 'tag'
				599	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				600	# '/path', ['attr1=value1', 'attr2=value2', ...]
				601	# splitvalue('attr=value') --> 'attr', 'value'
				602	# urllib.parse.unquote('abc%20def') -> 'abc def'
				603	# quote('abc def') -> 'abc%20def'
				604
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked for being pure ASCII and handed back as a str; a
    UnicodeError is raised when it is not.  Any other object is returned
    unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				617
				618	def unwrap(url):
				619	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				620	url = str(url).strip()
				621	if url[:1] == '<' and url[-1:] == '>':
				622	url = url[1:-1].strip()
				623	if url[:4] == 'URL:': url = url[4:].strip()
				624	return url
				625
				626	_typeprog = None
				627	def splittype(url):
				628	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				629	global _typeprog
				630	if _typeprog is None:
				631	import re
				632	_typeprog = re.compile('^([^/:]+):')
				633
				634	match = _typeprog.match(url)
				635	if match:
				636	scheme = match.group(1)
				637	return scheme.lower(), url[len(scheme) + 1:]
				638	return None, url
				639
				640	_hostprog = None
				641	def splithost(url):
				642	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				643	global _hostprog
				644	if _hostprog is None:
				645	import re
				646	_hostprog = re.compile('^//([^/?])(.)$')
				647
				648	match = _hostprog.match(url)
				649	if match: return match.group(1, 2)
				650	return None, url
				651
				652	_userprog = None
				653	def splituser(host):
				654	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				655	global _userprog
				656	if _userprog is None:
				657	import re
				658	_userprog = re.compile('^(.)@(.)$')
				659
				660	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	661	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	662	return None, host
				663
				664	_passwdprog = None
				665	def splitpasswd(user):
				666	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				667	global _passwdprog
				668	if _passwdprog is None:
				669	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	670	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	671
				672	match = _passwdprog.match(user)
				673	if match: return match.group(1, 2)
				674	return user, None
				675
				676	# splittag('/path#tag') --> '/path', 'tag'
				677	_portprog = None
				678	def splitport(host):
				679	"""splitport('host:port') --> 'host', 'port'."""
				680	global _portprog
				681	if _portprog is None:
				682	import re
				683	_portprog = re.compile('^(.*):([0-9]+)$')
				684
				685	match = _portprog.match(host)
				686	if match: return match.group(1, 2)
				687	return host, None
				688
				689	_nportprog = None
				690	def splitnport(host, defport=-1):
				691	"""Split host and port, returning numeric port.
				692	Return given default port if no ':' found; defaults to -1.
				693	Return numerical port if a valid number are found after ':'.
				694	Return None if ':' but not a valid number."""
				695	global _nportprog
				696	if _nportprog is None:
				697	import re
				698	_nportprog = re.compile('^(.):(.)$')
				699
				700	match = _nportprog.match(host)
				701	if match:
				702	host, port = match.group(1, 2)
				703	try:
				704	if not port: raise ValueError("no digits")
				705	nport = int(port)
				706	except ValueError:
				707	nport = None
				708	return host, nport
				709	return host, defport
				710
				711	_queryprog = None
				712	def splitquery(url):
				713	"""splitquery('/path?query') --> '/path', 'query'."""
				714	global _queryprog
				715	if _queryprog is None:
				716	import re
				717	_queryprog = re.compile('^(.)\?([^?])$')
				718
				719	match = _queryprog.match(url)
				720	if match: return match.group(1, 2)
				721	return url, None
				722
				723	_tagprog = None
				724	def splittag(url):
				725	"""splittag('/path#tag') --> '/path', 'tag'."""
				726	global _tagprog
				727	if _tagprog is None:
				728	import re
				729	_tagprog = re.compile('^(.)#([^#])$')
				730
				731	match = _tagprog.match(url)
				732	if match: return match.group(1, 2)
				733	return url, None
				734
				735	def splitattr(url):
				736	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				737	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				738	words = url.split(';')
				739	return words[0], words[1:]
				740
				741	_valueprog = None
				742	def splitvalue(attr):
				743	"""splitvalue('attr=value') --> 'attr', 'value'."""
				744	global _valueprog
				745	if _valueprog is None:
				746	import re
				747	_valueprog = re.compile('^([^=])=(.)$')
				748
				749	match = _valueprog.match(attr)
				750	if match: return match.group(1, 2)
				751	return attr, None
				752
# Built-in data for test().  The first non-blank line is the base URL.  Every
# following line reads "<relative url> = <URL:expected>", where the expected
# value is the result of joining the relative URL with that base.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
				786
				787	def test():
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	788	base = ''
				789	if sys.argv[1:]:
				790	fn = sys.argv[1]
				791	if fn == '-':
				792	fp = sys.stdin
				793	else:
				794	fp = open(fn)
				795	else:
				796	from io import StringIO
				797	fp = StringIO(test_input)
				798	for line in fp:
				799	words = line.split()
				800	if not words:
				801	continue
				802	url = words[0]
				803	parts = urlparse(url)
				804	print('%-10s : %s' % (url, parts))
				805	abs = urljoin(base, url)
				806	if not base:
				807	base = abs
				808	wrapped = '<URL:%s>' % abs
				809	print('%-10s = %s' % (url, wrapped))
				810	if len(words) == 3 and words[1] == '=':
				811	if wrapped != words[2]:
				812	print('EXPECTED', words[2], '!!!!!!!!!!')
				813
				814	if __name__ == '__main__':
				815	test()