Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: 3d541a72063406c6b1a46e25a840e030dbbc1e83 [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
				8	RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
				9	and L.Masinter, December 1999.
				10
				11	RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
				12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				13
				14	RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
				15
				16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				17	1995.
				18
				19	RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
				20	McCahill, December 1994
				21
				22	RFC 3986 is considered the current standard and any changes to urlparse module
				23	should conform to this. urlparse module is not entirely compliant with this.
				24	The defacto scenarios of parsing are considered sometimes and for backward
				25	compatibility purposes, older RFC uses of parsing are retained. The testcases in
				26	test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	27	"""
				28
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	29	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	30	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	31
# Names exported by "from <this module> import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
# urljoin() only resolves a reference whose scheme is listed here.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Consulted by urlunsplit() (emit '//' even for an empty netloc) and by
# urljoin() (inherit the base netloc).
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs']
# NOTE(review): not referenced by any code visible in this module.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# urlparse() splits ';params' off the path only for these schemes.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# urlsplit() splits '?query' off the path only for these schemes
# ('http' is handled by a separate fast path that always splits).
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# urlsplit() splits '#fragment' off the path only for these schemes
# ('http' is handled by a separate fast path that always splits).
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() calls clear_cache() once _parse_cache holds this many entries.
MAX_CACHE_SIZE = 20
# Memoized urlsplit() results, keyed on
# (url, scheme, allow_fragments, type(url), type(scheme)).
_parse_cache = {}
				64
				65	def clear_cache():
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	66	"""Clear the parse cache and the quoters cache."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	67	_parse_cache.clear()
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	68	_safe_quoters.clear()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	69
				70
				71	class ResultMixin(object):
				72	"""Shared methods for the parsed result objects."""
				73
				74	@property
				75	def username(self):
				76	netloc = self.netloc
				77	if "@" in netloc:
				78	userinfo = netloc.rsplit("@", 1)[0]
				79	if ":" in userinfo:
				80	userinfo = userinfo.split(":", 1)[0]
				81	return userinfo
				82	return None
				83
				84	@property
				85	def password(self):
				86	netloc = self.netloc
				87	if "@" in netloc:
				88	userinfo = netloc.rsplit("@", 1)[0]
				89	if ":" in userinfo:
				90	return userinfo.split(":", 1)[1]
				91	return None
				92
				93	@property
				94	def hostname(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	95	netloc = self.netloc.split('@')[-1]
				96	if '[' in netloc and ']' in netloc:
				97	return netloc.split(']')[0][1:].lower()
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	98	elif ':' in netloc:
				99	return netloc.split(':')[0].lower()
				100	elif netloc == '':
				101	return None
				102	else:
				103	return netloc.lower()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	104
				105	@property
				106	def port(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	107	netloc = self.netloc.split('@')[-1].split(']')[-1]
				108	if ':' in netloc:
				109	port = netloc.split(':')[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	110	return int(port, 10)
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	111	else:
				112	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	113
				114	from collections import namedtuple
				115
				116	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				117
				118	__slots__ = ()
				119
				120	def geturl(self):
				121	return urlunsplit(self)
				122
				123
				124	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				125
				126	__slots__ = ()
				127
				128	def geturl(self):
				129	return urlunparse(self)
				130
				131
				132	def urlparse(url, scheme='', allow_fragments=True):
				133	"""Parse a URL into 6 components:
				134	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				135	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				136	Note that we don't break the components up in smaller bits
				137	(e.g. netloc is a single string) and we don't expand % escapes."""
				138	tuple = urlsplit(url, scheme, allow_fragments)
				139	scheme, netloc, url, query, fragment = tuple
				140	if scheme in uses_params and ';' in url:
				141	url, params = _splitparams(url)
				142	else:
				143	params = ''
				144	return ParseResult(scheme, netloc, url, params, query, fragment)
				145
				146	def _splitparams(url):
				147	if '/' in url:
				148	i = url.find(';', url.rfind('/'))
				149	if i < 0:
				150	return url, ''
				151	else:
				152	i = url.find(';')
				153	return url[:i], url[i+1:]
				154
				155	def _splitnetloc(url, start=0):
				156	delim = len(url) # position of end of domain part of url, default is end
				157	for c in '/?#': # look for delimiters; the order is NOT important
				158	wdelim = url.find(c, start) # find first of this delim
				159	if wdelim >= 0: # if found
				160	delim = min(delim, wdelim) # use earliest delim position
				161	return url[start:delim], url[delim:] # return (domain, rest)
				162
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    url: the string to split.
    scheme: scheme to report when url does not start with a valid
        "<scheme>:" prefix.
    allow_fragments: if false, '#' is never treated as a fragment
        delimiter and the fragment field is always ''.

    Returns a SplitResult; results are memoized in _parse_cache.
    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # arguments of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # Fast path: no scheme validation and no lookups in the
            # uses_* tables.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # Reject a netloc with an unbalanced bracket pair.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # The text before the first ':' is a scheme only if every
        # character is in scheme_chars; otherwise the for loop breaks,
        # the else clause is skipped and url/scheme stay untouched.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Reject a netloc with an unbalanced bracket pair.
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    # Unlike the 'http' fast path, fragment and query are split off only
    # for schemes registered in uses_fragment / uses_query.
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
				211
				212	def urlunparse(components):
				213	"""Put a parsed URL back together again. This may result in a
				214	slightly different, but equivalent URL, if the URL that was parsed
				215	originally had redundant delimiters, e.g. a ? with an empty query
				216	(the draft states that these are equivalent)."""
				217	scheme, netloc, url, params, query, fragment = components
				218	if params:
				219	url = "%s;%s" % (url, params)
				220	return urlunsplit((scheme, netloc, url, query, fragment))
				221
				222	def urlunsplit(components):
				223	scheme, netloc, url, query, fragment = components
				224	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				225	if url and url[:1] != '/': url = '/' + url
				226	url = '//' + (netloc or '') + url
				227	if scheme:
				228	url = scheme + ':' + url
				229	if query:
				230	url = url + '?' + query
				231	if fragment:
				232	url = url + '#' + fragment
				233	return url
				234
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    base: the URL the reference is resolved against; if empty, url is
        returned unchanged.
    url: the reference; if empty, base is returned unchanged.
    allow_fragments: passed through to urlparse() for both arguments.

    Returns the joined URL as a string.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The reference is parsed with the base's scheme as its default.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that is not in uses_relative: the
    # reference is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own netloc; nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme/netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, then fall back to the base's
        # params and query where the reference has none.
        path = bpath
        if not params:
            params = bparams
        else:
            # The reference has params of its own: the last character of
            # the (base) path is dropped before reassembling.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment (keeps the trailing '/');
    # every other '.' segment is removed.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete the first "<name>/.." pair, where <name> is
    # neither '' nor '..'.  The scan skips the first and the last segment
    # and restarts after each deletion; it stops when a full scan finds
    # nothing to delete.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' in final position was not reachable by the scan above.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
				291
				292	def urldefrag(url):
				293	"""Removes any existing fragment from URL.
				294
				295	Returns a tuple of the defragmented URL and the fragment. If
				296	the URL contained no fragments, the second element is the
				297	empty string.
				298	"""
				299	if '#' in url:
				300	s, n, p, a, q, frag = urlparse(url)
				301	defrag = urlunparse((s, n, p, a, q, ''))
				302	return defrag, frag
				303	else:
				304	return url, ''
				305
# Byte values of the ASCII hexadecimal digits allowed in a %xx escape.
_HEX_DIGIT_BYTES = frozenset(b'0123456789ABCDEFabcdef')

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string: a str or bytes object.  A str is first encoded as UTF-8.

    Returns a bytes object in which every '%' followed by exactly two
    hexadecimal digits is replaced by the byte it encodes.  Any other
    '%' is left in place.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    # pieces[0] precedes the first '%' and is copied through unchanged.
    for i in range(1, len(pieces)):
        item = pieces[i]
        code = item[:2]
        # Check the digits explicitly: int() alone would also accept a
        # sign, surrounding whitespace or a single digit ('%+1', '%1').
        if len(code) == 2 and _HEX_DIGIT_BYTES.issuperset(code):
            pieces[i] = bytes([int(code, 16)]) + item[2:]
        else:
            pieces[i] = b'%' + item
    return b''.join(pieces)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	321
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    string: the str to unquote.
    encoding, errors: how to decode runs of percent-encoded bytes into
        characters, as accepted by the bytes.decode() method.  None
        selects the defaults: 'utf-8', and 'replace' (invalid sequences
        become a placeholder character).

    A '%' that is not followed by exactly two hexadecimal digits is left
    in place.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    chunks = string.split('%')
    out = [chunks[0]]
    # Byte values of the current run of adjacent %xx escapes.  They are
    # decoded together because one character may span several escapes.
    pending = []
    for chunk in chunks[1:]:
        code = chunk[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace
        # and would silently drop an invalid escape such as '%  '.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pending.append(int(code, 16))
            rest = chunk[2:]
            if not rest:
                # Nothing but an escape; the run may continue.
                continue
        else:
            rest = '%' + chunk
        # Literal text ends the current run: flush it first.
        if pending:
            out.append(bytes(pending).decode(encoding, errors))
            pending = []
        out.append(rest)
    if pending:
        # Flush a run that reaches the end of the string.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
				362
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values, in
    the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
				388
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed; fields are separated by
        '&' or ';'

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings. A
        true value keeps them as blank strings; the default false
        value drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in input order; '+' is turned
    into a space and % escapes are expanded in both members.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    parsed = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, sep, value = field.partition('=')
        if not sep:
            # No '=' at all in this field.
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '))
            value = unquote(value.replace('+', ' '))
            parsed.append((name, value))

    return parsed
				428
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				437
				438	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				439	b'abcdefghijklmnopqrstuvwxyz'
				440	b'0123456789'
				441	b'_.-')
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	442	_safe_quoters= {}
				443
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	444	class Quoter(collections.defaultdict):
				445	"""A mapping from bytes (in range(0,256)) to strings.
				446
				447	String values are percent-encoded byte values, unless the key < 128, and
				448	in the "safe" set (either the specified safe set, or default set).
				449	"""
				450	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				451	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	452	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	453	"""safe: bytes object."""
				454	self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	455
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	456	def __repr__(self):
				457	# Without this, will just display as a defaultdict
				458	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	459
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	460	def __missing__(self, b):
				461	# Handle a cache miss. Store quoted string in cache and return.
				462	res = b in self.safe and chr(b) or ('%%%02X' % b)
				463	self[b] = res
				464	return res
				465
				466	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	467	"""quote('abc def') -> 'abc%20def'
				468
				469	Each part of a URL, e.g. the path info, the query, etc., has a
				470	different set of reserved characters that must be quoted.
				471
				472	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				473	the following reserved characters.
				474
				475	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				476	"$" \| ","
				477
				478	Each of these characters is reserved in some component of a URL,
				479	but not necessarily in all of them.
				480
				481	By default, the quote function is intended for quoting the path
				482	section of a URL. Thus, it will not encode '/'. This character
				483	is reserved, but in typical usage the quote function is being
				484	called on a path where the existing slash characters are used as
				485	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	486
				487	string and safe may be either str or bytes objects. encoding must
				488	not be specified if string is a str.
				489
				490	The optional encoding and errors parameters specify how to deal with
				491	non-ASCII characters, as accepted by the str.encode method.
				492	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				493	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	494	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	495	if isinstance(string, str):
				496	if encoding is None:
				497	encoding = 'utf-8'
				498	if errors is None:
				499	errors = 'strict'
				500	string = string.encode(encoding, errors)
				501	else:
				502	if encoding is not None:
				503	raise TypeError("quote() doesn't support 'encoding' for bytes")
				504	if errors is not None:
				505	raise TypeError("quote() doesn't support 'errors' for bytes")
				506	return quote_from_bytes(string, safe)
				507
				508	def quote_plus(string, safe='', encoding=None, errors=None):
				509	"""Like quote(), but also replace ' ' with '+', as required for quoting
				510	HTML form values. Plus signs in the original string are escaped unless
				511	they are included in safe. It also does not have safe default to '/'.
				512	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	513	# Check if ' ' in string, where string may either be a str or bytes. If
				514	# there are no spaces, the regular quote will produce the right answer.
				515	if ((isinstance(string, str) and ' ' not in string) or
				516	(isinstance(string, bytes) and b' ' not in string)):
				517	return quote(string, safe, encoding, errors)
				518	if isinstance(safe, str):
				519	space = ' '
				520	else:
				521	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	522	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	523	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	524
				525	def quote_from_bytes(bs, safe='/'):
				526	"""Like quote(), but accepts a bytes object rather than a str, and does
				527	not perform string-to-bytes encoding. It always returns an ASCII string.
				528	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				529	"""
				530	if isinstance(safe, str):
				531	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				532	safe = safe.encode('ascii', 'ignore')
				533	cachekey = bytes(safe) # In case it was a bytearray
				534	if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
				535	raise TypeError("quote_from_bytes() expected a bytes")
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	536	try:
				537	quoter = _safe_quoters[cachekey]
				538	except KeyError:
				539	quoter = Quoter(safe)
				540	_safe_quoters[cachekey] = quoter
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	541	return ''.join([quoter[char] for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	542
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    query: an object with an items() method, or a sequence of two-element
        tuples.  Anything else raises TypeError.
    doseq: if true, a value that is not a str but has a length is
        treated as a sequence and each element becomes a parameter of
        its own.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    pairs = []
    for key, value in query:
        key = quote_plus(str(key))
        if not doseq:
            pairs.append(key + '=' + quote_plus(str(value)))
        elif isinstance(value, str):
            pairs.append(key + '=' + quote_plus(value))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_plus(str(value)))
            else:
                # one parameter per element of the sequence
                for element in value:
                    pairs.append(key + '=' + quote_plus(str(element)))
    return '&'.join(pairs)
				598
				599	# Utilities to parse URLs (most of these return None for missing parts):
				600	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				601	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				602	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				603	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				604	# splitpasswd('user:passwd') -> 'user', 'passwd'
				605	# splitport('host:port') --> 'host', 'port'
				606	# splitquery('/path?query') --> '/path', 'query'
				607	# splittag('/path#tag') --> '/path', 'tag'
				608	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				609	# '/path', ['attr1=value1', 'attr2=value2', ...]
				610	# splitvalue('attr=value') --> 'attr', 'value'
				611	# urllib.parse.unquote('abc%20def') -> 'abc def'
				612	# quote('abc def') -> 'abc%20def'
				613
def to_bytes(url):
    """Check that a str URL is pure ASCII and return it.

    Despite the name, a str argument comes back as a str (it is encoded
    to ASCII and decoded again); a non-str argument is returned as is.
    Raises UnicodeError if a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				626
				627	def unwrap(url):
				628	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				629	url = str(url).strip()
				630	if url[:1] == '<' and url[-1:] == '>':
				631	url = url[1:-1].strip()
				632	if url[:4] == 'URL:': url = url[4:].strip()
				633	return url
				634
				635	_typeprog = None
				636	def splittype(url):
				637	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				638	global _typeprog
				639	if _typeprog is None:
				640	import re
				641	_typeprog = re.compile('^([^/:]+):')
				642
				643	match = _typeprog.match(url)
				644	if match:
				645	scheme = match.group(1)
				646	return scheme.lower(), url[len(scheme) + 1:]
				647	return None, url
				648
				649	_hostprog = None
				650	def splithost(url):
				651	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				652	global _hostprog
				653	if _hostprog is None:
				654	import re
				655	_hostprog = re.compile('^//([^/?])(.)$')
				656
				657	match = _hostprog.match(url)
				658	if match: return match.group(1, 2)
				659	return None, url
				660
				661	_userprog = None
				662	def splituser(host):
				663	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				664	global _userprog
				665	if _userprog is None:
				666	import re
				667	_userprog = re.compile('^(.)@(.)$')
				668
				669	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	670	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	671	return None, host
				672
				673	_passwdprog = None
				674	def splitpasswd(user):
				675	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				676	global _passwdprog
				677	if _passwdprog is None:
				678	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	679	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	680
				681	match = _passwdprog.match(user)
				682	if match: return match.group(1, 2)
				683	return user, None
				684
				685	# splittag('/path#tag') --> '/path', 'tag'
				686	_portprog = None
				687	def splitport(host):
				688	"""splitport('host:port') --> 'host', 'port'."""
				689	global _portprog
				690	if _portprog is None:
				691	import re
				692	_portprog = re.compile('^(.*):([0-9]+)$')
				693
				694	match = _portprog.match(host)
				695	if match: return match.group(1, 2)
				696	return host, None
				697
				698	_nportprog = None
				699	def splitnport(host, defport=-1):
				700	"""Split host and port, returning numeric port.
				701	Return given default port if no ':' found; defaults to -1.
				702	Return numerical port if a valid number are found after ':'.
				703	Return None if ':' but not a valid number."""
				704	global _nportprog
				705	if _nportprog is None:
				706	import re
				707	_nportprog = re.compile('^(.):(.)$')
				708
				709	match = _nportprog.match(host)
				710	if match:
				711	host, port = match.group(1, 2)
				712	try:
				713	if not port: raise ValueError("no digits")
				714	nport = int(port)
				715	except ValueError:
				716	nport = None
				717	return host, nport
				718	return host, defport
				719
				720	_queryprog = None
				721	def splitquery(url):
				722	"""splitquery('/path?query') --> '/path', 'query'."""
				723	global _queryprog
				724	if _queryprog is None:
				725	import re
				726	_queryprog = re.compile('^(.)\?([^?])$')
				727
				728	match = _queryprog.match(url)
				729	if match: return match.group(1, 2)
				730	return url, None
				731
				732	_tagprog = None
				733	def splittag(url):
				734	"""splittag('/path#tag') --> '/path', 'tag'."""
				735	global _tagprog
				736	if _tagprog is None:
				737	import re
				738	_tagprog = re.compile('^(.)#([^#])$')
				739
				740	match = _tagprog.match(url)
				741	if match: return match.group(1, 2)
				742	return url, None
				743
				744	def splitattr(url):
				745	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				746	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				747	words = url.split(';')
				748	return words[0], words[1:]
				749
				750	_valueprog = None
				751	def splitvalue(attr):
				752	"""splitvalue('attr=value') --> 'attr', 'value'."""
				753	global _valueprog
				754	if _valueprog is None:
				755	import re
				756	_valueprog = re.compile('^([^=])=(.)$')
				757
				758	match = _valueprog.match(attr)
				759	if match: return match.group(1, 2)
				760	return attr, None
				761
# Default input for test().  The first non-blank line is the base URL;
# every other line reads "<reference> = <URL:expected>", where the last
# word is what test() expects after joining the reference to the base and
# wrapping the result in '<URL:...>'.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
				795
				796	def test():
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	797	base = ''
				798	if sys.argv[1:]:
				799	fn = sys.argv[1]
				800	if fn == '-':
				801	fp = sys.stdin
				802	else:
				803	fp = open(fn)
				804	else:
				805	from io import StringIO
				806	fp = StringIO(test_input)
				807	for line in fp:
				808	words = line.split()
				809	if not words:
				810	continue
				811	url = words[0]
				812	parts = urlparse(url)
				813	print('%-10s : %s' % (url, parts))
				814	abs = urljoin(base, url)
				815	if not base:
				816	base = abs
				817	wrapped = '<URL:%s>' % abs
				818	print('%-10s = %s' % (url, wrapped))
				819	if len(words) == 3 and words[1] == '=':
				820	if wrapped != words[2]:
				821	print('EXPECTED', words[2], '!!!!!!!!!!')
				822
				823	if __name__ == '__main__':
				824	test()