Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: b39fc25eb845c830573d6616c3a3d7782806e25d [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	fd41e08	2010-04-17 14:44:14 +0000	[diff] [blame]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
				8	RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
				9	and L.Masinter, December 1999.
				10
				11	RFC2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
				12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				13
				14	RFC2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
				15
				16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				17	1995.
				18
				19	RFC1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
				20	McCahill, December 1994
				21
				22	RFC 3986 is considered the current standard and any changes to urlparse module
				23	should conform to this. urlparse module is not entirely compliant with this.
				24	The de facto scenarios of parsing are considered sometimes and for backward
				25	compatibility purposes, older RFC uses of parsing are retained. The testcases in
				26	test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	27	"""
				28
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	29	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	30	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	31
# Names exported by ``from urllib.parse import *``.
# ``urlencode`` is a public function defined in this module and was missing
# from the list; it is appended at the end so the existing order is unchanged.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "urlencode"]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	36
				37	# A classification of schemes ('' means apply by default)
				38	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				39	'wais', 'file', 'https', 'shttp', 'mms',
				40	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				41	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				42	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				43	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	44	'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	45	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				46	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				47	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				48	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				49	'mms', '', 'sftp']
				50	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				51	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				52	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				53	'nntp', 'wais', 'https', 'shttp', 'snews',
				54	'file', 'prospero', '']
				55
				56	# Characters valid in scheme names
				57	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				58	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				59	'0123456789'
				60	'+-.')
				61
				62	MAX_CACHE_SIZE = 20
				63	_parse_cache = {}
				64
				65	def clear_cache():
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	66	"""Clear the parse cache and the quoters cache."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	67	_parse_cache.clear()
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame]	68	_safe_quoters.clear()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	69
				70
				71	class ResultMixin(object):
				72	"""Shared methods for the parsed result objects."""
				73
				74	@property
				75	def username(self):
				76	netloc = self.netloc
				77	if "@" in netloc:
				78	userinfo = netloc.rsplit("@", 1)[0]
				79	if ":" in userinfo:
				80	userinfo = userinfo.split(":", 1)[0]
				81	return userinfo
				82	return None
				83
				84	@property
				85	def password(self):
				86	netloc = self.netloc
				87	if "@" in netloc:
				88	userinfo = netloc.rsplit("@", 1)[0]
				89	if ":" in userinfo:
				90	return userinfo.split(":", 1)[1]
				91	return None
				92
				93	@property
				94	def hostname(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	95	netloc = self.netloc.split('@')[-1]
				96	if '[' in netloc and ']' in netloc:
				97	return netloc.split(']')[0][1:].lower()
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	98	elif ':' in netloc:
				99	return netloc.split(':')[0].lower()
				100	elif netloc == '':
				101	return None
				102	else:
				103	return netloc.lower()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	104
				105	@property
				106	def port(self):
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	107	netloc = self.netloc.split('@')[-1].split(']')[-1]
				108	if ':' in netloc:
				109	port = netloc.split(':')[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	110	return int(port, 10)
Senthil Kumaran	ad02d23	2010-04-16 03:02:13 +0000	[diff] [blame]	111	else:
				112	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	113
				114	from collections import namedtuple
				115
				116	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				117
				118	__slots__ = ()
				119
				120	def geturl(self):
				121	return urlunsplit(self)
				122
				123
				124	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				125
				126	__slots__ = ()
				127
				128	def geturl(self):
				129	return urlunparse(self)
				130
				131
				132	def urlparse(url, scheme='', allow_fragments=True):
				133	"""Parse a URL into 6 components:
				134	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				135	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				136	Note that we don't break the components up in smaller bits
				137	(e.g. netloc is a single string) and we don't expand % escapes."""
				138	tuple = urlsplit(url, scheme, allow_fragments)
				139	scheme, netloc, url, query, fragment = tuple
				140	if scheme in uses_params and ';' in url:
				141	url, params = _splitparams(url)
				142	else:
				143	params = ''
				144	return ParseResult(scheme, netloc, url, params, query, fragment)
				145
				146	def _splitparams(url):
				147	if '/' in url:
				148	i = url.find(';', url.rfind('/'))
				149	if i < 0:
				150	return url, ''
				151	else:
				152	i = url.find(';')
				153	return url[:i], url[i+1:]
				154
				155	def _splitnetloc(url, start=0):
				156	delim = len(url) # position of end of domain part of url, default is end
				157	for c in '/?#': # look for delimiters; the order is NOT important
				158	wdelim = url.find(c, start) # find first of this delim
				159	if wdelim >= 0: # if found
				160	delim = min(delim, wdelim) # use earliest delim position
				161	return url[start:delim], url[delim:] # return (domain, rest)
				162
				163	def urlsplit(url, scheme='', allow_fragments=True):
				164	"""Parse a URL into 5 components:
				165	<scheme>://<netloc>/<path>?<query>#<fragment>
				166	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				167	Note that we don't break the components up in smaller bits
				168	(e.g. netloc is a single string) and we don't expand % escapes."""
				169	allow_fragments = bool(allow_fragments)
				170	key = url, scheme, allow_fragments, type(url), type(scheme)
				171	cached = _parse_cache.get(key, None)
				172	if cached:
				173	return cached
				174	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				175	clear_cache()
				176	netloc = query = fragment = ''
				177	i = url.find(':')
				178	if i > 0:
				179	if url[:i] == 'http': # optimize the common case
				180	scheme = url[:i].lower()
				181	url = url[i+1:]
				182	if url[:2] == '//':
				183	netloc, url = _splitnetloc(url, 2)
Senthil Kumaran	7a1e09f	2010-04-22 12:19:46 +0000	[diff] [blame]	184	if (('[' in netloc and ']' not in netloc) or
				185	(']' in netloc and '[' not in netloc)):
				186	raise ValueError("Invalid IPv6 URL")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	187	if allow_fragments and '#' in url:
				188	url, fragment = url.split('#', 1)
				189	if '?' in url:
				190	url, query = url.split('?', 1)
				191	v = SplitResult(scheme, netloc, url, query, fragment)
				192	_parse_cache[key] = v
				193	return v
				194	for c in url[:i]:
				195	if c not in scheme_chars:
				196	break
				197	else:
				198	scheme, url = url[:i].lower(), url[i+1:]
Senthil Kumaran	6be85c5	2010-02-19 07:42:50 +0000	[diff] [blame]	199	if url[:2] == '//':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	200	netloc, url = _splitnetloc(url, 2)
Senthil Kumaran	7a1e09f	2010-04-22 12:19:46 +0000	[diff] [blame]	201	if (('[' in netloc and ']' not in netloc) or
				202	(']' in netloc and '[' not in netloc)):
				203	raise ValueError("Invalid IPv6 URL")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	204	if allow_fragments and scheme in uses_fragment and '#' in url:
				205	url, fragment = url.split('#', 1)
				206	if scheme in uses_query and '?' in url:
				207	url, query = url.split('?', 1)
				208	v = SplitResult(scheme, netloc, url, query, fragment)
				209	_parse_cache[key] = v
				210	return v
				211
				212	def urlunparse(components):
				213	"""Put a parsed URL back together again. This may result in a
				214	slightly different, but equivalent URL, if the URL that was parsed
				215	originally had redundant delimiters, e.g. a ? with an empty query
				216	(the draft states that these are equivalent)."""
				217	scheme, netloc, url, params, query, fragment = components
				218	if params:
				219	url = "%s;%s" % (url, params)
				220	return urlunsplit((scheme, netloc, url, query, fragment))
				221
				222	def urlunsplit(components):
				223	scheme, netloc, url, query, fragment = components
				224	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				225	if url and url[:1] != '/': url = '/' + url
				226	url = '//' + (netloc or '') + url
				227	if scheme:
				228	url = scheme + ':' + url
				229	if query:
				230	url = url + '?' + query
				231	if fragment:
				232	url = url + '#' + fragment
				233	return url
				234
				235	def urljoin(base, url, allow_fragments=True):
				236	"""Join a base URL and a possibly relative URL to form an absolute
				237	interpretation of the latter."""
				238	if not base:
				239	return url
				240	if not url:
				241	return base
				242	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
				243	urlparse(base, '', allow_fragments)
				244	scheme, netloc, path, params, query, fragment = \
				245	urlparse(url, bscheme, allow_fragments)
				246	if scheme != bscheme or scheme not in uses_relative:
				247	return url
				248	if scheme in uses_netloc:
				249	if netloc:
				250	return urlunparse((scheme, netloc, path,
				251	params, query, fragment))
				252	netloc = bnetloc
				253	if path[:1] == '/':
				254	return urlunparse((scheme, netloc, path,
				255	params, query, fragment))
Facundo Batista	23e3856	2008-08-14 16:55:14 +0000	[diff] [blame]	256	if not path:
				257	path = bpath
				258	if not params:
				259	params = bparams
				260	else:
				261	path = path[:-1]
				262	return urlunparse((scheme, netloc, path,
				263	params, query, fragment))
				264	if not query:
				265	query = bquery
				266	return urlunparse((scheme, netloc, path,
				267	params, query, fragment))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	268	segments = bpath.split('/')[:-1] + path.split('/')
				269	# XXX The stuff below is bogus in various ways...
				270	if segments[-1] == '.':
				271	segments[-1] = ''
				272	while '.' in segments:
				273	segments.remove('.')
				274	while 1:
				275	i = 1
				276	n = len(segments) - 1
				277	while i < n:
				278	if (segments[i] == '..'
				279	and segments[i-1] not in ('', '..')):
				280	del segments[i-1:i+1]
				281	break
				282	i = i+1
				283	else:
				284	break
				285	if segments == ['', '..']:
				286	segments[-1] = ''
				287	elif len(segments) >= 2 and segments[-1] == '..':
				288	segments[-2:] = ['']
				289	return urlunparse((scheme, netloc, '/'.join(segments),
				290	params, query, fragment))
				291
				292	def urldefrag(url):
				293	"""Removes any existing fragment from URL.
				294
				295	Returns a tuple of the defragmented URL and the fragment. If
				296	the URL contained no fragments, the second element is the
				297	empty string.
				298	"""
				299	if '#' in url:
				300	s, n, p, a, q, frag = urlparse(url)
				301	defrag = urlunparse((s, n, p, a, q, ''))
				302	return defrag, frag
				303	else:
				304	return url, ''
				305
# Byte values of the ASCII hexadecimal digits accepted in a %xx escape.
_HEXDIG_BYTES = frozenset(b'0123456789ABCDEFabcdef')

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes object; a str is first encoded as
    UTF-8.  A '%' that is not followed by exactly two hexadecimal digits is
    copied through unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    out = [pieces[0]]
    for item in pieces[1:]:
        digits = item[:2]
        # Validate explicitly: int(x, 16) alone would also accept one-digit,
        # signed or space-padded input such as b'1', b'+1' or b' 1'.
        if len(digits) == 2 and _HEXDIG_BYTES.issuperset(digits):
            out.append(bytes([int(digits, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	324
# Characters accepted as hexadecimal digits in a %xx escape.
_HEXDIG_CHARS = frozenset('0123456789ABCDEFabcdef')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left in
    the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if not string:
        return string
    pieces = string.split('%')
    if len(pieces) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    out = [pieces[0]]
    # pending: contiguous run of percent-encoded bytes, not yet decoded.
    # Decoding is delayed so multi-byte sequences are decoded as one unit.
    pending = b''
    for item in pieces[1:]:
        digits = item[:2]
        # Validate explicitly: bytes.fromhex() skips whitespace, so an
        # escape such as '%  ' would otherwise be dropped silently.
        if len(digits) == 2 and _HEXDIG_CHARS.issuperset(digits):
            pending += bytes([int(digits, 16)])
            rest = item[2:]
            if not rest:
                # Nothing but the escape itself: keep accumulating.
                continue
        else:
            rest = '%' + item
        # Literal text encountered: flush the pending byte run first.
        out.append(pending.decode(encoding, errors))
        out.append(rest)
        pending = b''
    if pending:
        # Flush the final byte run.
        out.append(pending.decode(encoding, errors))
    return ''.join(out)
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	368
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, blank values are kept as empty strings;
        when false (the default) fields with blank values are left out.

    strict_parsing: when true, a malformed field raises ValueError; when
        false (the default) such fields are silently ignored.

    Values of a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
				394
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, blank values are kept as empty strings;
        when false (the default) fields with blank values are left out.

    strict_parsing: when true, an empty field or one without '=' raises
        ValueError; when false (the default) such fields are ignored.

    Fields are separated by '&' or ';'.  In names and values '+' becomes a
    space and % escapes are decoded.
    """
    # Splitting on '&' after mapping ';' to '&' yields the same fields, in
    # the same order, as splitting on '&' and then on ';'.
    fields = qs.replace(';', '&').split('&')
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        raw_name, equals, raw_value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name with no equal sign only survives as a blank value.
            if not keep_blank_values:
                continue
        if raw_value or keep_blank_values:
            name = unquote(raw_name.replace('+', ' '))
            value = unquote(raw_value.replace('+', ' '))
            result.append((name, value))
    return result
				434
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are turned into spaces first, as needed
    when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				443
				444	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				445	b'abcdefghijklmnopqrstuvwxyz'
				446	b'0123456789'
				447	b'_.-')
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	448	_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
				449	_safe_quoters = {}
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	450
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	451	class Quoter(collections.defaultdict):
				452	"""A mapping from bytes (in range(0,256)) to strings.
				453
				454	String values are percent-encoded byte values, unless the key < 128, and
				455	in the "safe" set (either the specified safe set, or default set).
				456	"""
				457	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				458	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	459	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	460	"""safe: bytes object."""
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	461	self.safe = _ALWAYS_SAFE.union(safe)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	462
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	463	def __repr__(self):
				464	# Without this, will just display as a defaultdict
				465	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	466
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	467	def __missing__(self, b):
				468	# Handle a cache miss. Store quoted string in cache and return.
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	469	res = chr(b) if b in self.safe else '%{:02X}'.format(b)
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	470	self[b] = res
				471	return res
				472
				473	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	474	"""quote('abc def') -> 'abc%20def'
				475
				476	Each part of a URL, e.g. the path info, the query, etc., has a
				477	different set of reserved characters that must be quoted.
				478
				479	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				480	the following reserved characters.
				481
				482	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				483	"$" \| ","
				484
				485	Each of these characters is reserved in some component of a URL,
				486	but not necessarily in all of them.
				487
				488	By default, the quote function is intended for quoting the path
				489	section of a URL. Thus, it will not encode '/'. This character
				490	is reserved, but in typical usage the quote function is being
				491	called on a path where the existing slash characters are used as
				492	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	493
				494	string and safe may be either str or bytes objects. encoding must
				495	not be specified if string is a str.
				496
				497	The optional encoding and errors parameters specify how to deal with
				498	non-ASCII characters, as accepted by the str.encode method.
				499	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				500	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	501	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	502	if isinstance(string, str):
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	503	if not string:
				504	return string
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	505	if encoding is None:
				506	encoding = 'utf-8'
				507	if errors is None:
				508	errors = 'strict'
				509	string = string.encode(encoding, errors)
				510	else:
				511	if encoding is not None:
				512	raise TypeError("quote() doesn't support 'encoding' for bytes")
				513	if errors is not None:
				514	raise TypeError("quote() doesn't support 'errors' for bytes")
				515	return quote_from_bytes(string, safe)
				516
				517	def quote_plus(string, safe='', encoding=None, errors=None):
				518	"""Like quote(), but also replace ' ' with '+', as required for quoting
				519	HTML form values. Plus signs in the original string are escaped unless
				520	they are included in safe. It also does not have safe default to '/'.
				521	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	522	# Check if ' ' in string, where string may either be a str or bytes. If
				523	# there are no spaces, the regular quote will produce the right answer.
				524	if ((isinstance(string, str) and ' ' not in string) or
				525	(isinstance(string, bytes) and b' ' not in string)):
				526	return quote(string, safe, encoding, errors)
				527	if isinstance(safe, str):
				528	space = ' '
				529	else:
				530	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	531	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	532	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	533
				534	def quote_from_bytes(bs, safe='/'):
				535	"""Like quote(), but accepts a bytes object rather than a str, and does
				536	not perform string-to-bytes encoding. It always returns an ASCII string.
				537	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				538	"""
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	539	if not isinstance(bs, (bytes, bytearray)):
				540	raise TypeError("quote_from_bytes() expected bytes")
				541	if not bs:
				542	return ''
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	543	if isinstance(safe, str):
				544	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				545	safe = safe.encode('ascii', 'ignore')
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	546	else:
				547	safe = bytes([c for c in safe if c < 128])
				548	if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
				549	return bs.decode()
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	550	try:
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	551	quoter = _safe_quoters[safe]
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	552	except KeyError:
Florent Xicluna	c7b8e86	2010-05-17 17:33:07 +0000	[diff] [blame]	553	_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
				554	return ''.join([quoter(char) for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	555
def urlencode(query, doseq=False):
    """Encode a mapping or a sequence of two-element tuples as a URL query
    string.

    If doseq is true and a value is a non-str object supporting len(), each
    of its elements becomes a separate ``key=element`` parameter.

    For a sequence of two-element tuples the parameters appear in the
    output in the same order as in the input.

    Raises TypeError when query is neither a mapping nor a suitable
    sequence (a non-empty string, for example).
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Objects without len() fail here, and so do non-empty strings
            # (their first item is not a tuple).  Empty sequences of any
            # type pass, matching the acceptance of empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    encoded = []
    for key, value in query:
        key = quote_plus(str(key))
        if not doseq:
            encoded.append(key + '=' + quote_plus(str(value)))
            continue
        if isinstance(value, str):
            encoded.append(key + '=' + quote_plus(value))
            continue
        try:
            # Is this a sufficient test for sequence-ness?
            len(value)
        except TypeError:
            # not a sequence
            encoded.append(key + '=' + quote_plus(str(value)))
        else:
            # one parameter per element
            for element in value:
                encoded.append(key + '=' + quote_plus(str(element)))
    return '&'.join(encoded)
				611
				612	# Utilities to parse URLs (most of these return None for missing parts):
				613	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				614	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				615	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				616	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				617	# splitpasswd('user:passwd') -> 'user', 'passwd'
				618	# splitport('host:port') --> 'host', 'port'
				619	# splitquery('/path?query') --> '/path', 'query'
				620	# splittag('/path#tag') --> '/path', 'tag'
				621	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				622	# '/path', ['attr1=value1', 'attr2=value2', ...]
				623	# splitvalue('attr=value') --> 'attr', 'value'
				624	# urllib.parse.unquote('abc%20def') -> 'abc def'
				625	# quote('abc def') -> 'abc%20def'
				626
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; a UnicodeError
    is raised if it is not.  Any other object is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				639
				640	def unwrap(url):
				641	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				642	url = str(url).strip()
				643	if url[:1] == '<' and url[-1:] == '>':
				644	url = url[1:-1].strip()
				645	if url[:4] == 'URL:': url = url[4:].strip()
				646	return url
				647
				648	_typeprog = None
				649	def splittype(url):
				650	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				651	global _typeprog
				652	if _typeprog is None:
				653	import re
				654	_typeprog = re.compile('^([^/:]+):')
				655
				656	match = _typeprog.match(url)
				657	if match:
				658	scheme = match.group(1)
				659	return scheme.lower(), url[len(scheme) + 1:]
				660	return None, url
				661
				662	_hostprog = None
				663	def splithost(url):
				664	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				665	global _hostprog
				666	if _hostprog is None:
				667	import re
				668	_hostprog = re.compile('^//([^/?])(.)$')
				669
				670	match = _hostprog.match(url)
				671	if match: return match.group(1, 2)
				672	return None, url
				673
				674	_userprog = None
				675	def splituser(host):
				676	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				677	global _userprog
				678	if _userprog is None:
				679	import re
				680	_userprog = re.compile('^(.)@(.)$')
				681
				682	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	683	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	684	return None, host
				685
				686	_passwdprog = None
				687	def splitpasswd(user):
				688	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				689	global _passwdprog
				690	if _passwdprog is None:
				691	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	692	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	693
				694	match = _passwdprog.match(user)
				695	if match: return match.group(1, 2)
				696	return user, None
				697
# splitport('host:port') --> 'host', 'port'
				699	_portprog = None
				700	def splitport(host):
				701	"""splitport('host:port') --> 'host', 'port'."""
				702	global _portprog
				703	if _portprog is None:
				704	import re
				705	_portprog = re.compile('^(.*):([0-9]+)$')
				706
				707	match = _portprog.match(host)
				708	if match: return match.group(1, 2)
				709	return host, None
				710
				711	_nportprog = None
				712	def splitnport(host, defport=-1):
				713	"""Split host and port, returning numeric port.
				714	Return given default port if no ':' found; defaults to -1.
				715	Return numerical port if a valid number are found after ':'.
				716	Return None if ':' but not a valid number."""
				717	global _nportprog
				718	if _nportprog is None:
				719	import re
				720	_nportprog = re.compile('^(.):(.)$')
				721
				722	match = _nportprog.match(host)
				723	if match:
				724	host, port = match.group(1, 2)
				725	try:
				726	if not port: raise ValueError("no digits")
				727	nport = int(port)
				728	except ValueError:
				729	nport = None
				730	return host, nport
				731	return host, defport
				732
				733	_queryprog = None
				734	def splitquery(url):
				735	"""splitquery('/path?query') --> '/path', 'query'."""
				736	global _queryprog
				737	if _queryprog is None:
				738	import re
				739	_queryprog = re.compile('^(.)\?([^?])$')
				740
				741	match = _queryprog.match(url)
				742	if match: return match.group(1, 2)
				743	return url, None
				744
				745	_tagprog = None
				746	def splittag(url):
				747	"""splittag('/path#tag') --> '/path', 'tag'."""
				748	global _tagprog
				749	if _tagprog is None:
				750	import re
				751	_tagprog = re.compile('^(.)#([^#])$')
				752
				753	match = _tagprog.match(url)
				754	if match: return match.group(1, 2)
				755	return url, None
				756
				757	def splitattr(url):
				758	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				759	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				760	words = url.split(';')
				761	return words[0], words[1:]
				762
				763	_valueprog = None
				764	def splitvalue(attr):
				765	"""splitvalue('attr=value') --> 'attr', 'value'."""
				766	global _valueprog
				767	if _valueprog is None:
				768	import re
				769	_valueprog = re.compile('^([^=])=(.)$')
				770
				771	match = _valueprog.match(attr)
				772	if match: return match.group(1, 2)
				773	return attr, None
				774
# Default input for test(), used when no file name is given on the command
# line.  test() splits each non-blank line on whitespace: the first word is a
# URL to parse and join against the base, and on lines of the form
# "url = <URL:...>" the third word is the expected joined result.  The first
# non-blank line supplies the base URL.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
				808
				809	def test():
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	810	base = ''
				811	if sys.argv[1:]:
				812	fn = sys.argv[1]
				813	if fn == '-':
				814	fp = sys.stdin
				815	else:
				816	fp = open(fn)
				817	else:
				818	from io import StringIO
				819	fp = StringIO(test_input)
				820	for line in fp:
				821	words = line.split()
				822	if not words:
				823	continue
				824	url = words[0]
				825	parts = urlparse(url)
				826	print('%-10s : %s' % (url, parts))
				827	abs = urljoin(base, url)
				828	if not base:
				829	base = abs
				830	wrapped = '<URL:%s>' % abs
				831	print('%-10s = %s' % (url, wrapped))
				832	if len(words) == 3 and words[1] == '=':
				833	if wrapped != words[2]:
				834	print('EXPECTED', words[2], '!!!!!!!!!!')
				835
# Run the self-test in test() when this file is executed as a script.
if __name__ == '__main__':
    test()