Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: 765f1c8334e5cbe1fa877e29bd21c1562d47fc1d [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	8	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	9	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				10
RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	12
				13	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				14	1995.
				15
Georg Brandl	c62efa8	2010-07-11 10:41:07 +0000	[diff] [blame]	16	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaran	6ffdb6f	2010-04-17 14:47:13 +0000	[diff] [blame]	17	McCahill, December 1994
				18
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	25	"""
				26
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	27	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	28	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	29
# Names exported by "from urllib.parse import *".  urlencode() is a public
# function defined in this module, so it is exported alongside the others.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	34
# A classification of schemes ('' means apply by default)
# The functions below consult these lists with "scheme in <list>" tests.

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Schemes whose URLs carry a //netloc part (checked by urlunsplit/urljoin).
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# NOTE(review): not referenced by any function visible in this part of the
# file; presumably kept for callers that import it -- confirm before removing.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes for which urlsplit() splits "?query" off the path.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes for which urlsplit() splits "#fragment" off the path.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() empties the caches once _parse_cache holds this many entries.
MAX_CACHE_SIZE = 20
# Memo of urlsplit() results, keyed by
# (url, scheme, allow_fragments, type(url), type(scheme)).
_parse_cache = {}
				62
				63	def clear_cache():
Florent Xicluna	37ddbb8	2010-08-14 21:06:29 +0000	[diff] [blame]	64	"""Clear the parse cache and the quoters cache."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	65	_parse_cache.clear()
Florent Xicluna	37ddbb8	2010-08-14 21:06:29 +0000	[diff] [blame]	66	_safe_quoters.clear()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	67
				68
				69	class ResultMixin(object):
				70	"""Shared methods for the parsed result objects."""
				71
				72	@property
				73	def username(self):
				74	netloc = self.netloc
				75	if "@" in netloc:
				76	userinfo = netloc.rsplit("@", 1)[0]
				77	if ":" in userinfo:
				78	userinfo = userinfo.split(":", 1)[0]
				79	return userinfo
				80	return None
				81
				82	@property
				83	def password(self):
				84	netloc = self.netloc
				85	if "@" in netloc:
				86	userinfo = netloc.rsplit("@", 1)[0]
				87	if ":" in userinfo:
				88	return userinfo.split(":", 1)[1]
				89	return None
				90
				91	@property
				92	def hostname(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	93	netloc = self.netloc
				94	if "@" in netloc:
				95	netloc = netloc.rsplit("@", 1)[1]
				96	if ":" in netloc:
				97	netloc = netloc.split(":", 1)[0]
				98	return netloc.lower() or None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	99
				100	@property
				101	def port(self):
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	102	netloc = self.netloc
				103	if "@" in netloc:
				104	netloc = netloc.rsplit("@", 1)[1]
				105	if ":" in netloc:
				106	port = netloc.split(":", 1)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	107	return int(port, 10)
Senthil Kumaran	a6023ca	2010-04-16 11:28:05 +0000	[diff] [blame]	108	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	109
				110	from collections import namedtuple
				111
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-field named tuple built by urlsplit(), plus the ResultMixin
    properties derived from netloc."""

    # No per-instance __dict__ is added by this subclass.
    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from this tuple by urlunsplit()."""
        return urlunsplit(self)
				118
				119
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-field named tuple built by urlparse(), plus the ResultMixin
    properties derived from netloc."""

    # No per-instance __dict__ is added by this subclass.
    __slots__ = ()

    def geturl(self):
        """Return the URL string rebuilt from this tuple by urlunparse()."""
        return urlunparse(self)
				126
				127
				128	def urlparse(url, scheme='', allow_fragments=True):
				129	"""Parse a URL into 6 components:
				130	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				131	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				132	Note that we don't break the components up in smaller bits
				133	(e.g. netloc is a single string) and we don't expand % escapes."""
				134	tuple = urlsplit(url, scheme, allow_fragments)
				135	scheme, netloc, url, query, fragment = tuple
				136	if scheme in uses_params and ';' in url:
				137	url, params = _splitparams(url)
				138	else:
				139	params = ''
				140	return ParseResult(scheme, netloc, url, params, query, fragment)
				141
				142	def _splitparams(url):
				143	if '/' in url:
				144	i = url.find(';', url.rfind('/'))
				145	if i < 0:
				146	return url, ''
				147	else:
				148	i = url.find(';')
				149	return url[:i], url[i+1:]
				150
				151	def _splitnetloc(url, start=0):
				152	delim = len(url) # position of end of domain part of url, default is end
				153	for c in '/?#': # look for delimiters; the order is NOT important
				154	wdelim = url.find(c, start) # find first of this delim
				155	if wdelim >= 0: # if found
				156	delim = min(delim, wdelim) # use earliest delim position
				157	return url[start:delim], url[delim:] # return (domain, rest)
				158
				159	def urlsplit(url, scheme='', allow_fragments=True):
				160	"""Parse a URL into 5 components:
				161	<scheme>://<netloc>/<path>?<query>#<fragment>
				162	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				163	Note that we don't break the components up in smaller bits
				164	(e.g. netloc is a single string) and we don't expand % escapes."""
				165	allow_fragments = bool(allow_fragments)
				166	key = url, scheme, allow_fragments, type(url), type(scheme)
				167	cached = _parse_cache.get(key, None)
				168	if cached:
				169	return cached
				170	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				171	clear_cache()
				172	netloc = query = fragment = ''
				173	i = url.find(':')
				174	if i > 0:
				175	if url[:i] == 'http': # optimize the common case
				176	scheme = url[:i].lower()
				177	url = url[i+1:]
				178	if url[:2] == '//':
				179	netloc, url = _splitnetloc(url, 2)
				180	if allow_fragments and '#' in url:
				181	url, fragment = url.split('#', 1)
				182	if '?' in url:
				183	url, query = url.split('?', 1)
				184	v = SplitResult(scheme, netloc, url, query, fragment)
				185	_parse_cache[key] = v
				186	return v
Senthil Kumaran	8801f7a	2010-08-04 04:53:07 +0000	[diff] [blame]	187	if url.endswith(':') or not url[i+1].isdigit():
				188	for c in url[:i]:
				189	if c not in scheme_chars:
				190	break
				191	else:
				192	scheme, url = url[:i].lower(), url[i+1:]
Senthil Kumaran	a8dbb24	2010-02-19 07:45:03 +0000	[diff] [blame]	193	if url[:2] == '//':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	194	netloc, url = _splitnetloc(url, 2)
				195	if allow_fragments and scheme in uses_fragment and '#' in url:
				196	url, fragment = url.split('#', 1)
				197	if scheme in uses_query and '?' in url:
				198	url, query = url.split('?', 1)
				199	v = SplitResult(scheme, netloc, url, query, fragment)
				200	_parse_cache[key] = v
				201	return v
				202
				203	def urlunparse(components):
				204	"""Put a parsed URL back together again. This may result in a
				205	slightly different, but equivalent URL, if the URL that was parsed
				206	originally had redundant delimiters, e.g. a ? with an empty query
				207	(the draft states that these are equivalent)."""
				208	scheme, netloc, url, params, query, fragment = components
				209	if params:
				210	url = "%s;%s" % (url, params)
				211	return urlunsplit((scheme, netloc, url, query, fragment))
				212
				213	def urlunsplit(components):
Senthil Kumaran	930049b	2010-06-28 14:12:18 +0000	[diff] [blame]	214	"""Combine the elements of a tuple as returned by urlsplit() into a
				215	complete URL as a string. The data argument can be any five-item iterable.
				216	This may result in a slightly different, but equivalent URL, if the URL that
				217	was parsed originally had unnecessary delimiters (for example, a ? with an
				218	empty query; the RFC states that these are equivalent)."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	219	scheme, netloc, url, query, fragment = components
				220	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				221	if url and url[:1] != '/': url = '/' + url
				222	url = '//' + (netloc or '') + url
				223	if scheme:
				224	url = scheme + ':' + url
				225	if query:
				226	url = url + '?' + query
				227	if fragment:
				228	url = url + '#' + fragment
				229	return url
				230
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *allow_fragments* is passed through to urlparse() for both URLs.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one without relative forms: url stands alone.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc; nothing else is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Rooted path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: take over the base path.
        path = bpath
        if not params:
            params = bparams
        else:
            # The reference supplies its own params: the last character of
            # the inherited path is dropped and the base query is NOT
            # inherited.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment (keeps the trailing slash).
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first interior "<name>/.." pair; the scan is
    # restarted after each removal and ends when a full pass finds none.
    # The first and last segments are never examined as the '..' here.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in final position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
				287
				288	def urldefrag(url):
				289	"""Removes any existing fragment from URL.
				290
				291	Returns a tuple of the defragmented URL and the fragment. If
				292	the URL contained no fragments, the second element is the
				293	empty string.
				294	"""
				295	if '#' in url:
				296	s, n, p, a, q, frag = urlparse(url)
				297	defrag = urlunparse((s, n, p, a, q, ''))
				298	return defrag, frag
				299	else:
				300	return url, ''
				301
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object with a split() method.
    Only a '%' followed by exactly two hexadecimal digits is decoded; any
    other '%' is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?  (Raises AttributeError if not.)
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = [bits[0]]
    for item in bits[1:]:
        # Check both digits explicitly: int(x, 16) alone would also accept
        # one-digit, signed or space-padded input such as b'1' or b' 1'.
        if len(item) >= 2 and item[0] in hexdigits and item[1] in hexdigits:
            pieces.append(bytes([int(item[:2], 16)]))
            pieces.append(item[2:])
        else:
            pieces.append(b'%')
            pieces.append(item)
    # Join on an empty value of the input's own type so that bytes input
    # yields bytes and bytearray input yields bytearray.
    return string[:0].join(pieces)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	322
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Only a '%' followed by exactly two hexadecimal digits is treated as an
    escape; any other '%' is copied to the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    bits = string.split('%')
    if len(bits) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = [bits[0]]
    # pending: contiguous run of percent-encoded bytes not yet decoded.  A
    # multi-byte character spans several %xx escapes, so decoding is delayed
    # until the run is interrupted.
    pending = bytearray()
    for item in bits[1:]:
        # Check both digits explicitly: bytes.fromhex() skips whitespace, so
        # it would silently swallow input such as '%  '.
        if len(item) >= 2 and item[0] in hexdigits and item[1] in hexdigits:
            pending.append(int(item[:2], 16))
            rest = item[2:]
            if not rest:
                # Just a single escape; the run may continue with the next
                # item.
                continue
        else:
            # Not a valid escape: keep the '%' and the text literally.
            rest = '%' + item
        # Encountered non-percent-encoded characters: flush the pending run.
        pieces.append(pending.decode(encoding, errors))
        pieces.append(rest)
        pending = bytearray()
    if pending:
        # Flush the final run.
        pieces.append(pending.decode(encoding, errors))
    return ''.join(pieces)
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	366
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value keeps them as empty strings; the default false
        value drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Values for a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
				392
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value keeps them as empty strings; the default false value
        drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both names and values '+' is
    turned into a space before % escapes are decoded.
    """
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign counts as a blank value.
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                result.append((unquote(name.replace('+', ' ')),
                               unquote(value.replace('+', ' '))))
    return result
				432
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first turned into spaces, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				441
# Byte values (ints, since iterating a bytes object yields ints) that are
# never percent-encoded, whatever 'safe' set the caller supplies.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same values as a bytes object; quote_from_bytes() passes it to
# bytes.rstrip() to detect input that needs no quoting at all.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache filled by quote_from_bytes(): normalized 'safe' bytes -> the bound
# __getitem__ of the Quoter built for that safe set.
_safe_quoters = {}
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	448
class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte in the safe set (the always-safe characters plus those passed
    to the constructor) maps to its own character; every other byte maps
    to its '%XX' percent-encoded form.
    """
    # The defaultdict itself is the cache: once a key has been stored,
    # looking it up again does not run any Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Otherwise instances would be displayed as a plain defaultdict.
        return "<Quoter %r>" % (dict(self),)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
				470
				471	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	472	"""quote('abc def') -> 'abc%20def'
				473
				474	Each part of a URL, e.g. the path info, the query, etc., has a
				475	different set of reserved characters that must be quoted.
				476
				477	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				478	the following reserved characters.
				479
				480	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				481	"$" \| ","
				482
				483	Each of these characters is reserved in some component of a URL,
				484	but not necessarily in all of them.
				485
				486	By default, the quote function is intended for quoting the path
				487	section of a URL. Thus, it will not encode '/'. This character
				488	is reserved, but in typical usage the quote function is being
				489	called on a path where the existing slash characters are used as
				490	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	491
				492	string and safe may be either str or bytes objects. encoding must
				493	not be specified if string is a str.
				494
				495	The optional encoding and errors parameters specify how to deal with
				496	non-ASCII characters, as accepted by the str.encode method.
				497	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				498	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	499	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	500	if isinstance(string, str):
Florent Xicluna	37ddbb8	2010-08-14 21:06:29 +0000	[diff] [blame]	501	if not string:
				502	return string
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	503	if encoding is None:
				504	encoding = 'utf-8'
				505	if errors is None:
				506	errors = 'strict'
				507	string = string.encode(encoding, errors)
				508	else:
				509	if encoding is not None:
				510	raise TypeError("quote() doesn't support 'encoding' for bytes")
				511	if errors is not None:
				512	raise TypeError("quote() doesn't support 'errors' for bytes")
				513	return quote_from_bytes(string, safe)
				514
				515	def quote_plus(string, safe='', encoding=None, errors=None):
				516	"""Like quote(), but also replace ' ' with '+', as required for quoting
				517	HTML form values. Plus signs in the original string are escaped unless
				518	they are included in safe. It also does not have safe default to '/'.
				519	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	520	# Check if ' ' in string, where string may either be a str or bytes. If
				521	# there are no spaces, the regular quote will produce the right answer.
				522	if ((isinstance(string, str) and ' ' not in string) or
				523	(isinstance(string, bytes) and b' ' not in string)):
				524	return quote(string, safe, encoding, errors)
				525	if isinstance(safe, str):
				526	space = ' '
				527	else:
				528	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	529	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	530	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	531
				532	def quote_from_bytes(bs, safe='/'):
				533	"""Like quote(), but accepts a bytes object rather than a str, and does
				534	not perform string-to-bytes encoding. It always returns an ASCII string.
				535	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				536	"""
Florent Xicluna	37ddbb8	2010-08-14 21:06:29 +0000	[diff] [blame]	537	if not isinstance(bs, (bytes, bytearray)):
				538	raise TypeError("quote_from_bytes() expected bytes")
				539	if not bs:
				540	return ''
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	541	if isinstance(safe, str):
				542	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				543	safe = safe.encode('ascii', 'ignore')
Florent Xicluna	37ddbb8	2010-08-14 21:06:29 +0000	[diff] [blame]	544	else:
				545	safe = bytes([c for c in safe if c < 128])
				546	if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
				547	return bs.decode()
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	548	try:
Florent Xicluna	37ddbb8	2010-08-14 21:06:29 +0000	[diff] [blame]	549	quoter = _safe_quoters[safe]
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	550	except KeyError:
Florent Xicluna	37ddbb8	2010-08-14 21:06:29 +0000	[diff] [blame]	551	_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
				552	return ''.join([quoter(char) for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	553
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a mapping or a sequence of two-element tuples into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be str or bytes; anything else is converted with
    str().  bytes are passed to quote_plus() with safe only, everything
    else with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a sequence of tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # len() fails on non-sequences, and the first item of a
            # non-empty string is not a tuple.  Empty sequences of any type
            # are accepted, consistent with empty dicts being allowed.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def encode_part(part):
        # bytes are quoted as-is; anything else goes through str() first.
        if isinstance(part, bytes):
            return quote_plus(part, safe)
        return quote_plus(str(part), safe, encoding, errors)

    pairs = []
    if not doseq:
        for key, value in query:
            pairs.append(encode_part(key) + '=' + encode_part(value))
    else:
        for key, value in query:
            key = encode_part(key)
            if isinstance(value, bytes):
                pairs.append(key + '=' + quote_plus(value, safe))
            elif isinstance(value, str):
                pairs.append(key + '=' +
                             quote_plus(value, safe, encoding, errors))
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    len(value)
                except TypeError:
                    # not a sequence
                    pairs.append(key + '=' +
                                 quote_plus(str(value), safe,
                                            encoding, errors))
                else:
                    # one "key=element" parameter per sequence element
                    for element in value:
                        pairs.append(key + '=' + encode_part(element))
    return '&'.join(pairs)
				631
				632	# Utilities to parse URLs (most of these return None for missing parts):
				633	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				634	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				635	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				636	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				637	# splitpasswd('user:passwd') -> 'user', 'passwd'
				638	# splitport('host:port') --> 'host', 'port'
				639	# splitquery('/path?query') --> '/path', 'query'
				640	# splittag('/path#tag') --> '/path', 'tag'
				641	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				642	# '/path', ['attr1=value1', 'attr2=value2', ...]
				643	# splitvalue('attr=value') --> 'attr', 'value'
				644	# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
				646
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is round-tripped through ASCII: an equal str comes back
    when every character is ASCII, otherwise UnicodeError is raised.  Any
    argument that is not a str is handed back untouched.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				659
				660	def unwrap(url):
				661	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				662	url = str(url).strip()
				663	if url[:1] == '<' and url[-1:] == '>':
				664	url = url[1:-1].strip()
				665	if url[:4] == 'URL:': url = url[4:].strip()
				666	return url
				667
				668	_typeprog = None
				669	def splittype(url):
				670	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				671	global _typeprog
				672	if _typeprog is None:
				673	import re
				674	_typeprog = re.compile('^([^/:]+):')
				675
				676	match = _typeprog.match(url)
				677	if match:
				678	scheme = match.group(1)
				679	return scheme.lower(), url[len(scheme) + 1:]
				680	return None, url
				681
				682	_hostprog = None
				683	def splithost(url):
				684	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				685	global _hostprog
				686	if _hostprog is None:
				687	import re
				688	_hostprog = re.compile('^//([^/?])(.)$')
				689
				690	match = _hostprog.match(url)
				691	if match: return match.group(1, 2)
				692	return None, url
				693
				694	_userprog = None
				695	def splituser(host):
				696	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				697	global _userprog
				698	if _userprog is None:
				699	import re
				700	_userprog = re.compile('^(.)@(.)$')
				701
				702	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	703	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	704	return None, host
				705
				706	_passwdprog = None
				707	def splitpasswd(user):
				708	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				709	global _passwdprog
				710	if _passwdprog is None:
				711	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	712	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	713
				714	match = _passwdprog.match(user)
				715	if match: return match.group(1, 2)
				716	return user, None
				717
# splitport('host:port') --> 'host', 'port'
				719	_portprog = None
				720	def splitport(host):
				721	"""splitport('host:port') --> 'host', 'port'."""
				722	global _portprog
				723	if _portprog is None:
				724	import re
				725	_portprog = re.compile('^(.*):([0-9]+)$')
				726
				727	match = _portprog.match(host)
				728	if match: return match.group(1, 2)
				729	return host, None
				730
				731	_nportprog = None
				732	def splitnport(host, defport=-1):
				733	"""Split host and port, returning numeric port.
				734	Return given default port if no ':' found; defaults to -1.
				735	Return numerical port if a valid number are found after ':'.
				736	Return None if ':' but not a valid number."""
				737	global _nportprog
				738	if _nportprog is None:
				739	import re
				740	_nportprog = re.compile('^(.):(.)$')
				741
				742	match = _nportprog.match(host)
				743	if match:
				744	host, port = match.group(1, 2)
				745	try:
				746	if not port: raise ValueError("no digits")
				747	nport = int(port)
				748	except ValueError:
				749	nport = None
				750	return host, nport
				751	return host, defport
				752
				753	_queryprog = None
				754	def splitquery(url):
				755	"""splitquery('/path?query') --> '/path', 'query'."""
				756	global _queryprog
				757	if _queryprog is None:
				758	import re
				759	_queryprog = re.compile('^(.)\?([^?])$')
				760
				761	match = _queryprog.match(url)
				762	if match: return match.group(1, 2)
				763	return url, None
				764
				765	_tagprog = None
				766	def splittag(url):
				767	"""splittag('/path#tag') --> '/path', 'tag'."""
				768	global _tagprog
				769	if _tagprog is None:
				770	import re
				771	_tagprog = re.compile('^(.)#([^#])$')
				772
				773	match = _tagprog.match(url)
				774	if match: return match.group(1, 2)
				775	return url, None
				776
				777	def splitattr(url):
				778	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				779	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				780	words = url.split(';')
				781	return words[0], words[1:]
				782
				783	_valueprog = None
				784	def splitvalue(attr):
				785	"""splitvalue('attr=value') --> 'attr', 'value'."""
				786	global _valueprog
				787	if _valueprog is None:
				788	import re
				789	_valueprog = re.compile('^([^=])=(.)$')
				790
				791	match = _valueprog.match(attr)
				792	if match: return match.group(1, 2)
				793	return attr, None