"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

import sys
import collections

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif '[' in netloc or ']' in netloc:
            raise ValueError("Invalid IPv6 hostname")
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            return int(port, 10)
        else:
            return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)
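
# Illustrative sketch only (made-up example URL): SplitResult and ParseResult
# are named tuples, so fields are available by name or index, and the
# ResultMixin properties pull the user, password, host and port apart.
#
#     >>> r = urlsplit('http://user:pw@www.example.com:80/index.html')
#     >>> r.netloc, r.path
#     ('user:pw@www.example.com:80', '/index.html')
#     >>> r.username, r.password, r.hostname, r.port
#     ('user', 'pw', 'www.example.com', 80)
#     >>> r.geturl()
#     'http://user:pw@www.example.com:80/index.html'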


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
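
# A rough usage sketch (the URL below is hypothetical):
#
#     >>> urlparse('http://a/p;x?q=1#f')
#     ParseResult(scheme='http', netloc='a', path='/p', params='x', query='q=1', fragment='f')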

def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    if '[' in url:     # check for invalid IPv6 URL
        if not ']' in url: raise ValueError("Invalid IPv6 URL")
    elif ']' in url:
        if not '[' in url: raise ValueError("Invalid IPv6 URL")
    for c in '/?#':                    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)    # find first of this delim
        if wdelim >= 0:                # if found
            delim = min(delim, wdelim) # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
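
# Illustrative sketch only (example URL is made up): urlsplit() leaves any
# params attached to the path and caches results keyed on its arguments.
#
#     >>> urlsplit('https://host/p;x?q=1#f')
#     SplitResult(scheme='https', netloc='host', path='/p;x', query='q=1', fragment='f')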

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = components
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string."""
    scheme, netloc, url, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
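
# Rough round-trip sketch (illustrative only): unsplitting the result of
# urlsplit() reproduces an equivalent URL, though redundant delimiters such
# as a '?' with an empty query may be dropped.
#
#     >>> urlunsplit(urlsplit('http://host/path?q=1#f'))
#     'http://host/path?q=1#f'
#     >>> urlunsplit(('http', 'host', 'path', '', ''))
#     'http://host/path'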

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
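
# Quick sketch of the intended behaviour (these pairs mirror the test table
# at the bottom of this module):
#
#     >>> urljoin('http://a/b/c/d', 'g')
#     'http://a/b/c/g'
#     >>> urljoin('http://a/b/c/d', '../../g')
#     'http://a/g'
#     >>> urljoin('http://a/b/c/d', '//g')
#     'http://g'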

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
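
# Sketch of expected use (example URL is hypothetical):
#
#     >>> urldefrag('http://host/doc.html#section-2')
#     ('http://host/doc.html', 'section-2')
#     >>> urldefrag('http://host/doc.html')
#     ('http://host/doc.html', '')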

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8.  This is only an issue if the string
    # contains unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = bytes([int(item[:2], 16)]) + item[2:]
        except ValueError:
            res[i] = b'%' + item
    return b''.join(res)

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None: encoding = 'utf-8'
    if errors is None: errors = 'replace'
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    # (list of single-byte bytes objects)
    pct_sequence = []
    res = string.split('%')
    for i in range(1, len(res)):
        item = res[i]
        try:
            if not item: raise ValueError
            pct_sequence.append(bytes.fromhex(item[:2]))
            rest = item[2:]
        except ValueError:
            rest = '%' + item
        if not rest:
            # This segment was just a single percent-encoded character.
            # May be part of a sequence of code units, so delay decoding.
            # (Stored in pct_sequence).
            res[i] = ''
        else:
            # Encountered non-percent-encoded characters.  Flush the current
            # pct_sequence.
            res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
            pct_sequence = []
    if pct_sequence:
        # Flush the final pct_sequence
        # res[-1] will always be empty if pct_sequence != []
        assert not res[-1], "string=%r, res=%r" % (string, res)
        res[-1] = b''.join(pct_sequence).decode(encoding, errors)
    return ''.join(res)
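
# Minimal sketch (assumes the default UTF-8 percent-encoding; the input is
# made up): consecutive escapes are collected and decoded as one sequence.
#
#     >>> unquote('El%20Ni%C3%B1o')
#     'El Niño'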

def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict
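
# Rough illustration (made-up query string); repeated names collect into
# lists, and dictionary display order may vary:
#
#     >>> parse_qs('a=1&a=2&b=3')
#     {'a': ['1', '2'], 'b': ['3']}
#     >>> parse_qs('a=1&b=')
#     {'a': ['1']}
#     >>> parse_qs('a=1&b=', keep_blank_values=True)
#     {'a': ['1'], 'b': ['']}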

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
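
# Brief sketch (example input only): both '&' and ';' separate fields, and
# pairs come back in input order.
#
#     >>> parse_qsl('a=1;a=2&name=El+Ni%C3%B1o')
#     [('a', '1'), ('a', '2'), ('name', 'El Niño')]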

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key is < 128 and
    in the "safe" set (either the specified safe set, or the default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss.  Store quoted string in cache and return.
        res = b in self.safe and chr(b) or ('%%%02X' % b)
        self[b] = res
        return res
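
# Internal helper sketch (illustrative; Quoter is private to this module):
# safe bytes map to themselves, everything else to a %XX escape, and each
# answer is memoised in the defaultdict.
#
#     >>> q = Quoter(b'/')
#     >>> q[ord('/')], q[ord(' ')], q[0xff]
#     ('/', '%20', '%FF')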

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
               "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding must
    not be specified if string is a str.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)
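
# Short usage sketch (example strings are arbitrary): '/' stays literal by
# default, and non-ASCII text is UTF-8 encoded before being percent-escaped.
#
#     >>> quote('/El Niño/')
#     '/El%20Ni%C3%B1o/'
#     >>> quote('/El Niño/', safe='')
#     '%2FEl%20Ni%C3%B1o%2F'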

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
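
# Illustrative only (arbitrary form value): spaces become '+', while '=' and
# '&' are percent-escaped because safe defaults to ''.
#
#     >>> quote_plus('key=value & more')
#     'key%3Dvalue+%26+more'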

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cachekey = bytes(safe)  # In case it was a bytearray
    if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    try:
        quoter = _safe_quoters[cachekey]
    except KeyError:
        quoter = Quoter(safe)
        _safe_quoters[cachekey] = quoter
    return ''.join([quoter[char] for char in bs])

def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or a dictionary into a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
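
# Rough sketch (hypothetical form data).  A list of pairs keeps its order;
# with doseq=True each element of a sequence value becomes its own pair.
#
#     >>> urlencode([('a', 1), ('b', 'x y')])
#     'a=1&b=x+y'
#     >>> urlencode([('k', ['v1', 'v2'])], doseq=True)
#     'k=v1&k=v2'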

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
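
# Small sketch of the distinction (made-up host strings): splitport() keeps
# the port as a string, splitnport() converts it and supplies a default.
#
#     >>> splitport('example.com:8080')
#     ('example.com', '8080')
#     >>> splitnport('example.com')
#     ('example.com', -1)
#     >>> splitnport('example.com:http')
#     ('example.com', None)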

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

test_input = """
      http://a/b/c/d

      g:h      = <URL:g:h>
      http:g   = <URL:http://a/b/c/g>
      http:    = <URL:http://a/b/c/d>
      g        = <URL:http://a/b/c/g>
      ./g      = <URL:http://a/b/c/g>
      g/       = <URL:http://a/b/c/g/>
      /g       = <URL:http://a/g>
      //g      = <URL:http://g>
      ?y       = <URL:http://a/b/c/d?y>
      g?y      = <URL:http://a/b/c/g?y>
      g?y/./x  = <URL:http://a/b/c/g?y/./x>
      .        = <URL:http://a/b/c/>
      ./       = <URL:http://a/b/c/>
      ..       = <URL:http://a/b/>
      ../      = <URL:http://a/b/>
      ../g     = <URL:http://a/b/g>
      ../..    = <URL:http://a/>
      ../../g  = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g   = <URL:http://a/b/g>
      ./g/.    = <URL:http://a/b/c/g/>
      /./g     = <URL:http://a/./g>
      g/./h    = <URL:http://a/b/c/g/h>
      g/../h   = <URL:http://a/b/c/h>
      http:g   = <URL:http://a/b/c/g>
      http:    = <URL:http://a/b/c/d>
      http:?y  = <URL:http://a/b/c/d?y>
      http:g?y = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

def test():
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED', words[2], '!!!!!!!!!!')

if __name__ == '__main__':
    test()