Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: 16f46d6e48754f2a5d0c7a070dbe4d234fb6d723 [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
				3	See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
				4	UC Irvine, June 1995.
				5	"""
				6
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	7	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	8	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	9
# Names exported by ``from urllib.parse import *``.  urlencode() is a public
# function of this module (no leading underscore), so it is exported as well.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	14
				15	# A classification of schemes ('' means apply by default)
				16	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				17	'wais', 'file', 'https', 'shttp', 'mms',
				18	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				19	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				20	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				21	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	22	'svn', 'svn+ssh', 'sftp','nfs']
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	23	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				24	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				25	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				26	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				27	'mms', '', 'sftp']
				28	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				29	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				30	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				31	'nntp', 'wais', 'https', 'shttp', 'snews',
				32	'file', 'prospero', '']
				33
				34	# Characters valid in scheme names
				35	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				36	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				37	'0123456789'
				38	'+-.')
				39
				40	MAX_CACHE_SIZE = 20
				41	_parse_cache = {}
				42
				43	def clear_cache():
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame^]	44	"""Clear the parse cache and the quoters cache."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	45	_parse_cache.clear()
Antoine Pitrou	2df5fc7	2009-12-08 19:38:17 +0000	[diff] [blame^]	46	_safe_quoters.clear()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	47
				48
				49	class ResultMixin(object):
				50	"""Shared methods for the parsed result objects."""
				51
				52	@property
				53	def username(self):
				54	netloc = self.netloc
				55	if "@" in netloc:
				56	userinfo = netloc.rsplit("@", 1)[0]
				57	if ":" in userinfo:
				58	userinfo = userinfo.split(":", 1)[0]
				59	return userinfo
				60	return None
				61
				62	@property
				63	def password(self):
				64	netloc = self.netloc
				65	if "@" in netloc:
				66	userinfo = netloc.rsplit("@", 1)[0]
				67	if ":" in userinfo:
				68	return userinfo.split(":", 1)[1]
				69	return None
				70
				71	@property
				72	def hostname(self):
				73	netloc = self.netloc
				74	if "@" in netloc:
				75	netloc = netloc.rsplit("@", 1)[1]
				76	if ":" in netloc:
				77	netloc = netloc.split(":", 1)[0]
				78	return netloc.lower() or None
				79
				80	@property
				81	def port(self):
				82	netloc = self.netloc
				83	if "@" in netloc:
				84	netloc = netloc.rsplit("@", 1)[1]
				85	if ":" in netloc:
				86	port = netloc.split(":", 1)[1]
				87	return int(port, 10)
				88	return None
				89
				90	from collections import namedtuple
				91
				92	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				93
				94	__slots__ = ()
				95
				96	def geturl(self):
				97	return urlunsplit(self)
				98
				99
				100	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				101
				102	__slots__ = ()
				103
				104	def geturl(self):
				105	return urlunparse(self)
				106
				107
				108	def urlparse(url, scheme='', allow_fragments=True):
				109	"""Parse a URL into 6 components:
				110	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				111	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				112	Note that we don't break the components up in smaller bits
				113	(e.g. netloc is a single string) and we don't expand % escapes."""
				114	tuple = urlsplit(url, scheme, allow_fragments)
				115	scheme, netloc, url, query, fragment = tuple
				116	if scheme in uses_params and ';' in url:
				117	url, params = _splitparams(url)
				118	else:
				119	params = ''
				120	return ParseResult(scheme, netloc, url, params, query, fragment)
				121
				122	def _splitparams(url):
				123	if '/' in url:
				124	i = url.find(';', url.rfind('/'))
				125	if i < 0:
				126	return url, ''
				127	else:
				128	i = url.find(';')
				129	return url[:i], url[i+1:]
				130
				131	def _splitnetloc(url, start=0):
				132	delim = len(url) # position of end of domain part of url, default is end
				133	for c in '/?#': # look for delimiters; the order is NOT important
				134	wdelim = url.find(c, start) # find first of this delim
				135	if wdelim >= 0: # if found
				136	delim = min(delim, wdelim) # use earliest delim position
				137	return url[start:delim], url[delim:] # return (domain, rest)
				138
				139	def urlsplit(url, scheme='', allow_fragments=True):
				140	"""Parse a URL into 5 components:
				141	<scheme>://<netloc>/<path>?<query>#<fragment>
				142	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				143	Note that we don't break the components up in smaller bits
				144	(e.g. netloc is a single string) and we don't expand % escapes."""
				145	allow_fragments = bool(allow_fragments)
				146	key = url, scheme, allow_fragments, type(url), type(scheme)
				147	cached = _parse_cache.get(key, None)
				148	if cached:
				149	return cached
				150	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				151	clear_cache()
				152	netloc = query = fragment = ''
				153	i = url.find(':')
				154	if i > 0:
				155	if url[:i] == 'http': # optimize the common case
				156	scheme = url[:i].lower()
				157	url = url[i+1:]
				158	if url[:2] == '//':
				159	netloc, url = _splitnetloc(url, 2)
				160	if allow_fragments and '#' in url:
				161	url, fragment = url.split('#', 1)
				162	if '?' in url:
				163	url, query = url.split('?', 1)
				164	v = SplitResult(scheme, netloc, url, query, fragment)
				165	_parse_cache[key] = v
				166	return v
				167	for c in url[:i]:
				168	if c not in scheme_chars:
				169	break
				170	else:
				171	scheme, url = url[:i].lower(), url[i+1:]
				172	if scheme in uses_netloc and url[:2] == '//':
				173	netloc, url = _splitnetloc(url, 2)
				174	if allow_fragments and scheme in uses_fragment and '#' in url:
				175	url, fragment = url.split('#', 1)
				176	if scheme in uses_query and '?' in url:
				177	url, query = url.split('?', 1)
				178	v = SplitResult(scheme, netloc, url, query, fragment)
				179	_parse_cache[key] = v
				180	return v
				181
				182	def urlunparse(components):
				183	"""Put a parsed URL back together again. This may result in a
				184	slightly different, but equivalent URL, if the URL that was parsed
				185	originally had redundant delimiters, e.g. a ? with an empty query
				186	(the draft states that these are equivalent)."""
				187	scheme, netloc, url, params, query, fragment = components
				188	if params:
				189	url = "%s;%s" % (url, params)
				190	return urlunsplit((scheme, netloc, url, query, fragment))
				191
				192	def urlunsplit(components):
				193	scheme, netloc, url, query, fragment = components
				194	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				195	if url and url[:1] != '/': url = '/' + url
				196	url = '//' + (netloc or '') + url
				197	if scheme:
				198	url = scheme + ':' + url
				199	if query:
				200	url = url + '?' + query
				201	if fragment:
				202	url = url + '#' + fragment
				203	return url
				204
				205	def urljoin(base, url, allow_fragments=True):
				206	"""Join a base URL and a possibly relative URL to form an absolute
				207	interpretation of the latter."""
				208	if not base:
				209	return url
				210	if not url:
				211	return base
				212	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
				213	urlparse(base, '', allow_fragments)
				214	scheme, netloc, path, params, query, fragment = \
				215	urlparse(url, bscheme, allow_fragments)
				216	if scheme != bscheme or scheme not in uses_relative:
				217	return url
				218	if scheme in uses_netloc:
				219	if netloc:
				220	return urlunparse((scheme, netloc, path,
				221	params, query, fragment))
				222	netloc = bnetloc
				223	if path[:1] == '/':
				224	return urlunparse((scheme, netloc, path,
				225	params, query, fragment))
Facundo Batista	23e3856	2008-08-14 16:55:14 +0000	[diff] [blame]	226	if not path:
				227	path = bpath
				228	if not params:
				229	params = bparams
				230	else:
				231	path = path[:-1]
				232	return urlunparse((scheme, netloc, path,
				233	params, query, fragment))
				234	if not query:
				235	query = bquery
				236	return urlunparse((scheme, netloc, path,
				237	params, query, fragment))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	238	segments = bpath.split('/')[:-1] + path.split('/')
				239	# XXX The stuff below is bogus in various ways...
				240	if segments[-1] == '.':
				241	segments[-1] = ''
				242	while '.' in segments:
				243	segments.remove('.')
				244	while 1:
				245	i = 1
				246	n = len(segments) - 1
				247	while i < n:
				248	if (segments[i] == '..'
				249	and segments[i-1] not in ('', '..')):
				250	del segments[i-1:i+1]
				251	break
				252	i = i+1
				253	else:
				254	break
				255	if segments == ['', '..']:
				256	segments[-1] = ''
				257	elif len(segments) >= 2 and segments[-1] == '..':
				258	segments[-2:] = ['']
				259	return urlunparse((scheme, netloc, '/'.join(segments),
				260	params, query, fragment))
				261
				262	def urldefrag(url):
				263	"""Removes any existing fragment from URL.
				264
				265	Returns a tuple of the defragmented URL and the fragment. If
				266	the URL contained no fragments, the second element is the
				267	empty string.
				268	"""
				269	if '#' in url:
				270	s, n, p, a, q, frag = urlparse(url)
				271	defrag = urlunparse((s, n, p, a, q, ''))
				272	return defrag, frag
				273	else:
				274	return url, ''
				275
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes-like object.
    A '%' that is not followed by exactly two hexadecimal digits is copied
    to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = frozenset(b'0123456789ABCDEFabcdef')
    pieces = string.split(b'%')
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # int(x, 16) alone is too lenient: it accepts a single digit, a
        # leading sign and surrounding whitespace, none of which form a
        # valid percent escape.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	291
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a placeholder
    character.  A '%' that is not followed by exactly two hexadecimal
    digits is kept verbatim.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Byte values of the current run of adjacent %xx escapes.  They are
    # decoded together because one character may span several escapes.
    pending = []
    for item in pieces[1:]:
        pair = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips whitespace,
        # which would make '%  ' decode to nothing and silently drop it.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pending.append(int(pair, 16))
            tail = item[2:]
        else:
            tail = '%' + item
        if tail:
            # Literal text ends the run of escapes: flush it first.
            out.append(bytes(pending).decode(encoding, errors))
            out.append(tail)
            pending = []
    if pending:
        # The string ended with a run of escapes.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
				332
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed.

    keep_blank_values: when true, blank values are kept as empty strings;
        when false (the default) fields with a blank value are ignored as
        if they were not included.

    strict_parsing: when true, parsing errors raise ValueError; when false
        (the default) they are silently ignored.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
				358
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed.  Fields may be separated by
        '&' or ';'.

    keep_blank_values: when true, blank values are kept as empty strings;
        when false (the default) fields with a blank value are ignored as
        if they were not included.

    strict_parsing: when true, parsing errors raise ValueError; when false
        (the default) they are silently ignored.

    Returns a list of (name, value) tuples in the order they appear.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    parsed = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        parts = field.split('=', 1)
        if len(parts) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without an equals sign.
            if not keep_blank_values:
                continue
            parts.append('')
        if len(parts[1]) or keep_blank_values:
            name = unquote(parts[0].replace('+', ' '))
            value = unquote(parts[1].replace('+', ' '))
            parsed.append((name, value))

    return parsed
				398
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but turn plus signs into spaces first.

    This is the decoding required for HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				407
				408	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				409	b'abcdefghijklmnopqrstuvwxyz'
				410	b'0123456789'
				411	b'_.-')
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	412	_safe_quoters= {}
				413
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	414	class Quoter(collections.defaultdict):
				415	"""A mapping from bytes (in range(0,256)) to strings.
				416
				417	String values are percent-encoded byte values, unless the key < 128, and
				418	in the "safe" set (either the specified safe set, or default set).
				419	"""
				420	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				421	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	422	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	423	"""safe: bytes object."""
				424	self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	425
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	426	def __repr__(self):
				427	# Without this, will just display as a defaultdict
				428	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	429
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	430	def __missing__(self, b):
				431	# Handle a cache miss. Store quoted string in cache and return.
				432	res = b in self.safe and chr(b) or ('%%%02X' % b)
				433	self[b] = res
				434	return res
				435
				436	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	437	"""quote('abc def') -> 'abc%20def'
				438
				439	Each part of a URL, e.g. the path info, the query, etc., has a
				440	different set of reserved characters that must be quoted.
				441
				442	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				443	the following reserved characters.
				444
				445	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				446	"$" \| ","
				447
				448	Each of these characters is reserved in some component of a URL,
				449	but not necessarily in all of them.
				450
				451	By default, the quote function is intended for quoting the path
				452	section of a URL. Thus, it will not encode '/'. This character
				453	is reserved, but in typical usage the quote function is being
				454	called on a path where the existing slash characters are used as
				455	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	456
				457	string and safe may be either str or bytes objects. encoding must
				458	not be specified if string is a str.
				459
				460	The optional encoding and errors parameters specify how to deal with
				461	non-ASCII characters, as accepted by the str.encode method.
				462	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				463	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	464	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	465	if isinstance(string, str):
				466	if encoding is None:
				467	encoding = 'utf-8'
				468	if errors is None:
				469	errors = 'strict'
				470	string = string.encode(encoding, errors)
				471	else:
				472	if encoding is not None:
				473	raise TypeError("quote() doesn't support 'encoding' for bytes")
				474	if errors is not None:
				475	raise TypeError("quote() doesn't support 'errors' for bytes")
				476	return quote_from_bytes(string, safe)
				477
				478	def quote_plus(string, safe='', encoding=None, errors=None):
				479	"""Like quote(), but also replace ' ' with '+', as required for quoting
				480	HTML form values. Plus signs in the original string are escaped unless
				481	they are included in safe. It also does not have safe default to '/'.
				482	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	483	# Check if ' ' in string, where string may either be a str or bytes. If
				484	# there are no spaces, the regular quote will produce the right answer.
				485	if ((isinstance(string, str) and ' ' not in string) or
				486	(isinstance(string, bytes) and b' ' not in string)):
				487	return quote(string, safe, encoding, errors)
				488	if isinstance(safe, str):
				489	space = ' '
				490	else:
				491	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	492	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	493	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	494
				495	def quote_from_bytes(bs, safe='/'):
				496	"""Like quote(), but accepts a bytes object rather than a str, and does
				497	not perform string-to-bytes encoding. It always returns an ASCII string.
				498	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				499	"""
				500	if isinstance(safe, str):
				501	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				502	safe = safe.encode('ascii', 'ignore')
				503	cachekey = bytes(safe) # In case it was a bytearray
				504	if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
				505	raise TypeError("quote_from_bytes() expected a bytes")
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	506	try:
				507	quoter = _safe_quoters[cachekey]
				508	except KeyError:
				509	quoter = Quoter(safe)
				510	_safe_quoters[cachekey] = quoter
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	511	return ''.join([quoter[char] for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	512
def urlencode(query, doseq=False):
    """Encode a mapping or a sequence of two-element tuples as a query string.

    If doseq is true, any value that is a non-string sequence is expanded
    into one ``key=value`` parameter per element.

    If *query* is a sequence of two-element tuples, the parameters appear
    in the output in the same order as in the input.

    Raises TypeError when *query* is neither a mapping nor a valid
    non-string sequence.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequences fail on len(); non-empty strings fail the tuple
            # check on their first element.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of any type get through; that matches
            # the historical acceptance of empty dicts.
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    pairs = []
    for key, value in query:
        key = quote_plus(str(key))
        if not doseq:
            pairs.append(key + '=' + quote_plus(str(value)))
        elif isinstance(value, str):
            pairs.append(key + '=' + quote_plus(value))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # Not a sequence: encode it as a single value.
                pairs.append(key + '=' + quote_plus(str(value)))
            else:
                # One parameter per element of the sequence.
                for elt in value:
                    pairs.append(key + '=' + quote_plus(str(elt)))
    return '&'.join(pairs)
				568
				569	# Utilities to parse URLs (most of these return None for missing parts):
				570	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				571	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				572	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				573	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				574	# splitpasswd('user:passwd') -> 'user', 'passwd'
				575	# splitport('host:port') --> 'host', 'port'
				576	# splitquery('/path?query') --> '/path', 'query'
				577	# splittag('/path#tag') --> '/path', 'tag'
				578	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				579	# '/path', ['attr1=value1', 'attr2=value2', ...]
				580	# splitvalue('attr=value') --> 'attr', 'value'
				581	# urllib.parse.unquote('abc%20def') -> 'abc def'
				582	# quote('abc def') -> 'abc%20def')
				583
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; anything else
    is returned unchanged.  Raises UnicodeError for a non-ASCII str.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				596
				597	def unwrap(url):
				598	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				599	url = str(url).strip()
				600	if url[:1] == '<' and url[-1:] == '>':
				601	url = url[1:-1].strip()
				602	if url[:4] == 'URL:': url = url[4:].strip()
				603	return url
				604
				605	_typeprog = None
				606	def splittype(url):
				607	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				608	global _typeprog
				609	if _typeprog is None:
				610	import re
				611	_typeprog = re.compile('^([^/:]+):')
				612
				613	match = _typeprog.match(url)
				614	if match:
				615	scheme = match.group(1)
				616	return scheme.lower(), url[len(scheme) + 1:]
				617	return None, url
				618
				619	_hostprog = None
				620	def splithost(url):
				621	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				622	global _hostprog
				623	if _hostprog is None:
				624	import re
				625	_hostprog = re.compile('^//([^/?])(.)$')
				626
				627	match = _hostprog.match(url)
				628	if match: return match.group(1, 2)
				629	return None, url
				630
				631	_userprog = None
				632	def splituser(host):
				633	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				634	global _userprog
				635	if _userprog is None:
				636	import re
				637	_userprog = re.compile('^(.)@(.)$')
				638
				639	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	640	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	641	return None, host
				642
				643	_passwdprog = None
				644	def splitpasswd(user):
				645	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				646	global _passwdprog
				647	if _passwdprog is None:
				648	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	649	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	650
				651	match = _passwdprog.match(user)
				652	if match: return match.group(1, 2)
				653	return user, None
				654
				655	# splittag('/path#tag') --> '/path', 'tag'
				656	_portprog = None
				657	def splitport(host):
				658	"""splitport('host:port') --> 'host', 'port'."""
				659	global _portprog
				660	if _portprog is None:
				661	import re
				662	_portprog = re.compile('^(.*):([0-9]+)$')
				663
				664	match = _portprog.match(host)
				665	if match: return match.group(1, 2)
				666	return host, None
				667
				668	_nportprog = None
				669	def splitnport(host, defport=-1):
				670	"""Split host and port, returning numeric port.
				671	Return given default port if no ':' found; defaults to -1.
				672	Return numerical port if a valid number are found after ':'.
				673	Return None if ':' but not a valid number."""
				674	global _nportprog
				675	if _nportprog is None:
				676	import re
				677	_nportprog = re.compile('^(.):(.)$')
				678
				679	match = _nportprog.match(host)
				680	if match:
				681	host, port = match.group(1, 2)
				682	try:
				683	if not port: raise ValueError("no digits")
				684	nport = int(port)
				685	except ValueError:
				686	nport = None
				687	return host, nport
				688	return host, defport
				689
				690	_queryprog = None
				691	def splitquery(url):
				692	"""splitquery('/path?query') --> '/path', 'query'."""
				693	global _queryprog
				694	if _queryprog is None:
				695	import re
				696	_queryprog = re.compile('^(.)\?([^?])$')
				697
				698	match = _queryprog.match(url)
				699	if match: return match.group(1, 2)
				700	return url, None
				701
				702	_tagprog = None
				703	def splittag(url):
				704	"""splittag('/path#tag') --> '/path', 'tag'."""
				705	global _tagprog
				706	if _tagprog is None:
				707	import re
				708	_tagprog = re.compile('^(.)#([^#])$')
				709
				710	match = _tagprog.match(url)
				711	if match: return match.group(1, 2)
				712	return url, None
				713
				714	def splitattr(url):
				715	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				716	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				717	words = url.split(';')
				718	return words[0], words[1:]
				719
				720	_valueprog = None
				721	def splitvalue(attr):
				722	"""splitvalue('attr=value') --> 'attr', 'value'."""
				723	global _valueprog
				724	if _valueprog is None:
				725	import re
				726	_valueprog = re.compile('^([^=])=(.)$')
				727
				728	match = _valueprog.match(attr)
				729	if match: return match.group(1, 2)
				730	return attr, None
				731
				732	test_input = """
				733	http://a/b/c/d
				734
				735	g:h = <URL:g:h>
				736	http:g = <URL:http://a/b/c/g>
				737	http: = <URL:http://a/b/c/d>
				738	g = <URL:http://a/b/c/g>
				739	./g = <URL:http://a/b/c/g>
				740	g/ = <URL:http://a/b/c/g/>
				741	/g = <URL:http://a/g>
				742	//g = <URL:http://g>
				743	?y = <URL:http://a/b/c/d?y>
				744	g?y = <URL:http://a/b/c/g?y>
				745	g?y/./x = <URL:http://a/b/c/g?y/./x>
				746	. = <URL:http://a/b/c/>
				747	./ = <URL:http://a/b/c/>
				748	.. = <URL:http://a/b/>
				749	../ = <URL:http://a/b/>
				750	../g = <URL:http://a/b/g>
				751	../.. = <URL:http://a/>
				752	../../g = <URL:http://a/g>
				753	../../../g = <URL:http://a/../g>
				754	./../g = <URL:http://a/b/g>
				755	./g/. = <URL:http://a/b/c/g/>
				756	/./g = <URL:http://a/./g>
				757	g/./h = <URL:http://a/b/c/g/h>
				758	g/../h = <URL:http://a/b/c/h>
				759	http:g = <URL:http://a/b/c/g>
				760	http: = <URL:http://a/b/c/d>
				761	http:?y = <URL:http://a/b/c/d?y>
				762	http:g?y = <URL:http://a/b/c/g?y>
				763	http:g?y/./x = <URL:http://a/b/c/g?y/./x>
				764	"""
				765
				766	def test():
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	767	base = ''
				768	if sys.argv[1:]:
				769	fn = sys.argv[1]
				770	if fn == '-':
				771	fp = sys.stdin
				772	else:
				773	fp = open(fn)
				774	else:
				775	from io import StringIO
				776	fp = StringIO(test_input)
				777	for line in fp:
				778	words = line.split()
				779	if not words:
				780	continue
				781	url = words[0]
				782	parts = urlparse(url)
				783	print('%-10s : %s' % (url, parts))
				784	abs = urljoin(base, url)
				785	if not base:
				786	base = abs
				787	wrapped = '<URL:%s>' % abs
				788	print('%-10s = %s' % (url, wrapped))
				789	if len(words) == 3 and words[1] == '=':
				790	if wrapped != words[2]:
				791	print('EXPECTED', words[2], '!!!!!!!!!!')
				792
				793	if __name__ == '__main__':
				794	test()