Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: 903368329bcaba4a0b61fd0cdec900b6add1ef22 [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
				3	See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
				4	UC Irvine, June 1995.
				5	"""
				6
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	7	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	8	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	9
# Names exported by ``from <module> import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
# uses_relative: schemes for which urljoin() resolves relative references.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# uses_netloc: schemes whose URLs carry a '//netloc' part (consulted by
# urlsplit(), urlunsplit() and urljoin()).
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs']
# non_hierarchical: not referenced by the code visible in this file.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# uses_params: schemes for which urlparse() splits ';params' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# uses_query: schemes for which urlsplit() splits off a '?query'.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# uses_fragment: schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache and empties the whole
# cache once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
				42
				43	def clear_cache():
				44	"""Clear the parse cache."""
				45	_parse_cache.clear()
				46
				47
				48	class ResultMixin(object):
				49	"""Shared methods for the parsed result objects."""
				50
				51	@property
				52	def username(self):
				53	netloc = self.netloc
				54	if "@" in netloc:
				55	userinfo = netloc.rsplit("@", 1)[0]
				56	if ":" in userinfo:
				57	userinfo = userinfo.split(":", 1)[0]
				58	return userinfo
				59	return None
				60
				61	@property
				62	def password(self):
				63	netloc = self.netloc
				64	if "@" in netloc:
				65	userinfo = netloc.rsplit("@", 1)[0]
				66	if ":" in userinfo:
				67	return userinfo.split(":", 1)[1]
				68	return None
				69
				70	@property
				71	def hostname(self):
				72	netloc = self.netloc
				73	if "@" in netloc:
				74	netloc = netloc.rsplit("@", 1)[1]
				75	if ":" in netloc:
				76	netloc = netloc.split(":", 1)[0]
				77	return netloc.lower() or None
				78
				79	@property
				80	def port(self):
				81	netloc = self.netloc
				82	if "@" in netloc:
				83	netloc = netloc.rsplit("@", 1)[1]
				84	if ":" in netloc:
				85	port = netloc.split(":", 1)[1]
				86	return int(port, 10)
				87	return None
				88
				89	from collections import namedtuple
				90
				91	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				92
				93	__slots__ = ()
				94
				95	def geturl(self):
				96	return urlunsplit(self)
				97
				98
				99	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				100
				101	__slots__ = ()
				102
				103	def geturl(self):
				104	return urlunparse(self)
				105
				106
				107	def urlparse(url, scheme='', allow_fragments=True):
				108	"""Parse a URL into 6 components:
				109	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				110	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				111	Note that we don't break the components up in smaller bits
				112	(e.g. netloc is a single string) and we don't expand % escapes."""
				113	tuple = urlsplit(url, scheme, allow_fragments)
				114	scheme, netloc, url, query, fragment = tuple
				115	if scheme in uses_params and ';' in url:
				116	url, params = _splitparams(url)
				117	else:
				118	params = ''
				119	return ParseResult(scheme, netloc, url, params, query, fragment)
				120
				121	def _splitparams(url):
				122	if '/' in url:
				123	i = url.find(';', url.rfind('/'))
				124	if i < 0:
				125	return url, ''
				126	else:
				127	i = url.find(';')
				128	return url[:i], url[i+1:]
				129
				130	def _splitnetloc(url, start=0):
				131	delim = len(url) # position of end of domain part of url, default is end
				132	for c in '/?#': # look for delimiters; the order is NOT important
				133	wdelim = url.find(c, start) # find first of this delim
				134	if wdelim >= 0: # if found
				135	delim = min(delim, wdelim) # use earliest delim position
				136	return url[start:delim], url[delim:] # return (domain, rest)
				137
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    ``scheme`` is the default used when ``url`` has no scheme prefix;
    when ``allow_fragments`` is false no '#fragment' is split off.
    Results are memoized in the module-level _parse_cache."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # values of different types do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # Fast path: 'http' is known to use netloc, fragment and query,
            # so the scheme-table lookups below are skipped.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # The text before ':' is a scheme only if every character is a
        # valid scheme character; the else clause runs when no break occurred.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    # General path: each component is split off only for schemes listed
    # in the corresponding uses_* table.
    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
				180
				181	def urlunparse(components):
				182	"""Put a parsed URL back together again. This may result in a
				183	slightly different, but equivalent URL, if the URL that was parsed
				184	originally had redundant delimiters, e.g. a ? with an empty query
				185	(the draft states that these are equivalent)."""
				186	scheme, netloc, url, params, query, fragment = components
				187	if params:
				188	url = "%s;%s" % (url, params)
				189	return urlunsplit((scheme, netloc, url, query, fragment))
				190
				191	def urlunsplit(components):
				192	scheme, netloc, url, query, fragment = components
				193	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				194	if url and url[:1] != '/': url = '/' + url
				195	url = '//' + (netloc or '') + url
				196	if scheme:
				197	url = scheme + ':' + url
				198	if query:
				199	url = url + '?' + query
				200	if fragment:
				201	url = url + '#' + fragment
				202	return url
				203
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty ``base`` returns ``url`` unchanged and an empty ``url``
    returns ``base``.  ``allow_fragments`` is passed through to urlparse().
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # resolution, means ``url`` is returned as-is.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A reference with its own netloc replaces everything after the scheme.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, and the base params/query
        # wherever the reference supplies none of its own.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append
    # the segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment so the result keeps its
    # trailing '/'; all other '.' segments are removed.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse one '<segment>/..' pair (scanning indices
    # 1 .. len-2) until a full scan finds nothing left to collapse.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' left in final position is not covered by the scan above.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
				260
				261	def urldefrag(url):
				262	"""Removes any existing fragment from URL.
				263
				264	Returns a tuple of the defragmented URL and the fragment. If
				265	the URL contained no fragments, the second element is the
				266	empty string.
				267	"""
				268	if '#' in url:
				269	s, n, p, a, q, frag = urlparse(url)
				270	defrag = urlunparse((s, n, p, a, q, ''))
				271	return defrag, frag
				272	else:
				273	return url, ''
				274
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    ``string`` may be a str (encoded as UTF-8 first) or a bytes-like
    object.  A '%' that is not followed by exactly two hexadecimal digits
    is left in the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    # pieces[0] precedes the first '%' and is copied through unchanged.
    for i in range(1, len(pieces)):
        item = pieces[i]
        pair = item[:2]
        # Validate explicitly: int(x, 16) alone would also accept a single
        # digit, a sign or surrounding whitespace (e.g. '%1' or '%+1').
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pieces[i] = bytes([int(pair, 16)]) + item[2:]
        else:
            pieces[i] = b'%' + item
    return b''.join(pieces)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	290
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a
    placeholder character.  Passing None for either selects its default.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the output unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    # Byte values of the current run of adjacent %xx escapes.  Decoding is
    # delayed until the run ends because one character may span several
    # escapes (e.g. a multi-byte UTF-8 sequence).
    pending = []
    out = [pieces[0]]
    for item in pieces[1:]:
        pair = item[:2]
        # Validate explicitly: bytes.fromhex() skips whitespace and int()
        # accepts signs and non-ASCII digits, so neither is a strict check.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pending.append(int(pair, 16))
            rest = item[2:]
        else:
            rest = '%' + item
        if rest:
            # Literal text ends the current run of escapes: flush it first.
            if pending:
                out.append(bytes(pending).decode(encoding, errors))
                pending = []
            out.append(rest)
    if pending:
        # Flush the run that reaches the end of the string.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
				331
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dict mapping each name to a list of
    its values, in order of appearance.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value ignores them as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
				357
				358	def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
				359	"""Parse a query given as a string argument.
				360
				361	Arguments:
				362
				363	qs: URL-encoded query string to be parsed
				364
				365	keep_blank_values: flag indicating whether blank values in
				366	URL encoded queries should be treated as blank strings. A
				367	true value indicates that blanks should be retained as blank
				368	strings. The default false value indicates that blank values
				369	are to be ignored and treated as if they were not included.
				370
				371	strict_parsing: flag indicating what to do with parsing errors. If
				372	false (the default), errors are silently ignored. If true,
				373	errors raise a ValueError exception.
				374
				375	Returns a list, as G-d intended.
				376	"""
				377	pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
				378	r = []
				379	for name_value in pairs:
				380	if not name_value and not strict_parsing:
				381	continue
				382	nv = name_value.split('=', 1)
				383	if len(nv) != 2:
				384	if strict_parsing:
				385	raise ValueError("bad query field: %r" % (name_value,))
				386	# Handle case of a control-name with no equal sign
				387	if keep_blank_values:
				388	nv.append('')
				389	else:
				390	continue
				391	if len(nv[1]) or keep_blank_values:
				392	name = unquote(nv[0].replace('+', ' '))
				393	value = unquote(nv[1].replace('+', ' '))
				394	r.append((name, value))
				395
				396	return r
				397
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first replaced by spaces, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				406
				407	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				408	b'abcdefghijklmnopqrstuvwxyz'
				409	b'0123456789'
				410	b'_.-')
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	411	_safe_quoters= {}
				412
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	413	class Quoter(collections.defaultdict):
				414	"""A mapping from bytes (in range(0,256)) to strings.
				415
				416	String values are percent-encoded byte values, unless the key < 128, and
				417	in the "safe" set (either the specified safe set, or default set).
				418	"""
				419	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				420	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	421	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	422	"""safe: bytes object."""
				423	self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	424
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	425	def __repr__(self):
				426	# Without this, will just display as a defaultdict
				427	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	428
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	429	def __missing__(self, b):
				430	# Handle a cache miss. Store quoted string in cache and return.
				431	res = b in self.safe and chr(b) or ('%%%02X' % b)
				432	self[b] = res
				433	return res
				434
				435	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	436	"""quote('abc def') -> 'abc%20def'
				437
				438	Each part of a URL, e.g. the path info, the query, etc., has a
				439	different set of reserved characters that must be quoted.
				440
				441	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				442	the following reserved characters.
				443
				444	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				445	"$" \| ","
				446
				447	Each of these characters is reserved in some component of a URL,
				448	but not necessarily in all of them.
				449
				450	By default, the quote function is intended for quoting the path
				451	section of a URL. Thus, it will not encode '/'. This character
				452	is reserved, but in typical usage the quote function is being
				453	called on a path where the existing slash characters are used as
				454	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	455
				456	string and safe may be either str or bytes objects. encoding must
				457	not be specified if string is a str.
				458
				459	The optional encoding and errors parameters specify how to deal with
				460	non-ASCII characters, as accepted by the str.encode method.
				461	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				462	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	463	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	464	if isinstance(string, str):
				465	if encoding is None:
				466	encoding = 'utf-8'
				467	if errors is None:
				468	errors = 'strict'
				469	string = string.encode(encoding, errors)
				470	else:
				471	if encoding is not None:
				472	raise TypeError("quote() doesn't support 'encoding' for bytes")
				473	if errors is not None:
				474	raise TypeError("quote() doesn't support 'errors' for bytes")
				475	return quote_from_bytes(string, safe)
				476
				477	def quote_plus(string, safe='', encoding=None, errors=None):
				478	"""Like quote(), but also replace ' ' with '+', as required for quoting
				479	HTML form values. Plus signs in the original string are escaped unless
				480	they are included in safe. It also does not have safe default to '/'.
				481	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	482	# Check if ' ' in string, where string may either be a str or bytes. If
				483	# there are no spaces, the regular quote will produce the right answer.
				484	if ((isinstance(string, str) and ' ' not in string) or
				485	(isinstance(string, bytes) and b' ' not in string)):
				486	return quote(string, safe, encoding, errors)
				487	if isinstance(safe, str):
				488	space = ' '
				489	else:
				490	space = b' '
				491	string = quote(string, safe + space)
				492	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	493
				494	def quote_from_bytes(bs, safe='/'):
				495	"""Like quote(), but accepts a bytes object rather than a str, and does
				496	not perform string-to-bytes encoding. It always returns an ASCII string.
				497	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				498	"""
				499	if isinstance(safe, str):
				500	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				501	safe = safe.encode('ascii', 'ignore')
				502	cachekey = bytes(safe) # In case it was a bytearray
				503	if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
				504	raise TypeError("quote_from_bytes() expected a bytes")
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	505	try:
				506	quoter = _safe_quoters[cachekey]
				507	except KeyError:
				508	quoter = Quoter(safe)
				509	_safe_quoters[cachekey] = quoter
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	510	return ''.join([quoter[char] for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	511
def urlencode(query, doseq=0):
    """Encode a mapping or a sequence of two-element tuples as a URL
    query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError when query is neither a mapping nor a non-string
    sequence of tuples.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequences fail in len(); a non-empty string fails the
            # tuple check on its first element.  Empty sequences of any
            # type (empty strings included) pass, like empty dicts do.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    for key, value in query:
        key = quote_plus(str(key))
        if not doseq:
            pairs.append(key + '=' + quote_plus(str(value)))
        elif isinstance(value, str):
            pairs.append(key + '=' + quote_plus(value))
        else:
            try:
                # Anything with a length is treated as a sequence.
                len(value)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_plus(str(value)))
            else:
                # One 'key=element' parameter per sequence element.
                for element in value:
                    pairs.append(key + '=' + quote_plus(str(element)))
    return '&'.join(pairs)
				567
				568	# Utilities to parse URLs (most of these return None for missing parts):
				569	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				570	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				571	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				572	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				573	# splitpasswd('user:passwd') -> 'user', 'passwd'
				574	# splitport('host:port') --> 'host', 'port'
				575	# splitquery('/path?query') --> '/path', 'query'
				576	# splittag('/path#tag') --> '/path', 'tag'
				577	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				578	# '/path', ['attr1=value1', 'attr2=value2', ...]
				579	# splitvalue('attr=value') --> 'attr', 'value'
				580	# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
				582
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; UnicodeError
    is raised otherwise.  Any other object is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				595
				596	def unwrap(url):
				597	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				598	url = str(url).strip()
				599	if url[:1] == '<' and url[-1:] == '>':
				600	url = url[1:-1].strip()
				601	if url[:4] == 'URL:': url = url[4:].strip()
				602	return url
				603
				604	_typeprog = None
				605	def splittype(url):
				606	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				607	global _typeprog
				608	if _typeprog is None:
				609	import re
				610	_typeprog = re.compile('^([^/:]+):')
				611
				612	match = _typeprog.match(url)
				613	if match:
				614	scheme = match.group(1)
				615	return scheme.lower(), url[len(scheme) + 1:]
				616	return None, url
				617
				618	_hostprog = None
				619	def splithost(url):
				620	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				621	global _hostprog
				622	if _hostprog is None:
				623	import re
				624	_hostprog = re.compile('^//([^/?])(.)$')
				625
				626	match = _hostprog.match(url)
				627	if match: return match.group(1, 2)
				628	return None, url
				629
				630	_userprog = None
				631	def splituser(host):
				632	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				633	global _userprog
				634	if _userprog is None:
				635	import re
				636	_userprog = re.compile('^(.)@(.)$')
				637
				638	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	639	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	640	return None, host
				641
				642	_passwdprog = None
				643	def splitpasswd(user):
				644	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				645	global _passwdprog
				646	if _passwdprog is None:
				647	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	648	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	649
				650	match = _passwdprog.match(user)
				651	if match: return match.group(1, 2)
				652	return user, None
				653
				654	# splittag('/path#tag') --> '/path', 'tag'
				655	_portprog = None
				656	def splitport(host):
				657	"""splitport('host:port') --> 'host', 'port'."""
				658	global _portprog
				659	if _portprog is None:
				660	import re
				661	_portprog = re.compile('^(.*):([0-9]+)$')
				662
				663	match = _portprog.match(host)
				664	if match: return match.group(1, 2)
				665	return host, None
				666
				667	_nportprog = None
				668	def splitnport(host, defport=-1):
				669	"""Split host and port, returning numeric port.
				670	Return given default port if no ':' found; defaults to -1.
				671	Return numerical port if a valid number are found after ':'.
				672	Return None if ':' but not a valid number."""
				673	global _nportprog
				674	if _nportprog is None:
				675	import re
				676	_nportprog = re.compile('^(.):(.)$')
				677
				678	match = _nportprog.match(host)
				679	if match:
				680	host, port = match.group(1, 2)
				681	try:
				682	if not port: raise ValueError("no digits")
				683	nport = int(port)
				684	except ValueError:
				685	nport = None
				686	return host, nport
				687	return host, defport
				688
				689	_queryprog = None
				690	def splitquery(url):
				691	"""splitquery('/path?query') --> '/path', 'query'."""
				692	global _queryprog
				693	if _queryprog is None:
				694	import re
				695	_queryprog = re.compile('^(.)\?([^?])$')
				696
				697	match = _queryprog.match(url)
				698	if match: return match.group(1, 2)
				699	return url, None
				700
				701	_tagprog = None
				702	def splittag(url):
				703	"""splittag('/path#tag') --> '/path', 'tag'."""
				704	global _tagprog
				705	if _tagprog is None:
				706	import re
				707	_tagprog = re.compile('^(.)#([^#])$')
				708
				709	match = _tagprog.match(url)
				710	if match: return match.group(1, 2)
				711	return url, None
				712
				713	def splitattr(url):
				714	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				715	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				716	words = url.split(';')
				717	return words[0], words[1:]
				718
				719	_valueprog = None
				720	def splitvalue(attr):
				721	"""splitvalue('attr=value') --> 'attr', 'value'."""
				722	global _valueprog
				723	if _valueprog is None:
				724	import re
				725	_valueprog = re.compile('^([^=])=(.)$')
				726
				727	match = _valueprog.match(attr)
				728	if match: return match.group(1, 2)
				729	return attr, None
				730
# Default input for test().  The first non-blank line becomes the base URL;
# each following line reads "<reference> = <URL:expected>", where the
# expected value is what urljoin(base, reference) should produce.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
				764
				765	def test():
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	766	base = ''
				767	if sys.argv[1:]:
				768	fn = sys.argv[1]
				769	if fn == '-':
				770	fp = sys.stdin
				771	else:
				772	fp = open(fn)
				773	else:
				774	from io import StringIO
				775	fp = StringIO(test_input)
				776	for line in fp:
				777	words = line.split()
				778	if not words:
				779	continue
				780	url = words[0]
				781	parts = urlparse(url)
				782	print('%-10s : %s' % (url, parts))
				783	abs = urljoin(base, url)
				784	if not base:
				785	base = abs
				786	wrapped = '<URL:%s>' % abs
				787	print('%-10s = %s' % (url, wrapped))
				788	if len(words) == 3 and words[1] == '=':
				789	if wrapped != words[2]:
				790	print('EXPECTED', words[2], '!!!!!!!!!!')
				791
# Run the urlparse()/urljoin() self-test when executed as a script.
if __name__ == '__main__':
    test()