Blame - Lib/urllib/parse.py - platform/external/python/cpython2

blob: 1affc6930d9298f5bf0b32fea66f922998a199d6 [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""Parse (absolute and relative) URLs.
				2
				3	See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
				4	UC Irvine, June 1995.
				5	"""
				6
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	7	import sys
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	8	import collections
Facundo Batista	2ac5de2	2008-07-07 18:24:11 +0000	[diff] [blame]	9
# Names exported by ``from urllib.parse import *``.  urlencode() is a public,
# documented function of this module and is therefore listed as well.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl",
           "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	14
				15	# A classification of schemes ('' means apply by default)
				16	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				17	'wais', 'file', 'https', 'shttp', 'mms',
				18	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				19	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				20	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				21	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	22	'svn', 'svn+ssh', 'sftp','nfs']
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	23	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				24	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				25	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				26	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				27	'mms', '', 'sftp']
				28	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				29	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				30	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				31	'nntp', 'wais', 'https', 'shttp', 'snews',
				32	'file', 'prospero', '']
				33
				34	# Characters valid in scheme names
				35	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				36	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				37	'0123456789'
				38	'+-.')
				39
				40	MAX_CACHE_SIZE = 20
				41	_parse_cache = {}
				42
				43	def clear_cache():
				44	"""Clear the parse cache."""
				45	_parse_cache.clear()
				46
				47
				48	class ResultMixin(object):
				49	"""Shared methods for the parsed result objects."""
				50
				51	@property
				52	def username(self):
				53	netloc = self.netloc
				54	if "@" in netloc:
				55	userinfo = netloc.rsplit("@", 1)[0]
				56	if ":" in userinfo:
				57	userinfo = userinfo.split(":", 1)[0]
				58	return userinfo
				59	return None
				60
				61	@property
				62	def password(self):
				63	netloc = self.netloc
				64	if "@" in netloc:
				65	userinfo = netloc.rsplit("@", 1)[0]
				66	if ":" in userinfo:
				67	return userinfo.split(":", 1)[1]
				68	return None
				69
				70	@property
				71	def hostname(self):
Senthil Kumaran	2176ad5	2010-04-16 03:06:19 +0000	[diff] [blame^]	72	netloc = self.netloc.split('@')[-1]
				73	if '[' in netloc and ']' in netloc:
				74	return netloc.split(']')[0][1:].lower()
				75	elif '[' in netloc or ']' in netloc:
				76	raise ValueError("Invalid IPv6 hostname")
				77	elif ':' in netloc:
				78	return netloc.split(':')[0].lower()
				79	elif netloc == '':
				80	return None
				81	else:
				82	return netloc.lower()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	83
				84	@property
				85	def port(self):
Senthil Kumaran	2176ad5	2010-04-16 03:06:19 +0000	[diff] [blame^]	86	netloc = self.netloc.split('@')[-1].split(']')[-1]
				87	if ':' in netloc:
				88	port = netloc.split(':')[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	89	return int(port, 10)
Senthil Kumaran	2176ad5	2010-04-16 03:06:19 +0000	[diff] [blame^]	90	else:
				91	return None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	92
				93	from collections import namedtuple
				94
				95	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				96
				97	__slots__ = ()
				98
				99	def geturl(self):
				100	return urlunsplit(self)
				101
				102
				103	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				104
				105	__slots__ = ()
				106
				107	def geturl(self):
				108	return urlunparse(self)
				109
				110
				111	def urlparse(url, scheme='', allow_fragments=True):
				112	"""Parse a URL into 6 components:
				113	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				114	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				115	Note that we don't break the components up in smaller bits
				116	(e.g. netloc is a single string) and we don't expand % escapes."""
				117	tuple = urlsplit(url, scheme, allow_fragments)
				118	scheme, netloc, url, query, fragment = tuple
				119	if scheme in uses_params and ';' in url:
				120	url, params = _splitparams(url)
				121	else:
				122	params = ''
				123	return ParseResult(scheme, netloc, url, params, query, fragment)
				124
				125	def _splitparams(url):
				126	if '/' in url:
				127	i = url.find(';', url.rfind('/'))
				128	if i < 0:
				129	return url, ''
				130	else:
				131	i = url.find(';')
				132	return url[:i], url[i+1:]
				133
				134	def _splitnetloc(url, start=0):
				135	delim = len(url) # position of end of domain part of url, default is end
Senthil Kumaran	2176ad5	2010-04-16 03:06:19 +0000	[diff] [blame^]	136	if '[' in url: # check for invalid IPv6 URL
				137	if not ']' in url: raise ValueError("Invalid IPv6 URL")
				138	elif ']' in url:
				139	if not '[' in url: raise ValueError("Invalid IPv6 URL")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	140	for c in '/?#': # look for delimiters; the order is NOT important
				141	wdelim = url.find(c, start) # find first of this delim
				142	if wdelim >= 0: # if found
				143	delim = min(delim, wdelim) # use earliest delim position
				144	return url[start:delim], url[delim:] # return (domain, rest)
				145
				146	def urlsplit(url, scheme='', allow_fragments=True):
				147	"""Parse a URL into 5 components:
				148	<scheme>://<netloc>/<path>?<query>#<fragment>
				149	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				150	Note that we don't break the components up in smaller bits
				151	(e.g. netloc is a single string) and we don't expand % escapes."""
				152	allow_fragments = bool(allow_fragments)
				153	key = url, scheme, allow_fragments, type(url), type(scheme)
				154	cached = _parse_cache.get(key, None)
				155	if cached:
				156	return cached
				157	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				158	clear_cache()
				159	netloc = query = fragment = ''
				160	i = url.find(':')
				161	if i > 0:
				162	if url[:i] == 'http': # optimize the common case
				163	scheme = url[:i].lower()
				164	url = url[i+1:]
				165	if url[:2] == '//':
				166	netloc, url = _splitnetloc(url, 2)
				167	if allow_fragments and '#' in url:
				168	url, fragment = url.split('#', 1)
				169	if '?' in url:
				170	url, query = url.split('?', 1)
				171	v = SplitResult(scheme, netloc, url, query, fragment)
				172	_parse_cache[key] = v
				173	return v
				174	for c in url[:i]:
				175	if c not in scheme_chars:
				176	break
				177	else:
				178	scheme, url = url[:i].lower(), url[i+1:]
Senthil Kumaran	a8dbb24	2010-02-19 07:45:03 +0000	[diff] [blame]	179	if url[:2] == '//':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	180	netloc, url = _splitnetloc(url, 2)
				181	if allow_fragments and scheme in uses_fragment and '#' in url:
				182	url, fragment = url.split('#', 1)
				183	if scheme in uses_query and '?' in url:
				184	url, query = url.split('?', 1)
				185	v = SplitResult(scheme, netloc, url, query, fragment)
				186	_parse_cache[key] = v
				187	return v
				188
				189	def urlunparse(components):
				190	"""Put a parsed URL back together again. This may result in a
				191	slightly different, but equivalent URL, if the URL that was parsed
				192	originally had redundant delimiters, e.g. a ? with an empty query
				193	(the draft states that these are equivalent)."""
				194	scheme, netloc, url, params, query, fragment = components
				195	if params:
				196	url = "%s;%s" % (url, params)
				197	return urlunsplit((scheme, netloc, url, query, fragment))
				198
				199	def urlunsplit(components):
				200	scheme, netloc, url, query, fragment = components
				201	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				202	if url and url[:1] != '/': url = '/' + url
				203	url = '//' + (netloc or '') + url
				204	if scheme:
				205	url = scheme + ':' + url
				206	if query:
				207	url = url + '?' + query
				208	if fragment:
				209	url = url + '#' + fragment
				210	return url
				211
				212	def urljoin(base, url, allow_fragments=True):
				213	"""Join a base URL and a possibly relative URL to form an absolute
				214	interpretation of the latter."""
				215	if not base:
				216	return url
				217	if not url:
				218	return base
				219	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
				220	urlparse(base, '', allow_fragments)
				221	scheme, netloc, path, params, query, fragment = \
				222	urlparse(url, bscheme, allow_fragments)
				223	if scheme != bscheme or scheme not in uses_relative:
				224	return url
				225	if scheme in uses_netloc:
				226	if netloc:
				227	return urlunparse((scheme, netloc, path,
				228	params, query, fragment))
				229	netloc = bnetloc
				230	if path[:1] == '/':
				231	return urlunparse((scheme, netloc, path,
				232	params, query, fragment))
Facundo Batista	23e3856	2008-08-14 16:55:14 +0000	[diff] [blame]	233	if not path:
				234	path = bpath
				235	if not params:
				236	params = bparams
				237	else:
				238	path = path[:-1]
				239	return urlunparse((scheme, netloc, path,
				240	params, query, fragment))
				241	if not query:
				242	query = bquery
				243	return urlunparse((scheme, netloc, path,
				244	params, query, fragment))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	245	segments = bpath.split('/')[:-1] + path.split('/')
				246	# XXX The stuff below is bogus in various ways...
				247	if segments[-1] == '.':
				248	segments[-1] = ''
				249	while '.' in segments:
				250	segments.remove('.')
				251	while 1:
				252	i = 1
				253	n = len(segments) - 1
				254	while i < n:
				255	if (segments[i] == '..'
				256	and segments[i-1] not in ('', '..')):
				257	del segments[i-1:i+1]
				258	break
				259	i = i+1
				260	else:
				261	break
				262	if segments == ['', '..']:
				263	segments[-1] = ''
				264	elif len(segments) >= 2 and segments[-1] == '..':
				265	segments[-2:] = ['']
				266	return urlunparse((scheme, netloc, '/'.join(segments),
				267	params, query, fragment))
				268
				269	def urldefrag(url):
				270	"""Removes any existing fragment from URL.
				271
				272	Returns a tuple of the defragmented URL and the fragment. If
				273	the URL contained no fragments, the second element is the
				274	empty string.
				275	"""
				276	if '#' in url:
				277	s, n, p, a, q, frag = urlparse(url)
				278	defrag = urlunparse((s, n, p, a, q, ''))
				279	return defrag, frag
				280	else:
				281	return url, ''
				282
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes-like object.
    Only a '%' followed by exactly two hexadecimal digits is decoded; any
    other '%' is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    head, *chunks = string.split(b'%')
    out = [head]
    for chunk in chunks:
        code = chunk[:2]
        # int() alone is too lenient here: it accepts a single digit as well
        # as sign and whitespace prefixes, none of which form a valid escape.
        if len(code) == 2 and all(c in hexdigits for c in code):
            out.append(bytes([int(code, 16)]) + chunk[2:])
        else:
            out.append(b'%' + chunk)
    return b''.join(out)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	298
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is kept
    as-is.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    head, *chunks = string.split('%')
    out = [head]
    # Bytes of the current run of adjacent %xx escapes.  They are decoded
    # together because one character may span several escapes.
    pending = bytearray()
    for chunk in chunks:
        code = chunk[:2]
        if len(code) == 2 and all(ch in hexdigits for ch in code):
            pending.append(int(code, 16))
            rest = chunk[2:]
        else:
            # Not a valid escape: keep the '%' and what follows literally.
            rest = '%' + chunk
        if rest:
            # Ordinary characters end the run; flush it before them.
            out.append(pending.decode(encoding, errors) + rest)
            del pending[:]
    if pending:
        # The string ended inside a run of escapes.
        out.append(pending.decode(encoding, errors))
    return ''.join(out)
				339
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
				365
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value keeps them as blank strings; the default false
        value drops them as if they had not been included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  Returns a list of tuples in the
    order the fields appear.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name without an equal sign only survives when
            # blank values are kept (value is '' in that case).
            if not keep_blank_values:
                continue
        if len(value) or keep_blank_values:
            result.append((unquote(name.replace('+', ' ')),
                           unquote(value.replace('+', ' '))))
    return result
				405
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are turned into spaces first, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
				414
				415	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				416	b'abcdefghijklmnopqrstuvwxyz'
				417	b'0123456789'
				418	b'_.-')
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	419	_safe_quoters= {}
				420
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	421	class Quoter(collections.defaultdict):
				422	"""A mapping from bytes (in range(0,256)) to strings.
				423
				424	String values are percent-encoded byte values, unless the key < 128, and
				425	in the "safe" set (either the specified safe set, or default set).
				426	"""
				427	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				428	# of cached keys don't call Python code at all).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	429	def __init__(self, safe):
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	430	"""safe: bytes object."""
				431	self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	432
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	433	def __repr__(self):
				434	# Without this, will just display as a defaultdict
				435	return "<Quoter %r>" % dict(self)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	436
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	437	def __missing__(self, b):
				438	# Handle a cache miss. Store quoted string in cache and return.
				439	res = b in self.safe and chr(b) or ('%%%02X' % b)
				440	self[b] = res
				441	return res
				442
				443	def quote(string, safe='/', encoding=None, errors=None):
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	444	"""quote('abc def') -> 'abc%20def'
				445
				446	Each part of a URL, e.g. the path info, the query, etc., has a
				447	different set of reserved characters that must be quoted.
				448
				449	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				450	the following reserved characters.
				451
				452	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				453	"$" \| ","
				454
				455	Each of these characters is reserved in some component of a URL,
				456	but not necessarily in all of them.
				457
				458	By default, the quote function is intended for quoting the path
				459	section of a URL. Thus, it will not encode '/'. This character
				460	is reserved, but in typical usage the quote function is being
				461	called on a path where the existing slash characters are used as
				462	reserved characters.
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	463
				464	string and safe may be either str or bytes objects. encoding must
				465	not be specified if string is a str.
				466
				467	The optional encoding and errors parameters specify how to deal with
				468	non-ASCII characters, as accepted by the str.encode method.
				469	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				470	errors='strict' (unsupported characters raise a UnicodeEncodeError).
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	471	"""
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	472	if isinstance(string, str):
				473	if encoding is None:
				474	encoding = 'utf-8'
				475	if errors is None:
				476	errors = 'strict'
				477	string = string.encode(encoding, errors)
				478	else:
				479	if encoding is not None:
				480	raise TypeError("quote() doesn't support 'encoding' for bytes")
				481	if errors is not None:
				482	raise TypeError("quote() doesn't support 'errors' for bytes")
				483	return quote_from_bytes(string, safe)
				484
				485	def quote_plus(string, safe='', encoding=None, errors=None):
				486	"""Like quote(), but also replace ' ' with '+', as required for quoting
				487	HTML form values. Plus signs in the original string are escaped unless
				488	they are included in safe. It also does not have safe default to '/'.
				489	"""
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	490	# Check if ' ' in string, where string may either be a str or bytes. If
				491	# there are no spaces, the regular quote will produce the right answer.
				492	if ((isinstance(string, str) and ' ' not in string) or
				493	(isinstance(string, bytes) and b' ' not in string)):
				494	return quote(string, safe, encoding, errors)
				495	if isinstance(safe, str):
				496	space = ' '
				497	else:
				498	space = b' '
Georg Brandl	faf4149	2009-05-26 18:31:11 +0000	[diff] [blame]	499	string = quote(string, safe + space, encoding, errors)
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	500	return string.replace(' ', '+')
Guido van Rossum	52dbbb9	2008-08-18 21:44:30 +0000	[diff] [blame]	501
				502	def quote_from_bytes(bs, safe='/'):
				503	"""Like quote(), but accepts a bytes object rather than a str, and does
				504	not perform string-to-bytes encoding. It always returns an ASCII string.
				505	quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
				506	"""
				507	if isinstance(safe, str):
				508	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				509	safe = safe.encode('ascii', 'ignore')
				510	cachekey = bytes(safe) # In case it was a bytearray
				511	if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
				512	raise TypeError("quote_from_bytes() expected a bytes")
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	513	try:
				514	quoter = _safe_quoters[cachekey]
				515	except KeyError:
				516	quoter = Quoter(safe)
				517	_safe_quoters[cachekey] = quoter
Jeremy Hylton	f819886	2009-03-26 16:55:08 +0000	[diff] [blame]	518	return ''.join([quoter[char] for char in bs])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	519
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError when query is neither a mapping nor a sequence whose
    first element is a tuple.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, which is why
        # the first element has to be inspected.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple check.  Empty sequences of any type are accepted.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    expand_sequences = bool(doseq)
    encoded = []
    for key, value in query:
        key = quote_plus(str(key))
        if not expand_sequences:
            encoded.append(key + '=' + quote_plus(str(value)))
            continue
        if isinstance(value, str):
            encoded.append(key + '=' + quote_plus(value))
            continue
        try:
            # Having a length is what counts as being a sequence here.
            len(value)
        except TypeError:
            # Not a sequence: encode its string form as a single value.
            encoded.append(key + '=' + quote_plus(str(value)))
        else:
            # One "key=element" parameter per element.
            for element in value:
                encoded.append(key + '=' + quote_plus(str(element)))
    return '&'.join(encoded)
				575
				576	# Utilities to parse URLs (most of these return None for missing parts):
				577	# unwrap('<URL:type://host/path>') --> 'type://host/path'
				578	# splittype('type:opaquestring') --> 'type', 'opaquestring'
				579	# splithost('//host[:port]/path') --> 'host[:port]', '/path'
				580	# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
				581	# splitpasswd('user:passwd') -> 'user', 'passwd'
				582	# splitport('host:port') --> 'host', 'port'
				583	# splitquery('/path?query') --> '/path', 'query'
				584	# splittag('/path#tag') --> '/path', 'tag'
				585	# splitattr('/path;attr1=value1;attr2=value2;...') ->
				586	# '/path', ['attr1=value1', 'attr2=value2', ...]
				587	# splitvalue('attr=value') --> 'attr', 'value'
				588	# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
				590
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is returned unchanged if it is pure ASCII and raises UnicodeError
    otherwise.  Any other object is returned as-is.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
				603
				604	def unwrap(url):
				605	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				606	url = str(url).strip()
				607	if url[:1] == '<' and url[-1:] == '>':
				608	url = url[1:-1].strip()
				609	if url[:4] == 'URL:': url = url[4:].strip()
				610	return url
				611
				612	_typeprog = None
				613	def splittype(url):
				614	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				615	global _typeprog
				616	if _typeprog is None:
				617	import re
				618	_typeprog = re.compile('^([^/:]+):')
				619
				620	match = _typeprog.match(url)
				621	if match:
				622	scheme = match.group(1)
				623	return scheme.lower(), url[len(scheme) + 1:]
				624	return None, url
				625
				626	_hostprog = None
				627	def splithost(url):
				628	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				629	global _hostprog
				630	if _hostprog is None:
				631	import re
				632	_hostprog = re.compile('^//([^/?])(.)$')
				633
				634	match = _hostprog.match(url)
				635	if match: return match.group(1, 2)
				636	return None, url
				637
				638	_userprog = None
				639	def splituser(host):
				640	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				641	global _userprog
				642	if _userprog is None:
				643	import re
				644	_userprog = re.compile('^(.)@(.)$')
				645
				646	match = _userprog.match(host)
Guido van Rossum	df9f1ec	2008-08-06 19:31:34 +0000	[diff] [blame]	647	if match: return map(unquote, match.group(1, 2))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	648	return None, host
				649
				650	_passwdprog = None
				651	def splitpasswd(user):
				652	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				653	global _passwdprog
				654	if _passwdprog is None:
				655	import re
Senthil Kumaran	eaaec27	2009-03-30 21:54:41 +0000	[diff] [blame]	656	_passwdprog = re.compile('^([^:]):(.)$',re.S)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	657
				658	match = _passwdprog.match(user)
				659	if match: return match.group(1, 2)
				660	return user, None
				661
				662	# splittag('/path#tag') --> '/path', 'tag'
				663	_portprog = None
				664	def splitport(host):
				665	"""splitport('host:port') --> 'host', 'port'."""
				666	global _portprog
				667	if _portprog is None:
				668	import re
				669	_portprog = re.compile('^(.*):([0-9]+)$')
				670
				671	match = _portprog.match(host)
				672	if match: return match.group(1, 2)
				673	return host, None
				674
				675	_nportprog = None
				676	def splitnport(host, defport=-1):
				677	"""Split host and port, returning numeric port.
				678	Return given default port if no ':' found; defaults to -1.
				679	Return numerical port if a valid number are found after ':'.
				680	Return None if ':' but not a valid number."""
				681	global _nportprog
				682	if _nportprog is None:
				683	import re
				684	_nportprog = re.compile('^(.):(.)$')
				685
				686	match = _nportprog.match(host)
				687	if match:
				688	host, port = match.group(1, 2)
				689	try:
				690	if not port: raise ValueError("no digits")
				691	nport = int(port)
				692	except ValueError:
				693	nport = None
				694	return host, nport
				695	return host, defport
				696
				697	_queryprog = None
				698	def splitquery(url):
				699	"""splitquery('/path?query') --> '/path', 'query'."""
				700	global _queryprog
				701	if _queryprog is None:
				702	import re
				703	_queryprog = re.compile('^(.)\?([^?])$')
				704
				705	match = _queryprog.match(url)
				706	if match: return match.group(1, 2)
				707	return url, None
				708
				709	_tagprog = None
				710	def splittag(url):
				711	"""splittag('/path#tag') --> '/path', 'tag'."""
				712	global _tagprog
				713	if _tagprog is None:
				714	import re
				715	_tagprog = re.compile('^(.)#([^#])$')
				716
				717	match = _tagprog.match(url)
				718	if match: return match.group(1, 2)
				719	return url, None
				720
				721	def splitattr(url):
				722	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				723	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				724	words = url.split(';')
				725	return words[0], words[1:]
				726
				727	_valueprog = None
				728	def splitvalue(attr):
				729	"""splitvalue('attr=value') --> 'attr', 'value'."""
				730	global _valueprog
				731	if _valueprog is None:
				732	import re
				733	_valueprog = re.compile('^([^=])=(.)$')
				734
				735	match = _valueprog.match(attr)
				736	if match: return match.group(1, 2)
				737	return attr, None
				738
				739	test_input = """
				740	http://a/b/c/d
				741
				742	g:h = <URL:g:h>
				743	http:g = <URL:http://a/b/c/g>
				744	http: = <URL:http://a/b/c/d>
				745	g = <URL:http://a/b/c/g>
				746	./g = <URL:http://a/b/c/g>
				747	g/ = <URL:http://a/b/c/g/>
				748	/g = <URL:http://a/g>
				749	//g = <URL:http://g>
				750	?y = <URL:http://a/b/c/d?y>
				751	g?y = <URL:http://a/b/c/g?y>
				752	g?y/./x = <URL:http://a/b/c/g?y/./x>
				753	. = <URL:http://a/b/c/>
				754	./ = <URL:http://a/b/c/>
				755	.. = <URL:http://a/b/>
				756	../ = <URL:http://a/b/>
				757	../g = <URL:http://a/b/g>
				758	../.. = <URL:http://a/>
				759	../../g = <URL:http://a/g>
				760	../../../g = <URL:http://a/../g>
				761	./../g = <URL:http://a/b/g>
				762	./g/. = <URL:http://a/b/c/g/>
				763	/./g = <URL:http://a/./g>
				764	g/./h = <URL:http://a/b/c/g/h>
				765	g/../h = <URL:http://a/b/c/h>
				766	http:g = <URL:http://a/b/c/g>
				767	http: = <URL:http://a/b/c/d>
				768	http:?y = <URL:http://a/b/c/d?y>
				769	http:g?y = <URL:http://a/b/c/g?y>
				770	http:g?y/./x = <URL:http://a/b/c/g?y/./x>
				771	"""
				772
				773	def test():
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	774	base = ''
				775	if sys.argv[1:]:
				776	fn = sys.argv[1]
				777	if fn == '-':
				778	fp = sys.stdin
				779	else:
				780	fp = open(fn)
				781	else:
				782	from io import StringIO
				783	fp = StringIO(test_input)
				784	for line in fp:
				785	words = line.split()
				786	if not words:
				787	continue
				788	url = words[0]
				789	parts = urlparse(url)
				790	print('%-10s : %s' % (url, parts))
				791	abs = urljoin(base, url)
				792	if not base:
				793	base = abs
				794	wrapped = '<URL:%s>' % abs
				795	print('%-10s = %s' % (url, wrapped))
				796	if len(words) == 3 and words[1] == '=':
				797	if wrapped != words[2]:
				798	print('EXPECTED', words[2], '!!!!!!!!!!')
				799
				800	if __name__ == '__main__':
				801	test()