Blame - Lib/urllib/parse.py - platform/external/python/cpython3

blob: 71cc36908da3e308ab25d8edb3377c36f2daec8e [file] [log] [blame]

"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
				6
				7	__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
				8	"urlsplit", "urlunsplit"]
				9
				10	# A classification of schemes ('' means apply by default)
				11	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
				12	'wais', 'file', 'https', 'shttp', 'mms',
				13	'prospero', 'rtsp', 'rtspu', '', 'sftp']
				14	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
				15	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
				16	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
				17	'svn', 'svn+ssh', 'sftp']
				18	non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
				19	'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
				20	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
				21	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
				22	'mms', '', 'sftp']
				23	uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
				24	'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
				25	uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
				26	'nntp', 'wais', 'https', 'shttp', 'snews',
				27	'file', 'prospero', '']
				28
				29	# Characters valid in scheme names
				30	scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
				31	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				32	'0123456789'
				33	'+-.')
				34
				35	MAX_CACHE_SIZE = 20
				36	_parse_cache = {}
				37
				38	def clear_cache():
				39	"""Clear the parse cache."""
				40	_parse_cache.clear()
				41
				42
				43	class ResultMixin(object):
				44	"""Shared methods for the parsed result objects."""
				45
				46	@property
				47	def username(self):
				48	netloc = self.netloc
				49	if "@" in netloc:
				50	userinfo = netloc.rsplit("@", 1)[0]
				51	if ":" in userinfo:
				52	userinfo = userinfo.split(":", 1)[0]
				53	return userinfo
				54	return None
				55
				56	@property
				57	def password(self):
				58	netloc = self.netloc
				59	if "@" in netloc:
				60	userinfo = netloc.rsplit("@", 1)[0]
				61	if ":" in userinfo:
				62	return userinfo.split(":", 1)[1]
				63	return None
				64
				65	@property
				66	def hostname(self):
				67	netloc = self.netloc
				68	if "@" in netloc:
				69	netloc = netloc.rsplit("@", 1)[1]
				70	if ":" in netloc:
				71	netloc = netloc.split(":", 1)[0]
				72	return netloc.lower() or None
				73
				74	@property
				75	def port(self):
				76	netloc = self.netloc
				77	if "@" in netloc:
				78	netloc = netloc.rsplit("@", 1)[1]
				79	if ":" in netloc:
				80	port = netloc.split(":", 1)[1]
				81	return int(port, 10)
				82	return None
				83
from collections import namedtuple
				85
				86	class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
				87
				88	__slots__ = ()
				89
				90	def geturl(self):
				91	return urlunsplit(self)
				92
				93
				94	class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
				95
				96	__slots__ = ()
				97
				98	def geturl(self):
				99	return urlunparse(self)
				100
				101
				102	def urlparse(url, scheme='', allow_fragments=True):
				103	"""Parse a URL into 6 components:
				104	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				105	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
				106	Note that we don't break the components up in smaller bits
				107	(e.g. netloc is a single string) and we don't expand % escapes."""
				108	tuple = urlsplit(url, scheme, allow_fragments)
				109	scheme, netloc, url, query, fragment = tuple
				110	if scheme in uses_params and ';' in url:
				111	url, params = _splitparams(url)
				112	else:
				113	params = ''
				114	return ParseResult(scheme, netloc, url, params, query, fragment)
				115
				116	def _splitparams(url):
				117	if '/' in url:
				118	i = url.find(';', url.rfind('/'))
				119	if i < 0:
				120	return url, ''
				121	else:
				122	i = url.find(';')
				123	return url[:i], url[i+1:]
				124
				125	def _splitnetloc(url, start=0):
				126	delim = len(url) # position of end of domain part of url, default is end
				127	for c in '/?#': # look for delimiters; the order is NOT important
				128	wdelim = url.find(c, start) # find first of this delim
				129	if wdelim >= 0: # if found
				130	delim = min(delim, wdelim) # use earliest delim position
				131	return url[start:delim], url[delim:] # return (domain, rest)
				132
				133	def urlsplit(url, scheme='', allow_fragments=True):
				134	"""Parse a URL into 5 components:
				135	<scheme>://<netloc>/<path>?<query>#<fragment>
				136	Return a 5-tuple: (scheme, netloc, path, query, fragment).
				137	Note that we don't break the components up in smaller bits
				138	(e.g. netloc is a single string) and we don't expand % escapes."""
				139	allow_fragments = bool(allow_fragments)
				140	key = url, scheme, allow_fragments, type(url), type(scheme)
				141	cached = _parse_cache.get(key, None)
				142	if cached:
				143	return cached
				144	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				145	clear_cache()
				146	netloc = query = fragment = ''
				147	i = url.find(':')
				148	if i > 0:
				149	if url[:i] == 'http': # optimize the common case
				150	scheme = url[:i].lower()
				151	url = url[i+1:]
				152	if url[:2] == '//':
				153	netloc, url = _splitnetloc(url, 2)
				154	if allow_fragments and '#' in url:
				155	url, fragment = url.split('#', 1)
				156	if '?' in url:
				157	url, query = url.split('?', 1)
				158	v = SplitResult(scheme, netloc, url, query, fragment)
				159	_parse_cache[key] = v
				160	return v
				161	for c in url[:i]:
				162	if c not in scheme_chars:
				163	break
				164	else:
				165	scheme, url = url[:i].lower(), url[i+1:]
				166	if scheme in uses_netloc and url[:2] == '//':
				167	netloc, url = _splitnetloc(url, 2)
				168	if allow_fragments and scheme in uses_fragment and '#' in url:
				169	url, fragment = url.split('#', 1)
				170	if scheme in uses_query and '?' in url:
				171	url, query = url.split('?', 1)
				172	v = SplitResult(scheme, netloc, url, query, fragment)
				173	_parse_cache[key] = v
				174	return v
				175
				176	def urlunparse(components):
				177	"""Put a parsed URL back together again. This may result in a
				178	slightly different, but equivalent URL, if the URL that was parsed
				179	originally had redundant delimiters, e.g. a ? with an empty query
				180	(the draft states that these are equivalent)."""
				181	scheme, netloc, url, params, query, fragment = components
				182	if params:
				183	url = "%s;%s" % (url, params)
				184	return urlunsplit((scheme, netloc, url, query, fragment))
				185
				186	def urlunsplit(components):
				187	scheme, netloc, url, query, fragment = components
				188	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				189	if url and url[:1] != '/': url = '/' + url
				190	url = '//' + (netloc or '') + url
				191	if scheme:
				192	url = scheme + ':' + url
				193	if query:
				194	url = url + '?' + query
				195	if fragment:
				196	url = url + '#' + fragment
				197	return url
				198
				199	def urljoin(base, url, allow_fragments=True):
				200	"""Join a base URL and a possibly relative URL to form an absolute
				201	interpretation of the latter."""
				202	if not base:
				203	return url
				204	if not url:
				205	return base
				206	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
				207	urlparse(base, '', allow_fragments)
				208	scheme, netloc, path, params, query, fragment = \
				209	urlparse(url, bscheme, allow_fragments)
				210	if scheme != bscheme or scheme not in uses_relative:
				211	return url
				212	if scheme in uses_netloc:
				213	if netloc:
				214	return urlunparse((scheme, netloc, path,
				215	params, query, fragment))
				216	netloc = bnetloc
				217	if path[:1] == '/':
				218	return urlunparse((scheme, netloc, path,
				219	params, query, fragment))
				220	if not (path or params or query):
				221	return urlunparse((scheme, netloc, bpath,
				222	bparams, bquery, fragment))
				223	segments = bpath.split('/')[:-1] + path.split('/')
				224	# XXX The stuff below is bogus in various ways...
				225	if segments[-1] == '.':
				226	segments[-1] = ''
				227	while '.' in segments:
				228	segments.remove('.')
				229	while 1:
				230	i = 1
				231	n = len(segments) - 1
				232	while i < n:
				233	if (segments[i] == '..'
				234	and segments[i-1] not in ('', '..')):
				235	del segments[i-1:i+1]
				236	break
				237	i = i+1
				238	else:
				239	break
				240	if segments == ['', '..']:
				241	segments[-1] = ''
				242	elif len(segments) >= 2 and segments[-1] == '..':
				243	segments[-2:] = ['']
				244	return urlunparse((scheme, netloc, '/'.join(segments),
				245	params, query, fragment))
				246
				247	def urldefrag(url):
				248	"""Removes any existing fragment from URL.
				249
				250	Returns a tuple of the defragmented URL and the fragment. If
				251	the URL contained no fragments, the second element is the
				252	empty string.
				253	"""
				254	if '#' in url:
				255	s, n, p, a, q, frag = urlparse(url)
				256	defrag = urlunparse((s, n, p, a, q, ''))
				257	return defrag, frag
				258	else:
				259	return url, ''
				260
				261
				262	_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
				263	_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
				264
				265	def unquote(s):
				266	"""unquote('abc%20def') -> 'abc def'."""
				267	res = s.split('%')
				268	for i in range(1, len(res)):
				269	item = res[i]
				270	try:
				271	res[i] = _hextochr[item[:2]] + item[2:]
				272	except KeyError:
				273	res[i] = '%' + item
				274	except UnicodeDecodeError:
				275	res[i] = chr(int(item[:2], 16)) + item[2:]
				276	return "".join(res)
				277
				278	def unquote_plus(s):
				279	"""unquote('%7e/abc+def') -> '~/abc def'"""
				280	s = s.replace('+', ' ')
				281	return unquote(s)
				282
				283	always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				284	'abcdefghijklmnopqrstuvwxyz'
				285	'0123456789' '_.-')
				286	_safe_quoters= {}
				287
				288	class Quoter:
				289	def __init__(self, safe):
				290	self.cache = {}
				291	self.safe = safe + always_safe
				292
				293	def __call__(self, c):
				294	try:
				295	return self.cache[c]
				296	except KeyError:
				297	if ord(c) < 256:
				298	res = (c in self.safe) and c or ('%%%02X' % ord(c))
				299	self.cache[c] = res
				300	return res
				301	else:
				302	return "".join(['%%%02X' % i for i in c.encode("utf-8")])
				303
				304	def quote(s, safe = '/'):
				305	"""quote('abc def') -> 'abc%20def'
				306
				307	Each part of a URL, e.g. the path info, the query, etc., has a
				308	different set of reserved characters that must be quoted.
				309
				310	RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
				311	the following reserved characters.
				312
				313	reserved = ";" \| "/" \| "?" \| ":" \| "@" \| "&" \| "=" \| "+" \|
				314	"$" \| ","
				315
				316	Each of these characters is reserved in some component of a URL,
				317	but not necessarily in all of them.
				318
				319	By default, the quote function is intended for quoting the path
				320	section of a URL. Thus, it will not encode '/'. This character
				321	is reserved, but in typical usage the quote function is being
				322	called on a path where the existing slash characters are used as
				323	reserved characters.
				324	"""
				325	cachekey = (safe, always_safe)
				326	try:
				327	quoter = _safe_quoters[cachekey]
				328	except KeyError:
				329	quoter = Quoter(safe)
				330	_safe_quoters[cachekey] = quoter
				331	res = map(quoter, s)
				332	return ''.join(res)
				333
				334	def quote_plus(s, safe = ''):
				335	"""Quote the query fragment of a URL; replacing ' ' with '+'"""
				336	if ' ' in s:
				337	s = quote(s, safe + ' ')
				338	return s.replace(' ', '+')
				339	return quote(s, safe)
				340
				341	def urlencode(query,doseq=0):
				342	"""Encode a sequence of two-element tuples or dictionary into a URL query string.
				343
				344	If any values in the query arg are sequences and doseq is true, each
				345	sequence element is converted to a separate parameter.
				346
				347	If the query arg is a sequence of two-element tuples, the order of the
				348	parameters in the output will match the order of parameters in the
				349	input.
				350	"""
				351
				352	if hasattr(query,"items"):
				353	# mapping objects
				354	query = query.items()
				355	else:
				356	# it's a bother at times that strings and string-like objects are
				357	# sequences...
				358	try:
				359	# non-sequence items should not work with len()
				360	# non-empty strings will fail this
				361	if len(query) and not isinstance(query[0], tuple):
				362	raise TypeError
				363	# zero-length sequences of all types will get here and succeed,
				364	# but that's a minor nit - since the original implementation
				365	# allowed empty dicts that type of behavior probably should be
				366	# preserved for consistency
				367	except TypeError:
				368	ty,va,tb = sys.exc_info()
				369	raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb)
				370
				371	l = []
				372	if not doseq:
				373	# preserve old behavior
				374	for k, v in query:
				375	k = quote_plus(str(k))
				376	v = quote_plus(str(v))
				377	l.append(k + '=' + v)
				378	else:
				379	for k, v in query:
				380	k = quote_plus(str(k))
				381	if isinstance(v, str):
				382	v = quote_plus(v)
				383	l.append(k + '=' + v)
				384	elif isinstance(v, str):
				385	# is there a reasonable way to convert to ASCII?
				386	# encode generates a string, but "replace" or "ignore"
				387	# lose information and "strict" can raise UnicodeError
				388	v = quote_plus(v.encode("ASCII","replace"))
				389	l.append(k + '=' + v)
				390	else:
				391	try:
				392	# is this a sufficient test for sequence-ness?
				393	x = len(v)
				394	except TypeError:
				395	# not a sequence
				396	v = quote_plus(str(v))
				397	l.append(k + '=' + v)
				398	else:
				399	# loop over the sequence
				400	for elt in v:
				401	l.append(k + '=' + quote_plus(str(elt)))
				402	return '&'.join(l)
				403
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
				418
				419	def toBytes(url):
				420	"""toBytes(u"URL") --> 'URL'."""
				421	# Most URL schemes require ASCII. If that changes, the conversion
				422	# can be relaxed.
				423	# XXX get rid of toBytes()
				424	if isinstance(url, str):
				425	try:
				426	url = url.encode("ASCII").decode()
				427	except UnicodeError:
				428	raise UnicodeError("URL " + repr(url) +
				429	" contains non-ASCII characters")
				430	return url
				431
				432	def unwrap(url):
				433	"""unwrap('<URL:type://host/path>') --> 'type://host/path'."""
				434	url = str(url).strip()
				435	if url[:1] == '<' and url[-1:] == '>':
				436	url = url[1:-1].strip()
				437	if url[:4] == 'URL:': url = url[4:].strip()
				438	return url
				439
				440	_typeprog = None
				441	def splittype(url):
				442	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				443	global _typeprog
				444	if _typeprog is None:
				445	import re
				446	_typeprog = re.compile('^([^/:]+):')
				447
				448	match = _typeprog.match(url)
				449	if match:
				450	scheme = match.group(1)
				451	return scheme.lower(), url[len(scheme) + 1:]
				452	return None, url
				453
				454	_hostprog = None
				455	def splithost(url):
				456	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				457	global _hostprog
				458	if _hostprog is None:
				459	import re
				460	_hostprog = re.compile('^//([^/?])(.)$')
				461
				462	match = _hostprog.match(url)
				463	if match: return match.group(1, 2)
				464	return None, url
				465
				466	_userprog = None
				467	def splituser(host):
				468	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				469	global _userprog
				470	if _userprog is None:
				471	import re
				472	_userprog = re.compile('^(.)@(.)$')
				473
				474	match = _userprog.match(host)
				475	if match: return map(unquote, match.group(1, 2))
				476	return None, host
				477
				478	_passwdprog = None
				479	def splitpasswd(user):
				480	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				481	global _passwdprog
				482	if _passwdprog is None:
				483	import re
				484	_passwdprog = re.compile('^([^:]):(.)$')
				485
				486	match = _passwdprog.match(user)
				487	if match: return match.group(1, 2)
				488	return user, None
				489
				490	# splittag('/path#tag') --> '/path', 'tag'
				491	_portprog = None
				492	def splitport(host):
				493	"""splitport('host:port') --> 'host', 'port'."""
				494	global _portprog
				495	if _portprog is None:
				496	import re
				497	_portprog = re.compile('^(.*):([0-9]+)$')
				498
				499	match = _portprog.match(host)
				500	if match: return match.group(1, 2)
				501	return host, None
				502
				503	_nportprog = None
				504	def splitnport(host, defport=-1):
				505	"""Split host and port, returning numeric port.
				506	Return given default port if no ':' found; defaults to -1.
				507	Return numerical port if a valid number are found after ':'.
				508	Return None if ':' but not a valid number."""
				509	global _nportprog
				510	if _nportprog is None:
				511	import re
				512	_nportprog = re.compile('^(.):(.)$')
				513
				514	match = _nportprog.match(host)
				515	if match:
				516	host, port = match.group(1, 2)
				517	try:
				518	if not port: raise ValueError("no digits")
				519	nport = int(port)
				520	except ValueError:
				521	nport = None
				522	return host, nport
				523	return host, defport
				524
				525	_queryprog = None
				526	def splitquery(url):
				527	"""splitquery('/path?query') --> '/path', 'query'."""
				528	global _queryprog
				529	if _queryprog is None:
				530	import re
				531	_queryprog = re.compile('^(.)\?([^?])$')
				532
				533	match = _queryprog.match(url)
				534	if match: return match.group(1, 2)
				535	return url, None
				536
				537	_tagprog = None
				538	def splittag(url):
				539	"""splittag('/path#tag') --> '/path', 'tag'."""
				540	global _tagprog
				541	if _tagprog is None:
				542	import re
				543	_tagprog = re.compile('^(.)#([^#])$')
				544
				545	match = _tagprog.match(url)
				546	if match: return match.group(1, 2)
				547	return url, None
				548
				549	def splitattr(url):
				550	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				551	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				552	words = url.split(';')
				553	return words[0], words[1:]
				554
				555	_valueprog = None
				556	def splitvalue(attr):
				557	"""splitvalue('attr=value') --> 'attr', 'value'."""
				558	global _valueprog
				559	if _valueprog is None:
				560	import re
				561	_valueprog = re.compile('^([^=])=(.)$')
				562
				563	match = _valueprog.match(attr)
				564	if match: return match.group(1, 2)
				565	return attr, None
				566
				567	test_input = """
				568	http://a/b/c/d
				569
				570	g:h = <URL:g:h>
				571	http:g = <URL:http://a/b/c/g>
				572	http: = <URL:http://a/b/c/d>
				573	g = <URL:http://a/b/c/g>
				574	./g = <URL:http://a/b/c/g>
				575	g/ = <URL:http://a/b/c/g/>
				576	/g = <URL:http://a/g>
				577	//g = <URL:http://g>
				578	?y = <URL:http://a/b/c/d?y>
				579	g?y = <URL:http://a/b/c/g?y>
				580	g?y/./x = <URL:http://a/b/c/g?y/./x>
				581	. = <URL:http://a/b/c/>
				582	./ = <URL:http://a/b/c/>
				583	.. = <URL:http://a/b/>
				584	../ = <URL:http://a/b/>
				585	../g = <URL:http://a/b/g>
				586	../.. = <URL:http://a/>
				587	../../g = <URL:http://a/g>
				588	../../../g = <URL:http://a/../g>
				589	./../g = <URL:http://a/b/g>
				590	./g/. = <URL:http://a/b/c/g/>
				591	/./g = <URL:http://a/./g>
				592	g/./h = <URL:http://a/b/c/g/h>
				593	g/../h = <URL:http://a/b/c/h>
				594	http:g = <URL:http://a/b/c/g>
				595	http: = <URL:http://a/b/c/d>
				596	http:?y = <URL:http://a/b/c/d?y>
				597	http:g?y = <URL:http://a/b/c/g?y>
				598	http:g?y/./x = <URL:http://a/b/c/g?y/./x>
				599	"""
				600
				601	def test():
				602	import sys
				603	base = ''
				604	if sys.argv[1:]:
				605	fn = sys.argv[1]
				606	if fn == '-':
				607	fp = sys.stdin
				608	else:
				609	fp = open(fn)
				610	else:
				611	from io import StringIO
				612	fp = StringIO(test_input)
				613	for line in fp:
				614	words = line.split()
				615	if not words:
				616	continue
				617	url = words[0]
				618	parts = urlparse(url)
				619	print('%-10s : %s' % (url, parts))
				620	abs = urljoin(base, url)
				621	if not base:
				622	base = abs
				623	wrapped = '<URL:%s>' % abs
				624	print('%-10s = %s' % (url, wrapped))
				625	if len(words) == 3 and words[1] == '=':
				626	if wrapped != words[2]:
				627	print('EXPECTED', words[2], '!!!!!!!!!!')
				628
				629	if __name__ == '__main__':
				630	test()