blob: dc75f8f28cfe8bf0e1cb1b25d814adc7996768d0 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""Parse (absolute and relative) URLs.
2
Senthil Kumaranfd41e082010-04-17 14:44:14 +00003urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
David Malcolmee255682010-12-02 16:41:00 +000014RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000015
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
Benjamin Petersond7c3ed52010-06-27 22:32:30 +000019RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
Senthil Kumaranfd41e082010-04-17 14:44:14 +000020McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000028"""
29
Facundo Batista2ac5de22008-07-07 18:24:11 +000030import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +000031import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +000032
# Names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000037
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the base;
# for any other scheme urljoin() returns the second argument unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes that take a "//netloc" part: consulted by urlunsplit() when
# deciding whether to emit "//" and by urljoin() when inheriting the netloc.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# (urlsplit() only accepts a "prefix:" as a scheme when every character of
# the prefix is in this string).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
66
# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache.  When the cache already
# holds MAX_CACHE_SIZE entries, urlsplit() calls clear_cache() before
# storing a new one, so the cache is emptied wholesale rather than evicted
# entry by entry.
MAX_CACHE_SIZE = 20
_parse_cache = {}
70
def clear_cache():
    """Empty the module-level caches: parsed URLs first, then the quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000075
76
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec and error handler of
# _encode_result() and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
85
def _noop(obj):
    """Return *obj* unchanged (the result coercion used for str inputs)."""
    return obj
88
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Return obj.encode(encoding, errors).

    This is the result coercion handed out by _coerce_args() for non-str
    inputs; *obj* may be a str or any object providing an encode() method.
    """
    return obj.encode(encoding, errors)
92
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, '' or None) become the empty string without
    being decoded.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
96
97def _coerce_args(*args):
98 # Invokes decode if necessary to create str args
99 # and returns the coerced inputs along with
100 # an appropriate result coercion function
101 # - noop for str inputs
102 # - encoding function otherwise
103 str_input = isinstance(args[0], str)
104 for arg in args[1:]:
105 # We special-case the empty string to support the
106 # "scheme=''" default argument to some functions
107 if arg and isinstance(arg, str) != str_input:
108 raise TypeError("Cannot mix str and non-str arguments")
109 if str_input:
110 return args + (_noop,)
111 return _decode_args(args) + (_encode_result,)
112
113# Result objects are more helpful than simple tuples
114class _ResultMixinStr(object):
115 """Standard approach to encoding parsed results from str to bytes"""
116 __slots__ = ()
117
118 def encode(self, encoding='ascii', errors='strict'):
119 return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
120
121
122class _ResultMixinBytes(object):
123 """Standard approach to decoding parsed results from bytes to str"""
124 __slots__ = ()
125
126 def decode(self, encoding='ascii', errors='strict'):
127 return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
128
129
130class _NetlocResultMixinBase(object):
131 """Shared methods for the parsed result objects containing a netloc element"""
132 __slots__ = ()
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000133
134 @property
135 def username(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000136 return self._userinfo[0]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000137
138 @property
139 def password(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000140 return self._userinfo[1]
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000141
142 @property
143 def hostname(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000144 hostname = self._hostinfo[0]
145 if not hostname:
146 hostname = None
147 elif hostname is not None:
148 hostname = hostname.lower()
149 return hostname
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000150
151 @property
152 def port(self):
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000153 port = self._hostinfo[1]
154 if port is not None:
155 port = int(port, 10)
Senthil Kumaran2fc5a502012-05-24 21:56:17 +0800156 # Return None on an illegal port
157 if not ( 0 <= port <= 65535):
158 return None
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000159 return port
160
161
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str flavour of the netloc accessors: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host ("[...]") is returned without its brackets; port
        is None when no ':' introduces one.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = hostinfo.partition('[')
        if open_bracket:
            hostname, _, tail = bracketed.partition(']')
            _, colon, port = tail.partition(':')
        else:
            hostname, colon, port = hostinfo.partition(':')
        return hostname, (port if colon else None)
190
191
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes flavour of the netloc accessors: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host (b"[...]") is returned without its brackets; port
        is None when no b':' introduces one.
        """
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = hostinfo.partition(b'[')
        if open_bracket:
            hostname, _, tail = bracketed.partition(b']')
            _, colon, port = tail.partition(b':')
        else:
            hostname, colon, port = hostinfo.partition(b':')
        return hostname, (port if colon else None)
220
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000221
222from collections import namedtuple
223
# Plain namedtuple bases fixing the field names and order of each result
# type; the str and bytes result classes share the same base.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
232
233# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result of urldefrag(): the fields (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000241
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str result of urlsplit(): five fields plus the netloc accessors."""
    __slots__ = ()
    def geturl(self):
        """Reassemble the fields into a URL by passing self to urlunsplit()."""
        return urlunsplit(self)
246
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str result of urlparse(): six fields plus the netloc accessors."""
    __slots__ = ()
    def geturl(self):
        """Reassemble the fields into a URL by passing self to urlunparse()."""
        return urlunparse(self)
251
Nick Coghlan9fc443c2010-11-30 15:48:08 +0000252# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes counterpart of DefragResult: the fields (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
260
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of SplitResult: five fields plus the netloc accessors."""
    __slots__ = ()
    def geturl(self):
        """Reassemble the fields into a URL by passing self to urlunsplit()."""
        return urlunsplit(self)
265
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of ParseResult: six fields plus the netloc accessors."""
    __slots__ = ()
    def geturl(self):
        """Reassemble the fields into a URL by passing self to urlunparse()."""
        return urlunparse(self)
270
271# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    encode() on a str result looks up ``_encoded_counterpart`` and decode()
    on a bytes result looks up ``_decoded_counterpart``; both are set here.
    """
    for str_cls, bytes_cls in (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    ):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000284
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult (scheme, netloc, path, params, query, fragment),
    encoded back to its bytes counterpart for non-str input.  The split is
    delegated to urlsplit(); params are only separated from the path for
    schemes listed in uses_params.  Components are not broken up further
    (netloc stays a single string) and % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000300
301def _splitparams(url):
302 if '/' in url:
303 i = url.find(';', url.rfind('/'))
304 if i < 0:
305 return url, ''
306 else:
307 i = url.find(';')
308 return url[:i], url[i+1:]
309
310def _splitnetloc(url, start=0):
311 delim = len(url) # position of end of domain part of url, default is end
312 for c in '/?#': # look for delimiters; the order is NOT important
313 wdelim = url.find(c, start) # find first of this delim
314 if wdelim >= 0: # if found
315 delim = min(delim, wdelim) # use earliest delim position
316 return url[start:delim], url[delim:] # return (domain, rest)
317
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult (scheme, netloc, path, query, fragment), encoded
    back to its bytes counterpart for non-str input.  Components are not
    broken up further (netloc stays a single string) and % escapes are not
    expanded.  Results are memoized in the module-level parse cache.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only one
    of '[' and ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return _coerce_result(hit)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # common case: accepted as the scheme without further checks
            scheme = prefix.lower()
            url = url[colon + 1:]
        elif all(ch in scheme_chars for ch in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[colon + 1:]
            if not rest or any(ch not in '0123456789' for ch in rest):
                # not a port number
                scheme, url = prefix.lower(), rest

    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        has_open = '[' in netloc
        has_close = ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000373
def urlunparse(components):
    """Rebuild a URL from the six items (scheme, netloc, path, params,
    query, fragment).

    Non-empty params are re-attached to the path with ';' and the rest is
    delegated to urlunsplit().  The result may differ slightly from the
    URL that was originally parsed while remaining equivalent, e.g. when
    that URL had a '?' followed by an empty query.
    """
    scheme, netloc, path, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        path = "%s;%s" % (path, params)
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000384
def urlunsplit(components):
    """Combine the five items (scheme, netloc, path, query, fragment), as
    returned by urlsplit(), into a complete URL.

    *components* can be any five-item iterable.  The result may differ
    slightly from the URL that was originally parsed while remaining
    equivalent, e.g. when that URL had a '?' followed by an empty query.
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # "//" is written when there is a netloc, or when the scheme normally
    # takes one and the path does not already begin with "//".
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000403
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty, the other one is returned as is.  The
    result has the same type (str or bytes) as the arguments; mixing the
    two raises TypeError (from _coerce_args).
    """
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme when parsing the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # references: the reference is returned untouched.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        # The reference carries its own netloc: only the scheme is inherited.
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc
    # Absolute path: nothing to merge with the base path.
    if path[:1] == '/':
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # Empty path and params: keep the base's, and its query too unless the
    # reference supplies one.
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # Relative path: drop the last segment of the base path and append the
    # reference's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment (keeps the trailing slash);
    # every other '.' segment is dropped.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete the first "<name>/.." pair found among the inner
    # segments (the last segment is not examined here) until a full scan
    # finds none.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in last position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
                                      params, query, fragment)))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000456
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Return a DefragResult (url, fragment), or its bytes counterpart for
    non-str input.  Without a '#', *url* is returned unchanged and the
    fragment is the empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    parsed = urlparse(url)
    fragment = parsed[5]
    # rebuild from the first five components with the fragment blanked
    stripped = urlunparse(parsed[:5] + ('',))
    return _coerce_result(DefragResult(stripped, fragment))
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000472
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str (encoded as UTF-8 first) or a bytes object.  A
    '%' that is not followed by exactly two hexadecimal digits is copied
    to the output unchanged.  An empty argument that has no ``split``
    attribute (e.g. None) raises AttributeError.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    chunks = string.split(b'%')
    if len(chunks) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    out = [chunks[0]]
    for item in chunks[1:]:
        code = item[:2]
        # Validate the digits explicitly: int() alone would also accept a
        # lone digit ('%a'), a sign ('%+1') or padding ('% 1').
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    # join once instead of growing a bytes object inside the loop
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000493
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left
    in the result unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    chunks = string.split('%')
    if len(chunks) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    out = [chunks[0]]
    # pct_sequence: contiguous sequence of percent-encoded bytes, not yet
    # decoded
    pct_sequence = bytearray()
    for item in chunks[1:]:
        code = item[:2]
        # Validate the digits explicitly: bytes.fromhex() skips whitespace,
        # so an escape like '%  ' would otherwise vanish from the output.
        if (len(code) == 2 and code[0] in hexdigits
                and code[1] in hexdigits):
            pct_sequence.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                # (Stored in pct_sequence).
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        out.append(pct_sequence.decode(encoding, errors))
        out.append(rest)
        pct_sequence = bytearray()
    if pct_sequence:
        # Flush the final pct_sequence
        out.append(pct_sequence.decode(encoding, errors))
    # join once instead of growing a str inside the loop
    return ''.join(out)
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000537
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value keeps them as blank strings; the default false
        value drops them as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    The pairs come from parse_qsl(); values of a repeated name are
    collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped
Facundo Batistac469d4c2008-09-03 22:49:01 +0000569
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value keeps them as blank strings; the default false value
        drops them as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Fields are separated by '&' or ';'.  In names and values '+' is turned
    into a space before %xx escapes are expanded.
    """
    qs, _coerce_result = _coerce_args(qs)
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control-name with no equal sign is only kept, with a
                # blank value, when blank values are wanted.
                if not keep_blank_values:
                    continue
            if not (raw_value or keep_blank_values):
                continue
            name = unquote(raw_name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = unquote(raw_value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            value = _coerce_result(value)
            result.append((name, value))
    return result
617
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first replaced by spaces, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
626
# Byte values (ints) that Quoter always leaves unescaped, whatever "safe"
# set the caller passes: ASCII letters, digits and '_', '.', '-'.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same values as a bytes object.
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache emptied by clear_cache().
_safe_quoters = {}
Guido van Rossumdf9f1ec2008-08-06 19:31:34 +0000633
class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte in the safe set (the always-safe characters plus those given to
    the constructor) maps to the corresponding character; any other byte
    maps to its percent-encoded form, e.g. 0x20 -> '%20'.
    """
    # The instance doubles as its own cache: __missing__ stores each
    # computed value, so later lookups of the same key are plain dict hits.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
655
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either one
    together with a non-str string raises TypeError.

    For a str, encoding and errors specify how non-ASCII characters are
    dealt with, as accepted by the str.encode method.  By default,
    encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    The actual quoting is done by quote_from_bytes().
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    handler = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, handler), safe)
699
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # string may be str or bytes; work out whether it contains a space.
    # Without spaces, plain quote() already gives the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)

    # Mark the space as safe (using the same type as 'safe') so quote()
    # leaves it alone, then turn each surviving space into '+'.
    blank = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + blank, encoding, errors)
    return quoted.replace(' ', '+')
Guido van Rossum52dbbb92008-08-18 21:44:30 +0000716
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'

    bs must be a bytes or bytearray object, otherwise TypeError is raised.
    safe may be str or bytes; non-ASCII characters in it are ignored.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: if stripping every safe byte leaves nothing, no byte needs
    # quoting and the input can be decoded as-is.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One quoter callable is cached per distinct 'safe' set.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000738
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of the query arg (keys and values) may each be either a
    string or a bytes type.  For components that are not bytes, the safe,
    encoding and errors parameters are passed on to quote_plus; bytes
    components are quoted with safe only.

    Raises TypeError if query is neither a mapping nor a non-string
    sequence of tuples.
    """

    def quote_component(item):
        # bytes are quoted as they are; anything else is converted with
        # str() first and then encoded using encoding/errors.
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    for k, v in query:
        k = quote_component(k)
        if not doseq or isinstance(v, bytes):
            pairs.append(k + '=' + quote_component(v))
        elif isinstance(v, str):
            # Already text: quote directly rather than treating the
            # string as a sequence of characters.
            pairs.append(k + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_component(v))
            else:
                # one "key=value" pair per element of the sequence
                pairs.extend(k + '=' + quote_component(elt) for elt in v)
    return '&'.join(pairs)
816
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
831
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is checked to be pure ASCII and returned as a str;
    any other object is returned untouched.  Raises UnicodeError when a
    str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # Round-trip through ASCII purely as a validity check.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
844
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    text = str(url).strip()
    # Drop one pair of enclosing angle brackets, if present.
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    # Drop a leading 'URL:' marker, if present.
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
852
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased; when no type prefix is found the
    result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compile lazily on first use and keep the pattern at module level.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers the type plus its ':'; the remainder starts right
    # after it.
    return found.group(1).lower(), url[found.end():]
866
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  If what follows the
    host is non-empty and does not start with '/', a '/' is prepended.
    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # Compile lazily on first use and keep the pattern at module level.
        import re
        # '#' must end the host as well, otherwise a fragment such as
        # '#@evil.com' is treated as part of the host.  DOTALL lets the
        # remainder contain newlines instead of making the match fail.
        _hostprog = re.compile('^//([^/#?]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url
883
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Returns (None, host) when host
    contains no '@'.
    """
    # rpartition is used instead of a regular expression so that newlines
    # in the input neither prevent the split nor get silently dropped.
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host
895
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when there
    is no ':' in the argument.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile lazily on first use and keep the pattern at module level.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
907
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  Returns (host, None)
    when host does not end in ':' followed by at least one digit.
    """
    global _portprog
    if _portprog is None:
        # Compile lazily on first use and keep the pattern at module level.
        import re
        # \Z anchors at the true end of the string; '$' would also match
        # just before a trailing newline and so accept 'host:80\n'.
        # DOTALL keeps the host part from failing on embedded newlines.
        _portprog = re.compile(r'^(.*):([0-9]+)\Z', re.DOTALL)

    match = _portprog.match(host)
    if match:
        return match.group(1, 2)
    return host, None
920
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compile lazily on first use and keep the pattern at module level.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport

    host, port = found.group(1, 2)
    try:
        # An empty port counts as "not a valid number".
        nport = int(port) if port else None
    except ValueError:
        nport = None
    return host, nport
942
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    # rpartition replaces the former regular expression, whose pattern
    # used the invalid string escape '\?' and whose '.'/'$' mishandled
    # newlines in the input.
    path, delim, query = url.rpartition('?')
    if delim:
        return path, query
    return url, None
954
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    # rpartition is used instead of a regular expression so that newlines
    # in the input neither prevent the split nor get silently dropped.
    path, delim, tag = url.rpartition('#')
    if delim:
        return path, tag
    return url, None
966
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # Everything before the first ';' is the path; each later
    # ';'-separated piece is one attribute.
    path, *attrs = url.split(';')
    return path, attrs
972
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    # partition is used instead of a regular expression so that newlines
    # in the value neither prevent the split nor get silently dropped.
    name, delim, value = attr.partition('=')
    return name, (value if delim else None)