blob: 94d77ebbf0a77ffb036a1bcb9e9a0a8c1900b952 [file] [log] [blame]
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
6
Facundo Batista2ac5de22008-07-07 18:24:11 +00007import sys
Guido van Rossum52dbbb92008-08-18 21:44:30 +00008import collections
Facundo Batista2ac5de22008-07-07 18:24:11 +00009
# Names exported by ``from <this module> import *``.  urlencode() is a public,
# documented entry point of this module, so it is exported alongside the
# parsing and quoting helpers.
__all__ = [
    "urlparse", "urlunparse", "urljoin", "urldefrag",
    "urlsplit", "urlunsplit", "urlencode",
    "quote", "quote_plus", "quote_from_bytes",
    "unquote", "unquote_plus", "unquote_to_bytes",
]
Jeremy Hylton1afc1692008-06-18 20:49:58 +000014
# Scheme classification tables.  The empty string '' is the entry that makes
# a rule apply when a URL carries no scheme at all.  These stay plain lists
# (same element order as before) so existing code can still extend them.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp',
]
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)
39
# Number of memoized urlsplit() results at which the memo table is emptied.
MAX_CACHE_SIZE = 20
# Memo table for urlsplit(), keyed on its arguments and their types.
_parse_cache = {}


def clear_cache():
    """Discard every memoized urlsplit() result."""
    _parse_cache.clear()
46
47
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    The host class must expose a ``netloc`` string attribute of the form
    ``[user[:password]@]host[:port]``.  Each property re-derives its value
    from ``netloc`` on every access and returns None when the corresponding
    piece is absent.
    """

    def _hostinfo(self):
        """Return ``(host, port)`` as raw strings taken from ``netloc``.

        ``port`` is None when there is no port delimiter.  A bracketed IPv6
        literal such as ``[::1]:80`` is recognised so that the colons inside
        the brackets are not mistaken for the port delimiter.
        """
        # Everything after the last '@' is host information; rpartition
        # yields the whole netloc when there is no '@'.
        hostinfo = self.netloc.rpartition("@")[2]
        if hostinfo[:1] == "[" and "]" in hostinfo:
            host, _, trailer = hostinfo[1:].partition("]")
            port = trailer[1:] if trailer[:1] == ":" else None
        elif ":" in hostinfo:
            host, _, port = hostinfo.partition(":")
        else:
            host, port = hostinfo, None
        return host, port

    @property
    def username(self):
        """User name from the userinfo part, or None without an '@'."""
        netloc = self.netloc
        if "@" not in netloc:
            return None
        userinfo = netloc.rsplit("@", 1)[0]
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password from the userinfo part, or None if none is given."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name (IPv6 brackets removed), or None if empty."""
        host = self._hostinfo()[0]
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when no port (or an empty one) is given.

        Raises ValueError when the port text is present but not a base-10
        integer.
        """
        port = self._hostinfo()[1]
        if not port:
            # Covers both "host" and the degenerate "host:" form.
            return None
        return int(port, 10)
88
89from collections import namedtuple
90
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field named tuple of URL components plus ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Rebuild a URL string from these components with urlunsplit()."""
        return urlunsplit(self)
97
98
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field named tuple of URL components plus ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Rebuild a URL string from these components with urlunparse()."""
        return urlunparse(self)
105
106
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult 6-tuple:
    (scheme, netloc, path, params, query, fragment).

    The splitting itself is delegated to urlsplit(); this function only
    separates ';params' from the path, and only for schemes listed in
    uses_params.  Components are not broken into smaller pieces (netloc stays
    a single string) and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
120
def _splitparams(url):
    """Split *url* at the first ';' of its last path segment.

    Return (path, params).  When the path contains a '/' but its last segment
    has no ';', the result is (url, '').
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        # Only a ';' at or after the final '/' introduces parameters.
        cut = url.find(';', last_slash)
        if cut < 0:
            return url, ''
    else:
        cut = url.find(';')
    return url[:cut], url[cut + 1:]
129
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc starting at *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none of them occurs.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
137
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult 5-tuple: (scheme, netloc, path, query, fragment).
    *scheme* is the default used when *url* has no valid scheme prefix.
    Components are not broken into smaller pieces (netloc stays a single
    string) and % escapes are not expanded.

    Results are memoized in _parse_cache; once the cache holds
    MAX_CACHE_SIZE entries it is emptied before the next result is stored.
    """
    allow_fragments = bool(allow_fragments)
    key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # 'http' is special-cased: netloc, query and fragment are always split
    # for it, without consulting the uses_* tables.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        candidate = url[:colon]
        is_http = candidate == 'http'
        # A prefix only counts as a scheme if all its characters are legal.
        if is_http or all(ch in scheme_chars for ch in candidate):
            scheme, url = candidate.lower(), url[colon + 1:]

    if (is_http or scheme in uses_netloc) and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and (is_http or scheme in uses_fragment) and '#' in url:
        url, _, fragment = url.partition('#')
    if (is_http or scheme in uses_query) and '?' in url:
        url, _, query = url.partition('?')

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
180
def urlunparse(components):
    """Reassemble a 6-tuple as produced by urlparse() into a URL string.

    The result may differ slightly from the URL originally parsed, while
    remaining equivalent, if the original had redundant delimiters such as a
    '?' followed by an empty query.
    """
    scheme, netloc, path, params, query, fragment = components
    # Non-empty params are folded back into the path after a ';'.
    path_and_params = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, path_and_params, query, fragment))
190
def urlunsplit(components):
    """Reassemble a 5-tuple as produced by urlsplit() into a URL string.

    A '//netloc' prefix is emitted when netloc is non-empty, and also for a
    scheme listed in uses_netloc whose path does not already begin with '//'.
    Empty scheme, query and fragment are omitted along with their delimiter.
    """
    scheme, netloc, url, query, fragment = components
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        # A relative path must be made absolute before a netloc precedes it.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    for delimiter, part in (('?', query), ('#', fragment)):
        if part:
            url = url + delimiter + part
    return url
203
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If *base* is empty, *url* is returned unchanged; if *url* is empty,
    *base* is returned unchanged.  *url* is also returned unchanged when its
    scheme differs from the base's or is not listed in uses_relative.
    *allow_fragments* is passed through to urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The reference is parsed with the base's scheme as its default scheme.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, then params and query if the
        # reference leaves them empty as well.
        path = bpath
        if not params:
            params = bparams
        else:
            # NOTE(review): this drops the last character of the base path
            # when the reference supplies its own params -- intent not
            # evident from this code; confirm before changing.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # reference's segments, then resolve '.' and '..'.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' becomes an empty segment so the result ends in '/'.
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete one "<name>/.." pair, rescanning from the start after
    # each deletion, until a full scan finds none.  The first and last
    # segments are never examined as the '..' of a pair.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' left in final position is handled separately.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
260
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Return a (defragmented_url, fragment) tuple.  When *url* contains no '#'
    it is returned untouched together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
274
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be a str or a bytes-like object; a str is first encoded as
    UTF-8 (which only matters if it contains unescaped non-ASCII characters,
    which URIs should not).  Always returns bytes.

    Only a '%' followed by exactly two hexadecimal digits is an escape.
    Anything else ('%', '%f', '%+f', '%zz', ...) is copied through unchanged.
    """
    if isinstance(string, str):
        string = string.encode('utf-8')
    hexdigits = b'0123456789ABCDEFabcdef'
    pieces = string.split(b'%')
    # pieces[0] precedes the first '%' and is therefore never an escape.
    out = [pieces[0]]
    for piece in pieces[1:]:
        code = piece[:2]
        # Check the digits explicitly: int() would also accept forms such as
        # 'f', '+f' or ' f', which are not valid percent-escapes.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code.decode('ascii'), 16)]))
            out.append(piece[2:])
        else:
            out.append(b'%')
            out.append(piece)
    return b''.join(out)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000290
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional *encoding* and *errors* parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are decoded
    with UTF-8, and invalid sequences are replaced by a placeholder
    character.  Passing None for either selects its default.

    Only a '%' followed by exactly two hexadecimal digits is an escape;
    anything else is copied through unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = string.split('%')
    out = [pieces[0]]
    # Bytes of the current run of adjacent escapes.  A run is decoded as a
    # whole because one character may span several escaped bytes.
    pending = bytearray()
    for piece in pieces[1:]:
        code = piece[:2]
        # Explicit digit check: bytes.fromhex() skips whitespace, so it would
        # silently swallow input such as '%  '.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pending.append(int(code, 16))
            tail = piece[2:]
        else:
            tail = '%' + piece
        if tail:
            # Literal text ends the run: flush it, then emit the text.
            out.append(bytes(pending).decode(encoding, errors) + tail)
            pending = bytearray()
    if pending:
        # The string ended inside a run of escapes.
        out.append(bytes(pending).decode(encoding, errors))
    return ''.join(out)
331
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also turn plus signs into spaces, as needed when
    unquoting HTML form values.

    The '+' replacement happens before the %xx escapes are expanded.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
340
# Byte values that are never percent-encoded, whatever 'safe' says.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# Quoter instances, keyed by their 'safe' bytes.
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from byte values (in range(0,256)) to strings.

    A byte maps to its own character when it is below 128 and in the safe
    set (the caller's set plus the always-safe set), and to its '%XX' escape
    otherwise.
    """
    # Each answer is stored in the dict on first request, so later lookups
    # of the same byte never reach __missing__.

    def __init__(self, safe):
        """safe: bytes object."""
        extra = frozenset(c for c in safe if c < 128)
        self.safe = _ALWAYS_SAFE | extra

    def __repr__(self):
        # Without this, the object would display as a defaultdict.
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%%%02X' % b
        self[b] = quoted
        return quoted
368
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default this function targets the path section of a URL, so '/' is
    left unencoded: in a path the existing slashes already act as
    delimiters.

    string and safe may be either str or bytes objects.  encoding and errors
    may only be given when string is a str; passing either one together with
    a bytes string raises TypeError.

    For a str, encoding and errors say how to turn it into bytes, as
    accepted by the str.encode method.  The defaults are encoding='utf-8'
    and errors='strict' (unsupported characters raise UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes: there is nothing to encode, so reject the options.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
410
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  It also does not have safe default to '/'.

    string and safe may each be str or bytes.  encoding and errors are
    forwarded to quote() in every case, including when the string contains
    spaces.
    """
    # The membership test needs a needle of the same type as the haystack.
    if isinstance(string, str):
        has_space = ' ' in string
    else:
        has_space = b' ' in string
    if not has_space:
        return quote(string, safe, encoding, errors)
    # Treat the space as safe during quoting, then swap it for '+'; literal
    # '+' characters have already been escaped by quote() at that point.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
422
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    *safe* may be str or bytes-like; non-ASCII characters of a str *safe*
    are ignored.  Raises TypeError if *bs* is neither bytes nor bytearray.

    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    cache_key = bytes(safe)  # a bytearray cannot be a dict key
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected a bytes")
    # One Quoter per distinct 'safe' value, created on first use.
    quoter = _safe_quoters.get(cache_key)
    if quoter is None:
        quoter = _safe_quoters[cache_key] = Quoter(safe)
    return ''.join([quoter[byte] for byte in bs])
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000440
def _quote_query_part(value):
    """Form-quote one key or value; bytes are quoted as-is, others via str()."""
    if isinstance(value, bytes):
        return quote_plus(value)
    return quote_plus(str(value))


def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.  str and bytes
    values are always treated as single values, never as sequences.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values that are bytes are quoted directly; anything else is
    converted with str() first.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (a non-empty string, for instance).
    """
    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so make sure
        # this really is a sequence of tuples.
        try:
            # Non-sequences fail on len(); non-empty strings fail the tuple
            # test.  Empty sequences of any type pass, like empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            # Keep the traceback of the failing probe on the new exception.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    for key, value in query:
        key = _quote_query_part(key)
        if not doseq or isinstance(value, (str, bytes)):
            pairs.append(key + '=' + _quote_query_part(value))
            continue
        try:
            # Anything supporting len() is treated as a sequence of values.
            len(value)
        except TypeError:
            # not a sequence
            pairs.append(key + '=' + _quote_query_part(value))
        else:
            # one key=value pair per element
            for element in value:
                pairs.append(key + '=' + _quote_query_part(element))
    return '&'.join(pairs)
503
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'

518
def to_bytes(url):
    """Check that a str URL is pure ASCII and return it.

    Despite the name, a str argument comes back as a str (it is encoded to
    ASCII and decoded again).  Arguments that are not str are returned
    unchanged.  Raises UnicodeError if the str contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
531
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; one enclosing '<...>' pair and a leading 'URL:' marker are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
539
# Compiled on first use by splittype().
_typeprog = None


def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  Returns (None, url) when url does not
    begin with a non-empty run of characters other than '/' and ':' followed
    by a ':'.
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match covers the scheme plus its ':', so the remainder of the URL
    # starts where the match ends.
    return found.group(1).lower(), url[found.end():]
553
# Compiled on first use by splithost().
_hostprog = None


def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
565
# Compiled on first use by splituser().
_userprog = None


def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@', and both parts are passed through
    unquote().  Returns (None, host) when host contains no '@'.  The result
    is a 2-tuple in both cases.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        # map() is lazy in Python 3; build the tuple so both return paths
        # have the same type and the result supports indexing and len().
        return tuple(map(unquote, match.group(1, 2)))
    return None, host
577
# Compiled on first use by splitpasswd().
_passwdprog = None


def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when there is
    no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
589
# splitport('host:port') --> 'host', 'port'
# Compiled on first use by splitport().
_portprog = None


def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of decimal digits.  Returns
    (host, None) when host does not end in ':' followed by at least one
    digit.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
602
# Compiled on first use by splitnport().
_nportprog = None


def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    Return (host, defport) if host contains no ':'; defport defaults to -1.
    Otherwise split at the last ':' and return the text after it converted
    with int(), or None if that text is empty or not a valid number.
    """
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    hostname, porttext = found.groups()
    try:
        number = int(porttext) if porttext else None
    except ValueError:
        number = None
    return hostname, number
624
# Compiled on first use by splitquery().
_queryprog = None


def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        # Raw string: '\?' is not a valid escape in an ordinary string
        # literal and triggers an invalid-escape warning at compile time.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
636
# Compiled on first use by splittag().
_tagprog = None


def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
648
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attributes = url.split(';')
    return path, attributes
654
# Compiled on first use by splitvalue().
_valueprog = None


def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
666
# Default input for test().  The first non-blank line is the base URL; every
# following line is "<relative-url> = <expected joined URL>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""


def test():
    """Print the urlparse() and urljoin() results for a list of URLs.

    Input is read from the file named by the first command-line argument
    ('-' means standard input), or from test_input when no argument is
    given.  The first URL becomes the base for all later joins.  A line of
    the form "url = expected" whose joined result differs from *expected*
    gets an extra EXPECTED line.
    """
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        fp = sys.stdin if fn == '-' else open(fn)
    else:
        from io import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        # Close what this function opened; stdin belongs to the caller.
        if fp is not sys.stdin:
            fp.close()


if __name__ == '__main__':
    test()