Blame - Lib/urllib/parse.py - platform/external/python/cpython3

2008-06-18 20:49:58 +0000

[diff] [blame]

1

"""Parse (absolute and relative) URLs.

2

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

3

urlparse module is based upon the following RFC specifications.

4

5

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding

6

and L. Masinter, January 2005.

7

8

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter

9

and L.Masinter, December 1999.

10

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

11

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

12

Berners-Lee, R. Fielding, and L. Masinter, August 1998.

13

David Malcolm

ee25568

2010-12-02 16:41:00 +0000

[diff] [blame]

14

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

15

16

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June

17

1995.

18

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

19

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

20

McCahill, December 1994

21

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

22

RFC 3986 is considered the current standard and any future changes to

23

urlparse module should conform with it. The urlparse module is

24

currently not entirely compliant with this RFC due to de facto

25

scenarios for parsing, and for backward compatibility purposes, some

26

parsing quirks from older RFCs are retained. The testcases in

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

27

test_urlparse.py provide a good indicator of parsing behavior.

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

28

"""

29

Serhiy Storchaka

2013-03-14 21:31:37 +0200

[diff] [blame]

30

import re

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

31

import sys

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

32

import collections

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

33

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

34

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",

Senthil Kumaran

0256b2a

2010-10-25 16:36:20 +0000

[diff] [blame]

35

"urlsplit", "urlunsplit", "urlencode", "parse_qs",

36

"parse_qsl", "quote", "quote_plus", "quote_from_bytes",

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

37

"unquote", "unquote_plus", "unquote_to_bytes"]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

38

39

# A classification of schemes ('' means apply by default)

40

# Schemes for which urljoin() resolves relative references.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh',
]

# Schemes whose URLs carry a network location ("//host...") component.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

# Schemes for which urlparse() splits ";params" off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp', 'tel',
]

# The three lists below are no longer consulted by this module.  They are
# undocumented but have public-looking names, so they are kept for
# backwards compatibility.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]

uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]

uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

61

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

62

# Characters valid in scheme names

63

# Lowercase letters, uppercase letters, digits, then '+', '-' and '.'.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789' '+-.'
)

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

68

# XXX: Consider replacing with functools.lru_cache

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

MAX_CACHE_SIZE = 20

_parse_cache = {}

def clear_cache():

Antoine Pitrou

2df5fc7

2009-12-08 19:38:17 +0000

[diff] [blame]

73

"""Clear the parse cache and the quoters cache."""

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

74

_parse_cache.clear()

Antoine Pitrou

2df5fc7

2009-12-08 19:38:17 +0000

[diff] [blame]

75

_safe_quoters.clear()

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

76

77

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

78

# Helpers for bytes handling

79

# For 3.2, we deliberately require applications that

80

# handle improperly quoted URLs to do their own

81

# decoding and encoding. If valid use cases are

82

# presented, we may relax this by using latin-1

83

# decoding internally for 3.3

84

_implicit_encoding = 'ascii'

85

_implicit_errors = 'strict'

def _noop(obj):

return obj

def _encode_result(obj, encoding=_implicit_encoding,

91

errors=_implicit_errors):

92

return obj.encode(encoding, errors)

93

94

def _decode_args(args, encoding=_implicit_encoding,

95

errors=_implicit_errors):

96

return tuple(x.decode(encoding, errors) if x else '' for x in args)

97

98

def _coerce_args(*args):

99

# Invokes decode if necessary to create str args

100

# and returns the coerced inputs along with

101

# an appropriate result coercion function

102

# - noop for str inputs

103

# - encoding function otherwise

104

str_input = isinstance(args[0], str)

105

for arg in args[1:]:

106

# We special-case the empty string to support the

107

# "scheme=''" default argument to some functions

108

if arg and isinstance(arg, str) != str_input:

109

raise TypeError("Cannot mix str and non-str arguments")

110

if str_input:

111

return args + (_noop,)

112

return _decode_args(args) + (_encode_result,)

113

114

# Result objects are more helpful than simple tuples

115

class _ResultMixinStr(object):

116

"""Standard approach to encoding parsed results from str to bytes"""

117

__slots__ = ()

118

119

def encode(self, encoding='ascii', errors='strict'):

120

return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))

121

122

123

class _ResultMixinBytes(object):

124

"""Standard approach to decoding parsed results from bytes to str"""

125

__slots__ = ()

126

127

def decode(self, encoding='ascii', errors='strict'):

128

return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))

129

130

131

class _NetlocResultMixinBase(object):

132

"""Shared methods for the parsed result objects containing a netloc element"""

133

__slots__ = ()

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

134

135

@property

136

def username(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

137

return self._userinfo[0]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

138

139

@property

140

def password(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

141

return self._userinfo[1]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

142

143

@property

144

def hostname(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

145

hostname = self._hostinfo[0]

146

if not hostname:

147

hostname = None

148

elif hostname is not None:

149

hostname = hostname.lower()

150

return hostname

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

151

152

@property

153

def port(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

154

port = self._hostinfo[1]

155

if port is not None:

156

port = int(port, 10)

Senthil Kumaran

2fc5a50

2012-05-24 21:56:17 +0800

[diff] [blame]

157

# Return None on an illegal port

158

if not ( 0 <= port <= 65535):

159

return None

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

return port

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Splits a str ``netloc`` into user-info and host-info pairs."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None when there is no '@'; password alone is None when
        the user-info part has no ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A host introduced by '[' runs up to the following ']' (brackets
        excluded).  The port is None when missing or empty.
        """
        host_part = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = host_part.partition('[')
        if open_bracket:
            hostname, _, after_bracket = bracketed.partition(']')
            port = after_bracket.partition(':')[2]
        else:
            hostname, _, port = host_part.partition(':')
        return hostname, port or None

191

192

193

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Splits a bytes ``netloc`` into user-info and host-info pairs."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None when there is no b'@'; password alone is None when
        the user-info part has no b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A host introduced by b'[' runs up to the following b']' (brackets
        excluded).  The port is None when missing or empty.
        """
        host_part = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = host_part.partition(b'[')
        if open_bracket:
            hostname, _, after_bracket = bracketed.partition(b']')
            port = after_bracket.partition(b':')[2]
        else:
            hostname, _, port = host_part.partition(b':')
        return hostname, port or None

221

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

222

223

from collections import namedtuple

224

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

225

_DefragResultBase = namedtuple('DefragResult', 'url fragment')

226

_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')

227

_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

228

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

229

# For backwards compatibility, alias _NetlocResultMixinStr

230

# ResultBase is no longer part of the documented API, but it is

231

# retained since deprecating it isn't worth the hassle

232

ResultBase = _NetlocResultMixinStr

233

234

# Structured result objects for string data

235

class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result with fields (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return url with '#' + fragment appended when fragment is set."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

242

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

243

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str result with fields (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt

247

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

248

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str result: (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt

252

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

253

# Structured result objects for bytes data

254

class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes result with fields (url, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return url with b'#' + fragment appended when fragment is set."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes result with fields (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt

266

267

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes result: (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Return the URL produced by passing this tuple to urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt

271

272

# Set up the encode/decode result pairs

273

def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    encode()/decode() on the result mixins look these attributes up to
    decide which class to instantiate.
    """
    str_and_bytes_types = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for str_type, bytes_type in str_and_bytes_types:
        str_type._encoded_counterpart = bytes_type
        bytes_type._decoded_counterpart = str_type


# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

285

286

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into six components.

    Returns (scheme, netloc, path, params, query, fragment).  The work is
    delegated to urlsplit(); afterwards ";params" is separated from the
    path, but only for schemes listed in uses_params.  Components are not
    broken down further and % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

301

302

def _splitparams(url):

303

if '/' in url:

304

i = url.find(';', url.rfind('/'))

if i < 0:

return url, ''

else:

i = url.find(';')

return url[:i], url[i+1:]

310

311

def _splitnetloc(url, start=0):

312

delim = len(url) # position of end of domain part of url, default is end

313

for c in '/?#': # look for delimiters; the order is NOT important

314

wdelim = url.find(c, start) # find first of this delim

315

if wdelim >= 0: # if found

316

delim = min(delim, wdelim) # use earliest delim position

317

return url[start:delim], url[delim:] # return (domain, rest)

318

319

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into five components.

    Returns (scheme, netloc, path, query, fragment).  Components are not
    broken down further and % escapes are not expanded.  Results are
    memoised in _parse_cache.  Raises ValueError("Invalid IPv6 URL") when
    the netloc contains only one of '[' and ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return _coerce_result(hit)
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: drop everything once the cache is full.
        clear_cache()

    colon = url.find(':')
    if colon > 0:
        head, tail = url[:colon], url[colon + 1:]
        if head == 'http':
            # Common case: accepted immediately, without the scheme
            # character scan or the port-number check below.
            scheme, url = head.lower(), tail
        elif all(ch in scheme_chars for ch in head):
            # An all-digit tail means "head:tail" may really be
            # "host:port", in which case head stays part of the path.
            if not tail or any(ch not in '0123456789' for ch in tail):
                scheme, url = head.lower(), tail

    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come as a pair.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

374

375

def urlunparse(components):
    """Reassemble a URL from the six items produced by urlparse().

    Non-empty params are re-attached to the path with ';' and the rest is
    delegated to urlunsplit().  The result may differ from the originally
    parsed URL in redundant delimiters (e.g. a '?' with an empty query)
    while remaining equivalent.
    """
    (scheme, netloc, path, params,
     query, fragment, _coerce_result) = _coerce_args(*components)
    if params:
        path = "%s;%s" % (path, params)
    joined = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(joined)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

385

386

def urlunsplit(components):
    """Combine the five items returned by urlsplit() into one URL.

    *components* may be any five-item iterable.  Empty pieces are left
    out together with their delimiter, so the result can differ from the
    originally parsed URL in unnecessary delimiters (for example a '?'
    with an empty query) while remaining equivalent.
    """
    (scheme, netloc, path,
     query, fragment, _coerce_result) = _coerce_args(*components)
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if needs_authority:
        # A relative path must be made absolute before "//netloc" is
        # placed in front of it.
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return _coerce_result(result)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

404

405

def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against *base*.

    A falsy *base* returns *url* unchanged and a falsy *url* returns
    *base* unchanged.  *url* is also returned as given when its scheme
    differs from the base scheme or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    def _finish(*parts):
        # Rebuild the URL from six components and coerce it back.
        return _coerce_result(urlunparse(parts))

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing comes from base.
            return _finish(scheme, netloc, path, params, query, fragment)
        netloc = bnetloc
    if path[:1] == '/':
        return _finish(scheme, netloc, path, params, query, fragment)
    if not path and not params:
        # Only query and/or fragment were given: reuse the base path.
        path, params = bpath, bparams
        if not query:
            query = bquery
        return _finish(scheme, netloc, path, params, query, fragment)

    # Merge: base path without its last segment, then the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly remove the first "name/.." pair until none is left.  The
    # first and last segments are never taken as the '..' of a pair.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                    and segments[idx - 1] not in ('', '..')):
                del segments[idx - 1:idx + 1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _finish(scheme, netloc, '/'.join(segments),
                   params, query, fragment)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

457

458

def urldefrag(url):
    """Split *url* into the URL without its fragment and the fragment.

    Returns a DefragResult (url, fragment).  When *url* contains no '#',
    the fragment is the empty string and the URL is passed through
    without being re-assembled.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(defrag, frag))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

473

Serhiy Storchaka

2013-03-14 21:31:37 +0200

[diff] [blame]

474

_hexdig = '0123456789ABCDEFabcdef'
# Table from a two-hex-digit bytes key (e.g. b'2f') to the single byte it
# denotes.  Built on the first unquote_to_bytes() call that needs it.
_hextobyte = None


def unquote_to_bytes(string):
    """Replace %xx escapes in *string* and return the result as bytes.

    unquote_to_bytes('abc%20def') -> b'abc def'.

    A str argument is encoded as UTF-8 first; this only matters if it
    contains unescaped non-ASCII characters, which URIs should not.  A
    '%' not followed by two hex digits is copied through unchanged.
    """
    global _hextobyte
    if not string:
        # Attribute access only: rejects empty non-string-like objects.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    # Delay building the table so that no memory is spent on it when the
    # function is never called with an escape.
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes([int(hi + lo, 16)])
                      for hi in _hexdig for lo in _hexdig}
    out = [pieces[0]]
    for piece in pieces[1:]:
        byte = _hextobyte.get(piece[:2])
        if byte is None:
            # Not a valid escape: put the '%' back and keep the text.
            out.append(b'%')
            out.append(piece)
        else:
            out.append(byte)
            out.append(piece[2:])
    return b''.join(out)

_asciire = re.compile('([\x00-\x7f]+)')

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

508

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

509

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes in *string* by the characters they encode.

    *encoding* and *errors* say how the unescaped bytes are decoded, as
    accepted by bytes.decode(); None selects the defaults, which are
    UTF-8 and replacing invalid sequences with a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Attribute access only: rejects non-string-like objects.
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # Splitting on a capturing pattern alternates non-ASCII text (even
    # positions) with ASCII runs (odd positions); only ASCII runs can
    # contain escapes.
    pieces = _asciire.split(string)
    out = [pieces[0]]
    for ascii_run, other in zip(pieces[1::2], pieces[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other)
    return ''.join(out)

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

533

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

534

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping names to lists of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: when true, blank values are kept as empty
        strings; when false (the default) they are dropped as if they
        had not been included.

    strict_parsing: when true, parsing errors raise ValueError; when
        false (the default) they are silently ignored.

    encoding and errors: how percent-encoded sequences are decoded into
        Unicode characters, as accepted by the bytes.decode() method.

    Values of a repeated name are collected in order of appearance.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        grouped.setdefault(name, []).append(value)
    return grouped

Facundo Batista

2008-09-03 22:49:01 +0000

[diff] [blame]

565

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

566

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: when true, blank values are kept as empty
        strings; when false (the default) they are dropped as if they
        had not been included.

    strict_parsing: when true, parsing errors raise ValueError; when
        false (the default) they are silently ignored.

    encoding and errors: how percent-encoded sequences are decoded into
        Unicode characters, as accepted by the bytes.decode() method.

    Fields are separated by '&' or ';'.  In names and values '+' becomes
    a space before % escapes are expanded.
    """
    qs, _coerce_result = _coerce_args(qs)
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    def _decode(text):
        # '+' -> ' ', expand % escapes, then coerce back to input type.
        text = text.replace('+', ' ')
        text = unquote(text, encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name with no equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            decoded_name = _decode(name)
            decoded_value = _decode(value)
            result.append((decoded_name, decoded_value))
    return result

613

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

614

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Decode an HTML form value: turn every '+' into a space and then
    expand percent-escapes the same way unquote() does.

    encoding and errors are handed to unquote() unchanged.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # The '+' substitution happens on the still-quoted text, so an escaped
    # plus sign ('%2B') is only decoded afterwards and is left as a plus.
    with_spaces = string.replace('+', ' ')
    return unquote(with_spaces, encoding, errors)

622

623

# Byte values that the quoting functions never percent-encode: the ASCII
# letters, the ASCII digits and the three characters '_', '.' and '-'.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
# The same byte values as a bytes object, for use with bytes.rstrip().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache filled by quote_from_bytes(): maps a normalized 'safe' bytes value
# to the bound __getitem__ of the Quoter built for it.
_safe_quoters = {}

Guido van Rossum

2008-08-06 19:31:34 +0000

[diff] [blame]

629

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

630

class Quoter(collections.defaultdict):
    """Lazily filled table from byte values (in range(0,256)) to strings.

    A byte that is in the safe set (the always-safe bytes plus the ones
    given to the constructor) maps to its own character; any other byte
    maps to its percent-encoded '%XX' form.
    """
    # Building on defaultdict gives an internal cache for free: once a key
    # has been stored, looking it up again doesn't call Python code at all.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the cached entries under our own name rather than as a
        # plain defaultdict.
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: work out the quoted form, remember it, hand it back.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode a URL component.  Every part of a URL (the path info,
    the query, and so on) has its own set of reserved characters that
    have to be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    The default value of safe targets the path section of a URL, so '/'
    is left alone: it is a reserved character, but when quoting a path
    the slashes already present are normally meant as separators.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either one
    together with a bytes string raises TypeError.

    For a str, encoding and errors say how non-ASCII characters are
    handled, as accepted by the str.encode method.  They default to
    encoding='utf-8' (characters are encoded with UTF-8) and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Bytes input: there is nothing to encode, so reject the options.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        # The empty str is returned as is.
        return string
    string = string.encode('utf-8' if encoding is None else encoding,
                           'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)

695

696

def quote_plus(string, safe='', encoding=None, errors=None):
    """Variant of quote() for HTML form values: ' ' is written as '+'.

    Plus signs already present in the string are escaped unless they are
    included in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be a str or a bytes object.  When it contains no space at
    # all, plain quote() already gives the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)

    # Let spaces pass through quote() untouched (the marker has to be of
    # the same type as safe), then swap them for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

712

713

def quote_from_bytes(bs, safe='/'):
    """Percent-encode a bytes (or bytearray) object.

    Works like quote() but skips the str-to-bytes encoding step.  The
    result is always an ASCII str.  A TypeError is raised when bs is
    neither bytes nor bytearray.

    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Shortcut: if stripping safe bytes from the right leaves nothing,
    # every byte is safe and the input only needs decoding.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # Reuse the per-'safe' lookup function, creating it on first use.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

734

Senthil Kumaran

2010-07-03 17:48:22 +0000

[diff] [blame]

735

def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    When doseq is true and a value in the query arg is a sequence, every
    element of that sequence becomes a parameter of its own.

    When the query arg is a sequence of two-element tuples, the
    parameters appear in the output in the same order as in the input.

    Each component of the query arg may be a string or a bytes object.
    For a string component, the safe, encoding and errors parameters are
    passed on to quote_plus(); a bytes component is quoted with safe only.

    A TypeError is raised when query is neither a mapping nor a
    non-string sequence.
    """

    def quote_component(item):
        # bytes are quoted directly; anything else is converted with
        # str() first and then encoded using encoding/errors.
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Objects without a length fail in len(); a non-empty string
            # fails the tuple check on its first element.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of any type pass this check.  That is
            # a minor nit, kept because empty dicts have always been
            # accepted.
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    params = []
    if not doseq:
        for k, v in query:
            params.append(quote_component(k) + '=' + quote_component(v))
    else:
        for k, v in query:
            k = quote_component(k)
            if isinstance(v, bytes):
                params.append(k + '=' + quote_plus(v, safe))
            elif isinstance(v, str):
                params.append(k + '=' + quote_plus(v, safe, encoding, errors))
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # Not a sequence: encode it as a single value.
                    params.append(
                        k + '=' + quote_plus(str(v), safe, encoding, errors))
                else:
                    # A sequence: one parameter per element.
                    for elt in v:
                        params.append(k + '=' + quote_component(elt))
    return '&'.join(params)

812

813

# Utilities to parse URLs (most of these return None for missing parts):

814

# unwrap('<URL:type://host/path>') --> 'type://host/path'

815

# splittype('type:opaquestring') --> 'type', 'opaquestring'

816

# splithost('//host[:port]/path') --> 'host[:port]', '/path'

817

# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'

818

# splitpasswd('user:passwd') -> 'user', 'passwd'

819

# splitport('host:port') --> 'host', 'port'

820

# splitquery('/path?query') --> '/path', 'query'

821

# splittag('/path#tag') --> '/path', 'tag'

822

# splitattr('/path;attr1=value1;attr2=value2;...') ->

823

# '/path', ['attr1=value1', 'attr2=value2', ...]

824

# splitvalue('attr=value') --> 'attr', 'value'

825

# urllib.parse.unquote('abc%20def') -> 'abc def'

826

# quote('abc def') -> 'abc%20def'

827

Georg Brandl

13e8946

2008-07-01 19:56:00 +0000

[diff] [blame]

828

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked for being pure ASCII and returned as a str; a
    UnicodeError is raised if it is not.  Any other object is returned
    unchanged.
    """
    # Most URL schemes require ASCII.  If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        # Round-trip through ASCII: this only validates the characters.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; enclosing angle brackets and a leading 'URL:' marker are
    then removed when present.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When the url has no leading
    'type:' part, the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept for later calls.
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match starts at index 0 and ends right after the ':'.
    return found.group(1).lower(), url[found.end():]

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    url is expected to start with '//'.  The host part ends at the first
    '/', '?' or '#'; everything after it is returned as the path, with a
    '/' put in front when it does not already start with one.  When url
    does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # '#' ends the host just like '/' and '?' do, so a fragment such
        # as '#@evil.com' can never be taken for part of the host.
        # DOTALL lets the rest of the URL contain newlines.
        _hostprog = re.compile(r'^//([^/#?]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split is made at the last '@'.  When there is no '@' at all, the
    result is (None, host).
    """
    global _userprog
    if _userprog is None:
        # DOTALL keeps the split working when the user info contains a
        # newline, the same way splitpasswd() allows one in a password.
        _userprog = re.compile(r'^(.*)@(.*)$', re.DOTALL)

    match = _userprog.match(host)
    if match:
        return match.group(1, 2)
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split is made at the first ':'; the password may itself contain
    ':' and newlines.  Without a ':' the result is (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()

899

900

# splitport('host:port') --> 'host', 'port'

901

_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  It is None when there
    is no ':' followed only by digits at the end, or when nothing at all
    follows that ':' (in which case the ':' is dropped from the host).
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('^(.*):([0-9]*)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    hostname, port = found.groups()
    # An empty port ('host:') is reported as None.
    return hostname, (port or None)

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    A valid number consists of ASCII digits only.  When nothing follows
    the last ':', the default port is returned and the ':' is dropped
    from the host.
    """
    global _nportprog
    if _nportprog is None:
        _nportprog = re.compile(r'^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        if port:
            # int() on its own is too permissive for a port: it accepts a
            # sign, surrounding whitespace, '_' separators and non-ASCII
            # digits.  Only a run of ASCII digits counts as a number here.
            if all(ch in '0123456789' for ch in port):
                nport = int(port)
            else:
                nport = None
            return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  When url contains no '?', the
    result is (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Raw string: '\?' is not a valid escape in an ordinary string
        # literal.  DOTALL keeps the split working when the part before
        # the '?' contains a newline.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$', re.DOTALL)

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  When url contains no '#', the
    result is (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # DOTALL keeps the split working when the part before the '#'
        # contains a newline.
        _tagprog = re.compile(r'^(.*)#([^#]*)$', re.DOTALL)

    match = _tagprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs

963

964

_valueprog = None

965

def splitvalue(attr):

966

"""splitvalue('attr=value') --> 'attr', 'value'."""

967

global _valueprog

968

if _valueprog is None:

Jeremy Hylton