Blame - Lib/urllib/parse.py - platform/external/python/cpython3

2008-06-18 20:49:58 +0000

[diff] [blame]

1

"""Parse (absolute and relative) URLs.

2

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

3

urlparse module is based upon the following RFC specifications.

4

5

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding

6

and L. Masinter, January 2005.

7

8

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter

9

and L.Masinter, December 1999.

10

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

11

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

12

Berners-Lee, R. Fielding, and L. Masinter, August 1998.

13

David Malcolm

ee25568

2010-12-02 16:41:00 +0000

[diff] [blame]

14

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

15

16

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June

17

1995.

18

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

19

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

20

McCahill, December 1994

21

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

22

RFC 3986 is considered the current standard and any future changes to

23

urlparse module should conform with it. The urlparse module is

24

currently not entirely compliant with this RFC due to de facto

25

scenarios for parsing, and for backward compatibility purposes, some

26

parsing quirks from older RFCs are retained. The testcases in

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

27

test_urlparse.py provide a good indicator of parsing behavior.

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

28

"""

29

Serhiy Storchaka

2013-03-14 21:31:37 +0200

[diff] [blame]

30

import re

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

31

import sys

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

32

import collections

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

33

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

34

# Public names exported by ``from <this module> import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

40

41

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

64

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

65

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

71

# XXX: Consider replacing with functools.lru_cache
# Upper bound on the number of entries kept in _parse_cache before it is
# flushed wholesale (see urlsplit).
MAX_CACHE_SIZE = 20
# Maps urlsplit() argument keys to their SplitResult.
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    # NOTE(review): _safe_quoters is not defined in the visible part of this
    # file; it is expected to be a module-level cache with a .clear() method.
    _safe_quoters.clear()

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

79

80

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

81

# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    """Return *obj* unchanged (result coercion used for str inputs)."""
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode *obj* by calling its ``encode(encoding, errors)`` method.

    Used as the result coercion function when the caller passed non-str
    arguments; *obj* may be a str or a result object exposing ``encode``.
    """
    return obj.encode(encoding, errors)

96

97

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (e.g. ``b''`` or the ``''`` default scheme) become ``''``
    without calling ``decode`` on them.
    """
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

100

101

def _coerce_args(*args):
    """Coerce all arguments to str and return them with a result coercer.

    Returns ``args + (coerce,)`` where *coerce* is:
      - ``_noop`` when the first argument is a str, or
      - ``_encode_result`` otherwise (the arguments are decoded first).

    Raises TypeError if non-empty str and non-str arguments are mixed.
    """
    # The type of the first argument decides how everything is treated.
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

116

117

# Result objects are more helpful than simple tuples

118

class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart with every field encoded.

        ``_encoded_counterpart`` is a class attribute expected on the
        concrete result class; it is called with the encoded fields.
        """
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))

124

125

126

class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart with every field decoded.

        ``_decoded_counterpart`` is a class attribute expected on the
        concrete result class; it is called with the decoded fields.
        """
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))

132

133

134

class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    # Subclasses provide two properties:
    #   _userinfo -> (username, password)
    #   _hostinfo -> (hostname, port)   with port still in textual form

    @property
    def username(self):
        """User name from the netloc, or None if there is no userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password from the netloc, or None if none was given."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """Port as an int, or None when no port is present.

        Raises ValueError if the port is not a base-10 integer or is
        outside the range 0-65535.
        """
        port = self._hostinfo[1]
        if port is not None:
            try:
                port = int(port, 10)
            except ValueError:
                # Same exception type as before, with a message that names
                # the offending text instead of int()'s generic one.
                raise ValueError(
                    "Port could not be cast to integer value as %r" % (port,)
                ) from None
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """netloc splitting for str results (reads ``self.netloc``)."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) parsed from ``user:pass@host``.

        Both are None when the netloc has no '@'; password is None when
        the userinfo has no ':'.
        """
        netloc = self.netloc
        # rpartition: the *last* '@' separates userinfo from host info.
        userinfo, have_info, _ = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) with port as text, or None if absent."""
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            # Bracketed (IPv6-style) host: the port follows the ']'.
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port

193

194

195

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """netloc splitting for bytes results (reads ``self.netloc``)."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) parsed from ``user:pass@host``.

        Both are None when the netloc has no b'@'; password is None when
        the userinfo has no b':'.
        """
        netloc = self.netloc
        # rpartition: the *last* b'@' separates userinfo from host info.
        userinfo, have_info, _ = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) with port as bytes, or None if absent."""
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            # Bracketed (IPv6-style) host: the port follows the b']'.
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port

223

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

224

225

from collections import namedtuple

226

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

227

# Plain namedtuple bases for the result classes; the concrete str/bytes
# result classes combine these with the mixins.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__

292

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

293

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

294

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

298

299

# Structured result objects for string data

300

class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result of urldefrag(): ``(url, fragment)``."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, re-attaching '#fragment' when it is non-empty."""
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

307

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

308

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str result of urlsplit(): a 5-tuple with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result via urlunsplit()."""
        return urlunsplit(self)

312

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

313

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str result of urlparse(): a 6-tuple with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result via urlunparse()."""
        return urlunparse(self)

317

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

318

# Structured result objects for bytes data

319

class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes result of urldefrag(): ``(url, fragment)``."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, re-attaching b'#fragment' when it is non-empty."""
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes result of urlsplit(): a 5-tuple with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result via urlunsplit()."""
        return urlunsplit(self)

331

332

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes result of urlparse(): a 6-tuple with netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result via urlunparse()."""
        return urlunparse(self)

336

337

# Set up the encode/decode result pairs

338

def _fix_result_transcoding():
    """Link each str result class with its bytes counterpart.

    Sets ``_encoded_counterpart`` on the str class and
    ``_decoded_counterpart`` on the bytes class, which the mixins'
    ``encode()`` / ``decode()`` methods rely on.
    """
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

347

348

_fix_result_transcoding()

349

del _fix_result_transcoding

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

350

351

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    # From here on ``url`` holds only the path component.
    scheme, netloc, url, query, fragment = splitresult
    # Params are split off only for schemes listed in uses_params.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

366

367

def _splitparams(url):
    """Split ``path;params`` into ``(path, params)``.

    Only a ';' in the last path segment (after the final '/') starts the
    params.  Returns ``(url, '')`` when there is no such ';'.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    # Guard both branches: without it, a missing ';' in a slash-less path
    # would slice with i == -1 and return (url[:-1], url).
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]

375

376

def _splitnetloc(url, start=0):
    """Split *url* at the end of its network location.

    Scanning begins at index *start* (callers pass 2 to skip a leading
    '//').  Returns ``(netloc, rest)`` where *rest* begins at the first
    '/', '?' or '#' found, or is empty if none is found.
    """
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

383

384

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains an
    unbalanced '[' or ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached is not None:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        prefix = url[:i]
        if prefix == 'http': # optimize the common case
            # The fast path deliberately skips the port-number check below.
            scheme, url = prefix, url[i+1:]
        elif all(c in scheme_chars for c in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = prefix.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

439

440

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    if params:
        # Params are re-attached to the path before delegating.
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

450

451

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    # Emit '//' when there is a netloc, or when the scheme is one that uses
    # a netloc and the path does not already begin with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

469

470

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one that does not support relative
    # references, means *url* is returned as-is.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    # Empty path and params: inherit the base path (and its query, unless
    # the reference carries its own).
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))

Antoine Pitrou

2014-08-21 19:16:17 -0400

[diff] [blame]

537

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

538

539

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        # Re-assemble the URL from its parsed parts with an empty fragment.
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

554

Serhiy Storchaka

2013-03-14 21:31:37 +0200

[diff] [blame]

555

_hexdig = '0123456789ABCDEFabcdef'
# Lazily built table mapping two-hex-digit bytes (e.g. b'2F') to the single
# byte they denote; populated on the first unquote_to_bytes() call.
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    *string* may be str or bytes.  Malformed escapes (a '%' not followed
    by two hex digits) are left in the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        # No '%' at all: nothing to unquote.
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid %xx escape: put the '%' back and keep the text.
            append(b'%')
            append(item)
    return b''.join(res)

_asciire = re.compile('([\x00-\x7f]+)')

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

589

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

590

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Attribute access only: raises early for non string-like input.
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # Splitting on a capturing group alternates non-ASCII text (even
    # indexes) with ASCII runs (odd indexes); only the latter are unquoted.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

614

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

615

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag telling whether blank values in
            percent-encoded queries are kept.  A true value keeps them
            as blank strings; the default false value drops them as if
            they had not been included.

        strict_parsing: flag telling what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().
    """
    parsed_result = {}
    fields = parse_qsl(qs, keep_blank_values, strict_parsing,
                       encoding=encoding, errors=errors)
    for name, value in fields:
        # Values of a repeated name accumulate in order of appearance.
        parsed_result.setdefault(name, []).append(value)
    return parsed_result

Facundo Batista

2008-09-03 22:49:01 +0000

[diff] [blame]

646

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

647

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag telling whether blank values in
            percent-encoded queries are kept.  A true value keeps them
            as blank strings; the default false value drops them as if
            they had not been included.

        strict_parsing: flag telling what to do with parsing errors.  If
            false (the default), errors are silently ignored.  If true,
            errors raise a ValueError exception.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        Returns a list of (name, value) tuples in query order.
    """
    qs, _coerce_result = _coerce_args(qs)
    # Both '&' and ';' separate fields.
    fields = [field for chunk in qs.split('&') for field in chunk.split(';')]
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        raw_name, equals, raw_value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control-name with no equal sign counts as a blank value.
            if not keep_blank_values:
                continue
        if not (raw_value or keep_blank_values):
            continue
        name = raw_name.replace('+', ' ')
        name = unquote(name, encoding=encoding, errors=errors)
        name = _coerce_result(name)
        value = raw_value.replace('+', ' ')
        value = unquote(value, encoding=encoding, errors=errors)
        value = _coerce_result(value)
        result.append((name, value))
    return result

694

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

695

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Same as unquote(), except that plus signs are first turned into
    spaces, which is what unquoting HTML form values requires.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)

703

704

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

705

b'abcdefghijklmnopqrstuvwxyz'

706

b'0123456789'

707

b'_.-')

Florent Xicluna

2010-05-17 17:33:07 +0000

[diff] [blame]

708

_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)

709

_safe_quoters = {}

Guido van Rossum

2008-08-06 19:31:34 +0000

[diff] [blame]

710

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

711

class Quoter(collections.defaultdict):
    """Mapping from a byte value (an int in range(0, 256)) to its quoted
    string form.

    A byte maps to the character itself when it belongs to the safe set
    (the always-safe set plus whatever was passed as 'safe'); any other
    byte maps to its '%XX' percent-encoded form.
    """
    # The dict itself is the cache: __missing__ only runs the first time a
    # byte is looked up, later lookups never reach Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Otherwise the instance would be displayed as a plain defaultdict.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, hand it back.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Every part of a URL (the path info, the query, ...) has its own set
    of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some URL component, though not
    necessarily in all.

    By default quote() is meant for the path section of a URL, so '/' is
    left alone: it is reserved, but in a path the slashes already present
    are being used in their reserved role.

    string and safe may each be a str or a bytes object.  encoding and
    errors must not be given when string is a bytes object.

    The optional encoding and errors parameters say how non-ASCII
    characters are handled, as accepted by str.encode().  The defaults
    are encoding='utf-8' and errors='strict' (an unsupported character
    raises UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Bytes input: there is nothing to encode, so reject the options.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    raw = string.encode('utf-8' if encoding is None else encoding,
                        'strict' if errors is None else errors)
    return quote_from_bytes(raw, safe)

776

777

def quote_plus(string, safe='', encoding=None, errors=None):
    """Same as quote(), except that ' ' becomes '+', which is what quoting
    HTML form values requires.  A plus sign already present in the input
    is escaped unless it is listed in safe.  Unlike quote(), safe does not
    default to '/'.
    """
    # string may be str or bytes; when it holds no space at all, plain
    # quote() already gives the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)
    # Let spaces through quote() untouched, then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

793

794

def quote_from_bytes(bs, safe='/'):
    """Same as quote(), but takes a bytes object instead of a str and does
    no string-to-bytes encoding.  The result is always an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Nothing left after stripping safe bytes: no quoting is needed.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One Quoter lookup function is kept per distinct safe set.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

815

R David Murray

2015-05-17 20:44:50 -0400

[diff] [blame]

816

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Turn a dict or a sequence of two-element tuples into a URL query
    string.

    When doseq is true and a value in the query arg is a sequence, every
    element of that sequence becomes a parameter of its own.

    When the query arg is a sequence of two-element tuples, parameters
    appear in the output in the same order as in the input.

    Each component of a query arg may be either a string or a bytes type.

    safe, encoding and errors are handed on to the function given as
    quote_via (encoding and errors only when the component is a str).
    """

    def quote_part(part):
        # bytes go to quote_via untouched; anything else is passed
        # through str() first.
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, which is a
        # bother here.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple check.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Empty sequences of any type pass.  Empty dicts were always
            # accepted, so this is kept for consistency.
        except TypeError as exc:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                exc.__traceback__)

    params = []
    for k, v in query:
        k = quote_part(k)
        if not doseq:
            v = quote_part(v)
            params.append(k + '=' + v)
        elif isinstance(v, bytes):
            v = quote_via(v, safe)
            params.append(k + '=' + v)
        elif isinstance(v, str):
            v = quote_via(v, safe, encoding, errors)
            params.append(k + '=' + v)
        else:
            try:
                # Having a length is the test used for sequence-ness.
                len(v)
            except TypeError:
                # Not a sequence: encode it as a single value.
                v = quote_via(str(v), safe, encoding, errors)
                params.append(k + '=' + v)
            else:
                # One parameter per element of the sequence.
                for elt in v:
                    params.append(k + '=' + quote_part(elt))
    return '&'.join(params)

895

Georg Brandl

13e8946

2008-07-01 19:56:00 +0000

[diff] [blame]

896

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is returned unchanged when it is pure ASCII and raises
    UnicodeError otherwise; any other object is returned as-is.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    text = str(url).strip()
    # Drop one pair of enclosing angle brackets, if present.
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    # Drop a leading 'URL:' marker, if present.
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

# Compiled on first use by splittype().
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  When no type prefix is found the
    result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme, data = found.groups()
    return scheme.lower(), data

# Compiled on first use by splithost().
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#', the characters that
    terminate the authority component in RFC 3986 section 3.2.  Whatever
    follows is returned as the path, with a '/' prepended when it does
    not already start with one.  When url does not start with '//' the
    result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # '#' is excluded from the host so that a fragment can never be
        # mistaken for part of the authority (e.g. '//host#@other/').
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url

944

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

945

def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'; without one the user part is None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

949

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

950

def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; without one the password is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

954

955

# splittag('/path#tag') --> '/path', 'tag'

956

# Compiled on first use by splitport().
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  When the text
    after the last ':' is empty, the colon is dropped and the port is
    None.  When there is no ':' or the text after it is not made only of
    digits, host is returned unchanged with a port of None.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() rather than match() with a trailing '$': '$' also matches
    # just before a final newline, which let 'host:80\n' pass as port 80.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None

969

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

970

def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found or if nothing follows the
    last ':'; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    A valid number consists of the ASCII digits 0-9 only, the same rule
    splitport() applies; signs, whitespace, underscores and non-ASCII
    digits are rejected even though int() would accept them."""
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() left the whole input in the last slot.
        host = port
    elif port:
        nport = None
        if all(ch in '0123456789' for ch in port):
            try:
                nport = int(port)
            except ValueError:
                # int() can still refuse a digit string (e.g. one longer
                # than the interpreter's conversion limit).
                nport = None
        return host, nport
    return host, defport

985

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

986

def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; without one the query is None.
    """
    if '?' not in url:
        return url, None
    path, _, query = url.rpartition('?')
    return path, query

992

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

993

def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; without one the tag is None.
    """
    if '#' not in url:
        return url, None
    path, _, tag = url.rpartition('#')
    return path, tag

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # Everything before the first ';' is the path, the rest are attributes.
    path, *attrs = url.split(';')
    return path, attrs

1005

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

1006

def splitvalue(attr):

1007

"""splitvalue('attr=value') --> 'attr', 'value'."""

Serhiy Storchaka