Blame - Lib/urllib/parse.py - platform/external/python/cpython3

2008-06-18 20:49:58 +0000

[diff] [blame]

1

"""Parse (absolute and relative) URLs.

2

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

3

urlparse module is based upon the following RFC specifications.

4

5

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding

6

and L. Masinter, January 2005.

7

8

RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter

9

and L.Masinter, December 1999.

10

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

11

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

12

Berners-Lee, R. Fielding, and L. Masinter, August 1998.

13

David Malcolm

ee25568

2010-12-02 16:41:00 +0000

[diff] [blame]

14

RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

15

16

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June

17

1995.

18

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

19

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

20

McCahill, December 1994

21

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

22

RFC 3986 is considered the current standard and any future changes to

23

urlparse module should conform with it. The urlparse module is

24

currently not entirely compliant with this RFC due to de facto

25

scenarios for parsing, and for backward compatibility purposes, some

26

parsing quirks from older RFCs are retained. The testcases in

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

27

test_urlparse.py provide a good indicator of parsing behavior.

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

28

"""

29

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

30

import sys

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

31

import collections

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

32

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

33

# Public names of this module (what "from urllib.parse import *" exposes).
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

37

38

# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes consulted by urlunsplit() and urljoin() when handling the netloc.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# (urlsplit() only accepts a "scheme:" prefix made up entirely of these).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

67

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Empty the parse cache and the quoters cache."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

75

76

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

77

# Helpers for bytes handling

78

# For 3.2, we deliberately require applications that

79

# handle improperly quoted URLs to do their own

80

# decoding and encoding. If valid use cases are

81

# presented, we may relax this by using latin-1

82

# decoding internally for 3.3

83

_implicit_encoding = 'ascii'

84

_implicit_errors = 'strict'

def _noop(obj):

return obj

def _encode_result(obj, encoding=_implicit_encoding,

90

errors=_implicit_errors):

91

return obj.encode(encoding, errors)

92

93

def _decode_args(args, encoding=_implicit_encoding,

94

errors=_implicit_errors):

95

return tuple(x.decode(encoding, errors) if x else '' for x in args)

96

97

def _coerce_args(*args):

98

# Invokes decode if necessary to create str args

99

# and returns the coerced inputs along with

100

# an appropriate result coercion function

101

# - noop for str inputs

102

# - encoding function otherwise

103

str_input = isinstance(args[0], str)

104

for arg in args[1:]:

105

# We special-case the empty string to support the

106

# "scheme=''" default argument to some functions

107

if arg and isinstance(arg, str) != str_input:

108

raise TypeError("Cannot mix str and non-str arguments")

109

if str_input:

110

return args + (_noop,)

111

return _decode_args(args) + (_encode_result,)

112

113

# Result objects are more helpful than simple tuples

114

class _ResultMixinStr(object):

115

"""Standard approach to encoding parsed results from str to bytes"""

116

__slots__ = ()

117

118

def encode(self, encoding='ascii', errors='strict'):

119

return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))

120

121

122

class _ResultMixinBytes(object):

123

"""Standard approach to decoding parsed results from bytes to str"""

124

__slots__ = ()

125

126

def decode(self, encoding='ascii', errors='strict'):

127

return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))

128

129

130

class _NetlocResultMixinBase(object):

131

"""Shared methods for the parsed result objects containing a netloc element"""

132

__slots__ = ()

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

133

134

@property

135

def username(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

136

return self._userinfo[0]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

137

138

@property

139

def password(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

140

return self._userinfo[1]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

141

142

@property

143

def hostname(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

144

hostname = self._hostinfo[0]

145

if not hostname:

146

hostname = None

147

elif hostname is not None:

148

hostname = hostname.lower()

149

return hostname

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

150

151

@property

152

def port(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

153

port = self._hostinfo[1]

154

if port is not None:

155

port = int(port, 10)

Senthil Kumaran

2fc5a50

2012-05-24 21:56:17 +0800

[diff] [blame]

156

# Return None on an illegal port

157

if not ( 0 <= port <= 65535):

158

return None

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

return port

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc splitting for str results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host is returned without its brackets; port is None
        when no ':' follows the host.
        """
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, inner = host_port.partition('[')
        if open_bracket:
            host, _, tail = inner.partition(']')
            _, colon, port = tail.partition(':')
        else:
            host, colon, port = host_port.partition(':')
        return host, (port if colon else None)

190

191

192

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc splitting for bytes results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host is returned without its brackets; port is None
        when no b':' follows the host.
        """
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, inner = host_port.partition(b'[')
        if open_bracket:
            host, _, tail = inner.partition(b']')
            _, colon, port = tail.partition(b':')
        else:
            host, colon, port = host_port.partition(b':')
        return host, (port if colon else None)

220

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

221

222

from collections import namedtuple

223

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

224

# Plain namedtuple bases; the result classes defined after this combine
# them with the encode/decode and netloc mixins.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

232

233

# Structured result objects for string data

234

class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str (url, fragment) pair as built by urldefrag()."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by '#' and the fragment when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

241

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

242

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str 5-tuple (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunsplit()."""
        return urlunsplit(self)

246

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

247

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str 6-tuple (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunparse()."""
        return urlunparse(self)

251

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

252

# Structured result objects for bytes data

253

class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes (url, fragment) pair; the bytes twin of DefragResult."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by b'#' and the fragment when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes 5-tuple (scheme, netloc, path, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunsplit()."""
        return urlunsplit(self)

265

266

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes 6-tuple (scheme, netloc, path, params, query, fragment)."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the URL by passing this tuple to urlunparse()."""
        return urlunparse(self)

270

271

# Set up the encode/decode result pairs

272

def _fix_result_transcoding():
    """Point each str result class at its bytes twin and vice versa."""
    for str_cls, bytes_cls in (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    ):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls


# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

284

285

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ";params" split off the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

300

301

def _splitparams(url):

302

if '/' in url:

303

i = url.find(';', url.rfind('/'))

if i < 0:

return url, ''

else:

i = url.find(';')

return url[:i], url[i+1:]

309

310

def _splitnetloc(url, start=0):

311

delim = len(url) # position of end of domain part of url, default is end

312

for c in '/?#': # look for delimiters; the order is NOT important

313

wdelim = url.find(c, start) # find first of this delim

314

if wdelim >= 0: # if found

315

delim = min(delim, wdelim) # use earliest delim position

316

return url[start:delim], url[delim:] # return (domain, rest)

317

318

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        candidate = url[:colon]
        if candidate == 'http':  # optimize the common case
            scheme, url = candidate.lower(), url[colon+1:]
        elif all(ch in scheme_chars for ch in candidate):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            remainder = url[colon+1:]
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                # not a port number
                scheme, url = candidate.lower(), remainder
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one kind of bracket present means an unbalanced literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

373

374

def urlunparse(components):
    """Rebuild a URL from a 6-item (scheme, netloc, path, params, query,
    fragment) iterable.

    The result may differ slightly from, yet be equivalent to, the URL that
    was parsed when that URL had redundant delimiters, e.g. a ? with an
    empty query (the draft states that these are equivalent)."""
    (scheme, netloc, path, params,
     query, fragment, _coerce_result) = _coerce_args(*components)
    if params:
        # Re-attach the parameters to the path before delegating.
        path = "%s;%s" % (path, params)
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

384

385

def urlunsplit(components):
    """Rebuild a complete URL from the five items (scheme, netloc, path,
    query, fragment) produced by urlsplit().  Any five-item iterable works.

    The result may differ slightly from, yet be equivalent to, the URL that
    was parsed when that URL had unnecessary delimiters (for example, a ?
    with an empty query; the RFC states that these are equivalent)."""
    (scheme, netloc, url,
     query, fragment, _coerce_result) = _coerce_args(*components)
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        # A non-empty path that follows an authority must start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

403

404

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty base returns url unchanged and an empty url returns base
    unchanged.  allow_fragments is passed through to urlparse() for both
    arguments."""
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one not listed in uses_relative: return the
    # reference as given.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        # A reference carrying its own netloc only borrows the scheme.
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc
    # Path starting with '/': use it as is, with the netloc chosen above.
    if path[:1] == '/':
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # No path and no params: reuse the base's, and its query too unless the
    # reference supplies one.
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # Drop the last base segment, then append the reference's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete the first "name/.." pair found strictly inside the
    # list; stop once a full scan deletes nothing (the inner while's else).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' left in last position is handled separately because the scan
    # above never examines the final segment.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
                                      params, query, fragment)))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

456

457

def urldefrag(url):
    """Split any fragment off a URL.

    Returns a tuple of the defragmented URL and the fragment.  When the URL
    has no '#', the URL comes back unchanged and the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        stripped, fragment = url, ''
    else:
        scheme, netloc, path, params, query, fragment = urlparse(url)
        stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

472

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

473

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string may be a str or a bytes-like object.  A '%' that is not followed
    by exactly two hexadecimal digits is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?  (Raises AttributeError otherwise.)
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    hexdigits = b'0123456789abcdefABCDEF'
    out = [pieces[0]]
    for item in pieces[1:]:
        pair = item[:2]
        # Check the digits explicitly: int(x, 16) alone would also accept a
        # sign, surrounding whitespace or a single digit ('%+f', '% f',
        # '%a'), none of which is a percent escape.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            out.append(bytes([int(pair, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    # Join once instead of growing a bytes object inside the loop.
    return b''.join(out)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

493

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

494

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' that is not followed by exactly two hexadecimal digits is left in
    the result unchanged.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    pieces = string.split('%')
    if len(pieces) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789abcdefABCDEF'
    out = [pieces[0]]
    # pending: contiguous run of percent-encoded bytes not yet decoded
    pending = bytearray()
    for item in pieces[1:]:
        pair = item[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace, so
        # relying on it would silently swallow input such as '%  '.
        if len(pair) == 2 and pair[0] in hexdigits and pair[1] in hexdigits:
            pending.append(int(pair, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the pending run.
        out.append(pending.decode(encoding, errors))
        out.append(rest)
        pending = bytearray()
    if pending:
        # Flush the final run
        out.append(pending.decode(encoding, errors))
    return ''.join(out)

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

537

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

538

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        # Values of a repeated name are collected in order of appearance.
        parsed_result.setdefault(name, []).append(value)
    return parsed_result

Facundo Batista

2008-09-03 22:49:01 +0000

[diff] [blame]

569

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

570

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.  A
            true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a list of (name, value) tuples in order of appearance.
    """
    qs, _coerce_result = _coerce_args(qs)
    # Fields are separated by '&' or ';'.
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))
    result = []
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
            # partition() already left value as ''.
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = unquote(value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            value = _coerce_result(value)
            result.append((name, value))
    return result

617

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

618

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but first turn every plus sign into a space, as
    needed when decoding HTML form values.

    The plus-to-space substitution is applied to the raw input; the result
    is then handed to unquote() together with encoding and errors.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)

626

627

# Byte values that are never percent-escaped: ASCII letters, digits, '_.-'.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789'
    b'_.-'
)
# The same byte values as a bytes object (used as a strip character set).
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of quoting callables, keyed by the normalized 'safe' bytes.
_safe_quoters = {}

Guido van Rossum

2008-08-06 19:31:34 +0000

[diff] [blame]

633

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

634

class Quoter(collections.defaultdict):
    """Lazily filled table mapping a byte value to its quoted string.

    A byte that belongs to the instance's safe set maps to the matching
    single character; any other byte maps to its '%XX' escape, written
    with two upper-case hexadecimal digits.
    """
    # Built on defaultdict so each byte is computed at most once:
    # __missing__ stores its answer, and later lookups of that key are
    # answered by the dict itself without running any Python code.

    def __init__(self, safe):
        """safe: bytes object of extra byte values to leave unescaped."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the cached entries instead of the inherited defaultdict repr.
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: work out the quoted form, remember it, hand it back.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Every component of a URL (path, query, ...) has its own set of
    reserved characters that have to be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some URL component, though not
    necessarily in all of them.

    By default this function is meant for quoting the path section of a
    URL, which is why '/' is left alone: in a typical path the slashes
    already present are acting as reserved separators.

    string and safe may each be a str or a bytes object.  encoding and
    errors may only be given when string is a str; passing either one
    together with a non-str string raises TypeError.

    For a str, encoding and errors say how to turn the text into bytes,
    exactly as str.encode accepts them.  When omitted they default to
    encoding='utf-8' and errors='strict' (so a character the codec cannot
    encode raises UnicodeEncodeError).  An empty str is returned as is.
    """
    if not isinstance(string, str):
        # Bytes-like input is quoted directly; there is nothing to encode.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    raw = string.encode(
        'utf-8' if encoding is None else encoding,
        'strict' if errors is None else errors,
    )
    return quote_from_bytes(raw, safe)

699

700

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but additionally turn each ' ' into '+', as needed
    when quoting HTML form values.  A plus sign already present in the
    input is escaped unless it is listed in safe.  Unlike quote(), safe
    does not default to '/'.
    """
    # string may be a str or a bytes object; when it holds no space at
    # all, plain quote() already gives the right answer.
    if isinstance(string, str):
        spaceless = ' ' not in string
    elif isinstance(string, bytes):
        spaceless = b' ' not in string
    else:
        spaceless = False
    if spaceless:
        return quote(string, safe, encoding, errors)

    # Let spaces pass through quote() untouched (by declaring them safe,
    # in whichever type safe uses), then swap them for plus signs.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

716

717

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes (or bytearray) object instead of a
    str and performs no string-to-bytes encoding.  Always returns a str.

    safe may be a str or a bytes-like object; non-ASCII entries in it are
    dropped.  Raises TypeError when bs is neither bytes nor bytearray.

    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(value for value in safe if value < 128)

    # Fast path: stripping the unescaped byte values leaves nothing, so
    # the input can be decoded as it stands.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One quoting callable per distinct 'safe' value is kept and reused.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

738

Senthil Kumaran

2010-07-03 17:48:22 +0000

[diff] [blame]

739

def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    query is either a mapping (anything with an items() method) or a
    sequence of (key, value) tuples.  With a sequence of tuples, the order
    of the parameters in the output matches the order in the input.

    If doseq is true, a value that is a sequence (other than str or bytes)
    is expanded: each of its elements becomes a separate 'key=element'
    parameter.  If doseq is false, every value is emitted as one parameter.

    Keys, values and sequence elements may be str or bytes.  bytes are
    passed to quote_plus() with safe only; anything else is converted with
    str() and passed to quote_plus() with safe, encoding and errors.

    Raises TypeError if query is neither a mapping nor a sequence of
    tuples (for instance a non-empty string); the triggering error is
    attached as the cause.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items should not work with len(), and a
            # non-empty string fails the tuple check.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types get here and succeed;
            # kept that way for consistency with empty dicts.
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def quote_part(part):
        # bytes are quoted as they are; everything else is stringified.
        if isinstance(part, bytes):
            return quote_plus(part, safe)
        return quote_plus(str(part), safe, encoding, errors)

    pairs = []
    for k, v in query:
        k = quote_part(k)
        if not doseq or isinstance(v, bytes):
            pairs.append(k + '=' + quote_part(v))
        elif isinstance(v, str):
            pairs.append(k + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # Not a sequence: emit it as a single value.
                pairs.append(k + '=' + quote_part(v))
            else:
                # One parameter per element of the sequence.
                for elt in v:
                    pairs.append(k + '=' + quote_part(elt))
    return '&'.join(pairs)

816

817

# Utilities to parse URLs (most of these return None for missing parts):

818

# unwrap('<URL:type://host/path>') --> 'type://host/path'

819

# splittype('type:opaquestring') --> 'type', 'opaquestring'

820

# splithost('//host[:port]/path') --> 'host[:port]', '/path'

821

# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'

822

# splitpasswd('user:passwd') -> 'user', 'passwd'

823

# splitport('host:port') --> 'host', 'port'

824

# splitquery('/path?query') --> '/path', 'query'

825

# splittag('/path#tag') --> '/path', 'tag'

826

# splitattr('/path;attr1=value1;attr2=value2;...') ->

827

# '/path', ['attr1=value1', 'attr2=value2', ...]

828

# splitvalue('attr=value') --> 'attr', 'value'

829

# urllib.parse.unquote('abc%20def') -> 'abc def'

830

# quote('abc def') -> 'abc%20def'

831

Georg Brandl

13e8946

2008-07-01 19:56:00 +0000

[diff] [blame]

832

def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    checked for being pure ASCII.  Any non-str argument is returned
    untouched.  Raises UnicodeError for a str holding non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and then a leading 'URL:' are
    each removed when present, stripping whitespace again after each.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

# Compiled lazily by splittype() on its first call.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url has no leading
    'type:' part (one or more characters other than '/' and ':' followed
    by a colon), the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match starts at 0 and covers the scheme plus its colon, so the
    # remainder begins where the match ends.
    return found.group(1).lower(), url[found.end():]

# Compiled lazily by splithost() on its first call.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  Whatever follows is
    returned as the path, with a '/' put in front when it does not already
    start with one (e.g. '//host?q' gives 'host', '/?q').  When url does
    not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        import re
        # '#' must end the host just like '/' and '?': otherwise
        # '//127.0.0.1#@evil.com/' yields a "host" of '127.0.0.1#@evil.com',
        # which a later splituser() call resolves to evil.com.
        # DOTALL lets the remainder contain newlines instead of making the
        # whole match fail.
        _hostprog = re.compile('^//([^/#?]*)(.*)$', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url

# Compiled lazily by splituser() on its first call.
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@', so the user part may itself contain
    '@'.  When there is no '@' at all, the result is (None, host).
    """
    global _userprog
    if _userprog is None:
        import re
        # DOTALL: without it '.' refuses newlines and an input containing
        # one is silently returned unsplit.
        _userprog = re.compile('^(.*)@(.*)$', re.DOTALL)

    match = _userprog.match(host)
    if match:
        return match.group(1, 2)
    return None, host

_passwdprog = None

def splitpasswd(user):

898

"""splitpasswd('user:passwd') -> 'user', 'passwd'."""

899

global _passwdprog

900

if _passwdprog is None:

901

import re

Senthil Kumaran

eaaec27

2009-03-30 21:54:41 +0000

[diff] [blame]

902

_passwdprog = re.compile('^([^:]*):(.*)$',re.S)

Jeremy Hylton