Blame - Lib/urllib/parse.py - platform/external/python/cpython3

2008-06-18 20:49:58 +0000

[diff] [blame]

1

"""Parse (absolute and relative) URLs.

2

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

3

urlparse module is based upon the following RFC specifications.

4

5

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding

6

and L. Masinter, January 2005.

7

8

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter

9

and L.Masinter, December 1999.

10

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

11

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

12

Berners-Lee, R. Fielding, and L. Masinter, August 1998.

13

David Malcolm

ee25568

2010-12-02 16:41:00 +0000

[diff] [blame]

14

RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

15

16

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June

17

1995.

18

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

19

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

20

McCahill, December 1994

21

Benjamin Peterson

d7c3ed5

2010-06-27 22:32:30 +0000

[diff] [blame]

22

RFC 3986 is considered the current standard and any future changes to

23

urlparse module should conform with it. The urlparse module is

24

currently not entirely compliant with this RFC due to de facto

25

scenarios for parsing, and for backward compatibility purposes, some

26

parsing quirks from older RFCs are retained. The testcases in

Senthil Kumaran

2010-04-17 14:44:14 +0000

[diff] [blame]

27

test_urlparse.py provide a good indicator of parsing behavior.

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

28

"""

29

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

30

import sys

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

31

import collections

Facundo Batista

2ac5de2

2008-07-07 18:24:11 +0000

[diff] [blame]

32

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

33

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",

Senthil Kumaran

0256b2a

2010-10-25 16:36:20 +0000

[diff] [blame]

34

"urlsplit", "urlunsplit", "urlencode", "parse_qs",

35

"parse_qsl", "quote", "quote_plus", "quote_from_bytes",

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

36

"unquote", "unquote_plus", "unquote_to_bytes"]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

37

38

# A classification of schemes ('' means apply by default)

39

uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh',
]
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

50

51

# Characters valid in scheme names

52

scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789' '+-.')

# XXX: Consider replacing with functools.lru_cache
# Upper bound on the number of urlsplit() results kept in _parse_cache.
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Empty both module-level caches: parsed URLs and safe quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

65

66

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

67

# Helpers for bytes handling

68

# For 3.2, we deliberately require applications that

69

# handle improperly quoted URLs to do their own

70

# decoding and encoding. If valid use cases are

71

# presented, we may relax this by using latin-1

72

# decoding internally for 3.3

73

_implicit_encoding = 'ascii'

74

_implicit_errors = 'strict'

def _noop(obj):

return obj

def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Return obj.encode(encoding, errors); defaults are ASCII / strict."""
    encoded = obj.encode(encoding, errors)
    return encoded

82

83

def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode each item of args to str; falsy items become ''.

    Returns a tuple with one entry per input item.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)

86

87

def _coerce_args(*args):
    """Coerce the arguments to str and select the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, _encode_result otherwise.

    Raises TypeError when a non-empty argument's str-ness differs from
    that of the first argument.
    """
    first_is_str = isinstance(args[0], str)
    # Empty values are exempt from the check so that defaults such as
    # scheme='' keep working whichever type the caller passes.
    mixed = any(arg and isinstance(arg, str) != first_is_str
                for arg in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)

102

103

# Result objects are more helpful than simple tuples

104

class _ResultMixinStr(object):

105

"""Standard approach to encoding parsed results from str to bytes"""

106

__slots__ = ()

107

108

def encode(self, encoding='ascii', errors='strict'):

109

return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))

110

111

112

class _ResultMixinBytes(object):

113

"""Standard approach to decoding parsed results from bytes to str"""

114

__slots__ = ()

115

116

def decode(self, encoding='ascii', errors='strict'):

117

return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))

118

119

120

class _NetlocResultMixinBase(object):

121

"""Shared methods for the parsed result objects containing a netloc element"""

122

__slots__ = ()

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

123

124

@property

125

def username(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

126

return self._userinfo[0]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

127

128

@property

129

def password(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

130

return self._userinfo[1]

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

131

132

@property

133

def hostname(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

134

hostname = self._hostinfo[0]

135

if not hostname:

136

hostname = None

137

elif hostname is not None:

138

hostname = hostname.lower()

139

return hostname

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

140

141

@property

142

def port(self):

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

143

port = self._hostinfo[1]

144

if port is not None:

145

port = int(port, 10)

Senthil Kumaran

2fc5a50

2012-05-24 21:56:17 +0800

[diff] [blame]

146

# Return None on an illegal port

147

if not ( 0 <= port <= 65535):

148

return None

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

return port

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc splitting helpers for results whose fields are str."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        port is the text after ':' or None when there is no ':'.
        """
        _, _, hostport = self.netloc.rpartition('@')
        _, open_bracket, after_bracket = hostport.partition('[')
        if open_bracket:
            # Bracketed host: take what lies between '[' and ']'.
            hostname, _, trailer = after_bracket.partition(']')
            _, colon, port = trailer.partition(':')
        else:
            hostname, colon, port = hostport.partition(':')
        return hostname, (port if colon else None)

180

181

182

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc splitting helpers for results whose fields are bytes."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        if not colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        port is the bytes after b':' or None when there is no b':'.
        """
        _, _, hostport = self.netloc.rpartition(b'@')
        _, open_bracket, after_bracket = hostport.partition(b'[')
        if open_bracket:
            # Bracketed host: take what lies between b'[' and b']'.
            hostname, _, trailer = after_bracket.partition(b']')
            _, colon, port = trailer.partition(b':')
        else:
            hostname, colon, port = hostport.partition(b':')
        return hostname, (port if colon else None)

210

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

211

212

from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', ['url', 'fragment'])
_SplitResultBase = namedtuple(
    'SplitResult', ['scheme', 'netloc', 'path', 'query', 'fragment'])
_ParseResultBase = namedtuple(
    'ParseResult',
    ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'])

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

222

223

# Structured result objects for string data

224

class DefragResult(_DefragResultBase, _ResultMixinStr):
    """(url, fragment) result with str fields."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by '#' and the fragment when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

231

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

232

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, query, fragment) result with str fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt by urlunsplit() from this result."""
        rebuilt = urlunsplit(self)
        return rebuilt

236

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

237

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """(scheme, netloc, path, params, query, fragment) result with str fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt by urlunparse() from this result."""
        rebuilt = urlunparse(self)
        return rebuilt

241

Nick Coghlan

2010-11-30 15:48:08 +0000

[diff] [blame]

242

# Structured result objects for bytes data

243

class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """(url, fragment) result with bytes fields."""
    __slots__ = ()

    def geturl(self):
        """Return url, followed by b'#' and the fragment when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, query, fragment) result with bytes fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt by urlunsplit() from this result."""
        rebuilt = urlunsplit(self)
        return rebuilt

255

256

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """(scheme, netloc, path, params, query, fragment) result with bytes fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt by urlunparse() from this result."""
        rebuilt = urlunparse(self)
        return rebuilt

260

261

# Set up the encode/decode result pairs

262

def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions."""
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

274

275

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The splitting itself is done by urlsplit(); params are separated from
    the path only when the scheme is listed in uses_params.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

290

291

def _splitparams(url):

292

if '/' in url:

293

i = url.find(';', url.rfind('/'))

if i < 0:

return url, ''

else:

i = url.find(';')

return url[:i], url[i+1:]

299

300

def _splitnetloc(url, start=0):

301

delim = len(url) # position of end of domain part of url, default is end

302

for c in '/?#': # look for delimiters; the order is NOT important

303

wdelim = url.find(c, start) # find first of this delim

304

if wdelim >= 0: # if found

305

delim = min(delim, wdelim) # use earliest delim position

306

return url[start:delim], url[delim:] # return (domain, rest)

307

308

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are memoised in _parse_cache; the caches are cleared once
    MAX_CACHE_SIZE entries have accumulated.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of '[' and ']'.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        candidate = url[:colon]
        remainder = url[colon + 1:]
        if candidate == 'http':
            # Common case: accepted as the scheme without the port-number
            # check applied to other candidates below.
            scheme, url = candidate.lower(), remainder
        elif all(ch in scheme_chars for ch in candidate):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                # not a port number
                scheme, url = candidate.lower(), remainder

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means a broken literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

363

364

def urlunparse(components):
    """Put a parsed URL back together again.

    components is a six-item iterable (scheme, netloc, path, params,
    query, fragment).  This may result in a slightly different, but
    equivalent URL, if the URL that was parsed originally had redundant
    delimiters, e.g. a ? with an empty query (the draft states that these
    are equivalent).
    """
    *fields, _coerce_result = _coerce_args(*components)
    scheme, netloc, path, params, query, fragment = fields
    if params:
        # Params are re-attached to the path before delegating.
        path = "%s;%s" % (path, params)
    joined = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(joined)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

374

375

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.

    The components argument can be any five-item iterable (scheme, netloc,
    path, query, fragment).  This may result in a slightly different, but
    equivalent URL, if the URL that was parsed originally had unnecessary
    delimiters (for example, a ? with an empty query; the RFC states that
    these are equivalent).
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

393

394

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty base returns url unchanged and an empty url returns base
    unchanged.  url is also returned as is when its scheme differs from
    the base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    def rebuild(final_path):
        # Reads the enclosing variables at call time, i.e. after the
        # fallbacks to the base components below have been applied.
        return _coerce_result(urlunparse((scheme, netloc, final_path,
                                          params, query, fragment)))

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return rebuild(path)
        netloc = bnetloc
    if path[:1] == '/':
        return rebuild(path)
    if not path and not params:
        # Nothing but query/fragment given: reuse the base path and params.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return rebuild(path)

    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first "<name>/.." pair found strictly before the
    # last segment, until no such pair remains.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and segments[idx - 1] not in ('', '..'):
                del segments[idx - 1:idx + 1]
                collapsed = True
                break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return rebuild('/'.join(segments))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

446

447

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a DefragResult (a 2-tuple) of the defragmented URL and the
    fragment.  If the URL contained no fragments, the second element is
    the empty string and the URL is returned as given.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(defrag, frag))

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

462

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

463

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    string may be a str or a bytes object.  A '%' is treated as an escape
    only when it is followed by exactly two hexadecimal digits; any other
    '%' is copied to the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    hexdigits = b'0123456789ABCDEFabcdef'
    out = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: int() would also accept a sign,
        # surrounding whitespace or a single digit.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            out.append(bytes([int(code, 16)]))
            out.append(item[2:])
        else:
            out.append(b'%')
            out.append(item)
    return b''.join(out)

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

483

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

484

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    A '%' is treated as an escape only when it is followed by exactly two
    hexadecimal digits; any other '%' is kept literally.

    unquote('abc%20def') -> 'abc def'.
    """
    if string == '':
        return string
    pieces = string.split('%')
    if len(pieces) == 1:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    hexdigits = '0123456789ABCDEFabcdef'
    out = [pieces[0]]
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    # only once complete so that multi-byte characters stay intact.
    pct_sequence = bytearray()
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: bytes.fromhex() skips whitespace, so
        # '%  ' would otherwise decode to nothing and vanish from the output.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            pct_sequence.append(int(code, 16))
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                continue
        else:
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        out.append(pct_sequence.decode(encoding, errors))
        out.append(rest)
        pct_sequence = bytearray()
    if pct_sequence:
        # Flush the final pct_sequence
        out.append(pct_sequence.decode(encoding, errors))
    return ''.join(out)

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

527

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

528

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dict mapping each name to the list of its values, in the
        order parse_qsl() produced them.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors)
    for name, value in pairs:
        parsed_result.setdefault(name, []).append(value)
    return parsed_result

Victor Stinner

2011-01-14 12:52:12 +0000

[diff] [blame]

560

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) pairs.  Fields are separated by '&'
    or ';'.
    """
    qs, _coerce_result = _coerce_args(qs)

    def decode(text):
        # '+' stands for a space; the percent escapes are expanded next.
        text = unquote(text.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
            if raw_value or keep_blank_values:
                name = decode(raw_name)
                value = decode(raw_value)
                result.append((name, value))
    return result

607

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

608

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Pluses are replaced first, so an escaped plus (%2B) stays a plus.
    spaced = string.replace('+', ' ')
    return unquote(spaced, encoding, errors)

616

617

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

618

b'abcdefghijklmnopqrstuvwxyz'

619

b'0123456789'

620

b'_.-')

Florent Xicluna

2010-05-17 17:33:07 +0000

[diff] [blame]

621

_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)

622

_safe_quoters = {}

Guido van Rossum

2008-08-06 19:31:34 +0000

[diff] [blame]

623

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

624

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        cached = dict(self)
        return "<Quoter %r>" % cached

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors may only be given when string is a str; passing either of
    them together with a bytes string raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes-like: there is nothing to encode, so the text
        # encoding options make no sense here.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        # Empty str: hand the very same object back.
        return string
    raw = string.encode('utf-8' if encoding is None else encoding,
                        'strict' if errors is None else errors)
    return quote_from_bytes(raw, safe)

689

690

def quote_plus(string, safe='', encoding=None, errors=None):
    """Variant of quote() for HTML form values: a space is written as '+'
    instead of '%20'.  A literal plus sign in the input is escaped unless
    safe contains it.  Unlike quote(), safe does not default to '/'.
    """
    # Work out whether the input (str or bytes) is known to be free of
    # spaces; in that case plain quote() already gives the right answer.
    if isinstance(string, str):
        space_free = ' ' not in string
    elif isinstance(string, bytes):
        space_free = b' ' not in string
    else:
        space_free = False
    if space_free:
        return quote(string, safe, encoding, errors)

    # Let spaces pass through quote() untouched (by declaring them safe),
    # then swap them for plus signs.  The space literal has to match the
    # type of 'safe' so that the concatenation works.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')

Guido van Rossum

2008-08-18 21:44:30 +0000

[diff] [blame]

706

707

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    bs must be a bytes or bytearray object, otherwise TypeError is raised.
    safe may be a str or a bytes object; non-ASCII characters (or byte
    values >= 128) in it are ignored.

    Percent-escapes are written with upper-case hexadecimal digits:

    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: rstrip() removes every trailing byte found in the given
    # set, so an empty result means that all of bs is safe as it stands.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One quoting callable is kept per distinct 'safe' value.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])

Jeremy Hylton

2008-06-18 20:49:58 +0000

[diff] [blame]

728

Senthil Kumaran

2010-07-03 17:48:22 +0000

[diff] [blame]

729

def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values may be str or bytes objects; any other object is
    converted with str() first.  bytes components are passed to quote_plus()
    together with safe only, whereas str components are passed together
    with safe, encoding and errors.

    Raises TypeError if query is neither a mapping (an object with an
    items() method) nor a non-string sequence of tuples.
    """

    def quote_item(item):
        # quote() refuses encoding/errors for bytes input, so bytes are
        # quoted with 'safe' only; everything else goes through str().
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    expand_sequences = bool(doseq)
    pairs = []
    for k, v in query:
        k = quote_item(k)
        if not expand_sequences or isinstance(v, bytes):
            pairs.append(k + '=' + quote_item(v))
        elif isinstance(v, str):
            # A str is a sequence too, but must stay a single value.
            pairs.append(k + '=' + quote_plus(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_item(v))
            else:
                # loop over the sequence: one "key=element" per element
                for elt in v:
                    pairs.append(k + '=' + quote_item(elt))
    return '&'.join(pairs)

806

807

# Utilities to parse URLs (most of these return None for missing parts):

808

# unwrap('<URL:type://host/path>') --> 'type://host/path'

809

# splittype('type:opaquestring') --> 'type', 'opaquestring'

810

# splithost('//host[:port]/path') --> 'host[:port]', '/path'

811

# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'

812

# splitpasswd('user:passwd') -> 'user', 'passwd'

813

# splitport('host:port') --> 'host', 'port'

814

# splitquery('/path?query') --> '/path', 'query'

815

# splittag('/path#tag') --> '/path', 'tag'

816

# splitattr('/path;attr1=value1;attr2=value2;...') ->

817

# '/path', ['attr1=value1', 'attr2=value2', ...]

818

# splitvalue('attr=value') --> 'attr', 'value'

819

# urllib.parse.unquote('abc%20def') -> 'abc def'

820

# quote('abc def') -> 'abc%20def'

821

Georg Brandl

13e8946

2008-07-01 19:56:00 +0000

[diff] [blame]

822

def to_bytes(url):
    """Check that a str URL is pure ASCII and return it.

    A str argument is round-tripped through the ASCII codec; a
    UnicodeError is raised if it contains non-ASCII characters.  Any
    argument that is not a str is returned untouched.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' prefix are
    then removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

# Compiled scheme pattern; created on the first splittype() call.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned in lower case.  When url does not start with
    a 'scheme:' prefix the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # The match covers the scheme plus its ':' and starts at index 0, so
    # its end offset is where the remainder of the URL begins.
    return found.group(1).lower(), url[found.end():]

# Compiled '//host...' pattern; created on the first splithost() call.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    A non-empty remainder that does not begin with '/' (for example a
    bare '?query') gets a '/' prepended.  When url does not match the
    '//host...' form the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, path = found.groups()
    if path and path[0] != '/':
        path = '/' + path
    return host_port, path

# Compiled 'user@host' pattern; created on the first splituser() call.
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  When host contains no '@' the
    result is (None, host).
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.groups()

_passwdprog = None

def splitpasswd(user):

888

"""splitpasswd('user:passwd') -> 'user', 'passwd'."""

889

global _passwdprog

890

if _passwdprog is None:

891

import re

Senthil Kumaran

eaaec27

2009-03-30 21:54:41 +0000

[diff] [blame]

892

_passwdprog = re.compile('^([^:]*):(.*)$',re.S)

Jeremy Hylton