Blame - Lib/encodings/idna.py - platform/external/python/cpython3

2003-04-18 10:39:54 +0000

[diff] [blame]

1

# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

2

Martin v. Löwis

480f1bb

2006-03-09 23:38:20 +0000

[diff] [blame]

3

import stringprep, re, codecs

Martin v. Löwis

5bd7c02

2006-03-10 11:20:04 +0000

[diff] [blame]

4

from unicodedata import ucd_3_2_0 as unicodedata

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

5

6

# IDNA section 3.1
# Label separators: besides U+002E (full stop) the pattern also matches
# U+3002, U+FF0E and U+FF61, so each of them splits a name into labels.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# The ACE prefix is kept in two flavours because the code compares it
# against both bytes objects (ace_prefix) and str objects (sace_prefix).
ace_prefix = b"xn--"
sace_prefix = "xn--"

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

12

13

# This assumes query strings, so AllowUnassigned is true

def nameprep(label):
    """Prepare *label* with the Nameprep steps: map, normalize, prohibit, bidi.

    Unassigned code points are not rejected (AllowUnassigned is true), which
    is the setting intended for query strings.

    Returns the prepared string.  Raises UnicodeError if the label contains
    a prohibited character or violates one of the bidi requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything else
    # goes through the table B.2 mapping.
    label = "".join([
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    ])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # Run the checks once per label.  Repeating them for every RandALCat
    # character made this step quadratic in the label length.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label

def ToASCII(label):
    """Convert a single label (str) to its ASCII form and return it as bytes.

    A label that is already pure ASCII is returned unchanged.  Otherwise it
    is passed through nameprep() and, if still not ASCII, punycode-encoded
    and given the ACE prefix.  UseSTD3ASCIIRules is treated as false.

    Raises UnicodeError if the result is empty or longer than 63 bytes, or
    if a non-ASCII label already starts with the ACE prefix after nameprep.
    Errors raised by nameprep() propagate unchanged.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None

    if encoded is None:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            encoded = label.encode("ascii")
        except UnicodeError:
            encoded = None

    if encoded is None:
        # Step 5: Check ACE prefix
        if label.startswith(sace_prefix):
            raise UnicodeError("Label starts with ACE prefix")

        # Step 6: Encode with PUNYCODE
        # Step 7: Prepend ACE prefix
        encoded = ace_prefix + label.encode("punycode")

    # Step 8: Check size (shared by every path above)
    if 0 < len(encoded) < 64:
        return encoded
    raise UnicodeError("label empty or too long")

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

104

105

def ToUnicode(label):
    """Convert a single label (bytes or str) to its Unicode form.

    Labels without the ACE prefix are returned as an ASCII-decoded str.
    Labels with the prefix are punycode-decoded, and the decoded result is
    accepted only if ToASCII() maps it back to the lower-cased input.

    Raises UnicodeError if the label is longer than 1024 characters, still
    contains non-ASCII characters after nameprep, or does not round-trip.
    Errors from nameprep(), the punycode codec and ToASCII() propagate.
    """
    if len(label) > 1024:
        # Guard against spending time on absurdly long input.  No valid
        # label comes close to this size (ToASCII enforces a 63-byte limit),
        # and the margin leaves room for characters that nameprep() removes.
        raise UnicodeError("label way too long")

    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False

    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result

Tim Peters

2003-04-24 16:02:54 +0000

[diff] [blame]

143

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

144

### Codec APIs

145

146

class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the name *input* (str) and return ``(bytes, len(input))``.

        Only ``errors='strict'`` is supported.  Raises UnicodeError for any
        other error handler and for labels that are empty or too long; a
        single trailing separator is preserved.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty (trailing dot) but not too long.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        # Join with U+002E
        encoded = b'.'.join([ToASCII(label) for label in labels])
        return encoded + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode the name *input* and return ``(str, len(input))``.

        Only ``errors='strict'`` is supported; any other error handler
        raises UnicodeError.  A single trailing dot is preserved.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            # (bytes() is called without an encoding, so a str argument is
            # rejected here rather than decoded.)
            input = bytes(input)

        if ace_prefix not in input:
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        decoded = ".".join([ToUnicode(label) for label in labels])
        return decoded+trailing_dot, len(input)

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

217

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

218

class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels in *input*.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the number
        of characters of *input* that were encoded.  Unless *final* is true,
        a last label not yet followed by a separator is left unconsumed.
        Raises UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        # Join with U+002E
        result = b'.'.join([ToASCII(label) for label in labels]) + trailing_dot
        # Consumed input: every label, one separator character between each
        # pair of labels, plus the trailing separator if one was emitted.
        size = sum(len(label) for label in labels)
        size += max(len(labels) - 1, 0)
        size += len(trailing_dot)
        return (result, size)

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

252

253

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels in *input*.

        Returns ``(decoded_str, consumed)`` where *consumed* is the number
        of items of *input* that were decoded.  Unless *final* is true, a
        last label not yet followed by a separator is left unconsumed.
        Raises UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = ".".join([ToUnicode(label) for label in labels]) + trailing_dot
        # Consumed input: every label, one separator between each pair of
        # labels, plus the trailing separator if one was emitted.  Counting
        # separators from the number of labels stays correct when a leading
        # label is empty, which a running-total check would miss.
        size = sum(len(label) for label in labels)
        size += max(len(labels) - 1, 0)
        size += len(trailing_dot)
        return (result, size)

Thomas Wouters

a977329

2006-04-21 09:43:23 +0000

[diff] [blame]

291

Martin v. Löwis