Blame - Lib/encodings/idna.py - platform/external/python/cpython2

2003-04-18 10:39:54 +0000

[diff] [blame]

1

# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

2

Martin v. Löwis

480f1bb

2006-03-09 23:38:20 +0000

[diff] [blame]

3

import stringprep, re, codecs

Martin v. Löwis

5bd7c02

2006-03-10 11:20:04 +0000

[diff] [blame]

4

from unicodedata import ucd_3_2_0 as unicodedata

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

5

6

# IDNA section 3.1

Guido van Rossum

2007-05-02 19:09:54 +0000

[diff] [blame]

7

# Matches any one of the four label separators: U+002E, U+3002, U+FF0E, U+FF61.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

8

9

# IDNA section 5

10

# Prefix that marks an ACE label; ToASCII prepends it to the Punycode output.
ace_prefix = "xn--"

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

11

12

# This assumes query strings, so AllowUnassigned is true

def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is mapped through stringprep tables B.1 and B.2, normalized
    to NFKC, checked against the prohibited-character tables and finally
    checked against the bidirectional-text requirements.  Unassigned code
    points are not rejected, i.e. the label is treated as a query string
    (AllowUnassigned is true).

    Returns the prepared label as a str.

    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map: characters in table B.1 map to nothing, the rest go through B.2.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    # A real list is required: it is both tested for truthiness and indexed
    # below, which a lazy map()/filter() object does not support.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label

def ToASCII(label):
    """Convert one label to its ASCII form (RFC 3490, ToASCII operation).

    UseSTD3ASCIIRules is treated as false.  A label that is already pure
    ASCII is returned unchanged (as bytes); otherwise it is passed through
    nameprep, Punycode-encoded and given the ACE prefix.

    Returns the encoded label as bytes.

    Raises UnicodeError if the resulting label is empty or longer than 63
    bytes, if a non-ASCII label already starts with the ACE prefix after
    nameprep, or if nameprep rejects the label.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix (label is still text at this point)
    if label.startswith(ace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    # The Punycode output is bytes while ace_prefix is text, so the prefix
    # has to be encoded before the two can be concatenated.
    label = ace_prefix.encode("ascii") + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

103

104

def ToUnicode(label):
    """Convert one label to its Unicode form (RFC 3490, ToUnicode operation).

    *label* may be bytes (taken to be ASCII) or str.  A label without the
    ACE prefix is returned as text unchanged; a label with the prefix is
    Punycode-decoded and verified by re-encoding it with ToASCII.

    Returns the decoded label as a str.

    Raises UnicodeError if the label is not ASCII after nameprep, or if the
    decoded label does not round-trip through ToASCII.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # From here on label is bytes, so the (text) ACE prefix must be
    # compared and stripped in its bytes form.
    prefix = ace_prefix.encode("ascii")

    # Step 3: Check for ACE prefix
    if not label.startswith(prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result

Tim Peters

2003-04-24 16:02:54 +0000

[diff] [blame]

142

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

143

### Codec APIs

144

145

class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the text name *input* label by label with ToASCII.

        Returns a ``(bytes, length_consumed)`` tuple.  A trailing label
        separator in the input is preserved as a trailing ``b'.'``.

        Raises UnicodeError if *errors* is not 'strict' or if any label
        is rejected by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b"", 0

        labels = dots.split(input)
        if labels and len(labels[-1]) == 0:
            # The name ended with a separator: keep it, drop the empty label.
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        result = [ToASCII(label) for label in labels]
        # Join with U+002E
        return b".".join(result)+trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode the name *input* label by label with ToUnicode.

        *input* may be text or a bytes-like object.  Returns a
        ``(str, length_consumed)`` tuple.  A trailing separator in the
        input is preserved as a trailing ``'.'``.

        Raises UnicodeError if *errors* is not 'strict' or if any label
        is rejected by ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            # Text may use any of the IDNA label separators.
            labels = dots.split(input)
        else:
            # Force to bytes; only the ASCII full stop can separate labels.
            input = bytes(input)
            labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = [ToUnicode(label) for label in labels]

        return ".".join(result)+trailing_dot, len(input)

Martin v. Löwis

2003-04-18 10:39:54 +0000

[diff] [blame]

194

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

195

class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Unless *final* is true, the last label is assumed to be possibly
        unfinished and is left unconsumed for the next call.

        Returns a ``(bytes, chars_consumed)`` tuple.

        Raises UnicodeError if *errors* is not 'strict' or if a label is
        rejected by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b"", 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator that preceded this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = b".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)

228

229

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        *input* may be text or ASCII bytes.  Unless *final* is true, the
        last label is assumed to be possibly unfinished and is left
        unconsumed for the next call.

        Returns a ``(str, length_consumed)`` tuple.

        Raises UnicodeError if *errors* is not 'strict', if bytes input is
        not ASCII, or if a label is rejected by ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            # Decoding to text keeps one character per input byte, so the
            # consumed size computed below is still valid for the caller.
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                # Account for the separator that preceded this label.
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)

Thomas Wouters

a977329

2006-04-21 09:43:23 +0000

[diff] [blame]

268

Martin v. Löwis