blob: f8a31d88736a83cd6342fb6e807793a79833f592 [file] [log] [blame]
# This module implements RFC 3490 (IDNA) and RFC 3491 (Nameprep).
2
3import stringprep, unicodedata, re, codecs
4
# IDNA section 3.1: besides the ASCII full stop (U+002E), U+3002, U+FF0E
# and U+FF61 are also recognized as label separators.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as a byte string and as a Unicode string.
ace_prefix = "xn--"
uace_prefix = ace_prefix.decode("ascii")
11
12# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile of stringprep (RFC 3491) to one label.

    The label is treated as a query string, so unassigned code points are
    allowed (AllowUnassigned is true).

    label -- Unicode string holding a single label.

    Returns the mapped and NFKC-normalized label.

    Raises UnicodeError if the result contains a prohibited character or
    violates one of the bidirectional-text requirements.
    """
    # Map: characters of table B.1 are mapped to nothing; every other
    # character goes through the table B.2 mapping.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi.  The requirements only apply when the label contains at
    # least one RandALCat character, and they concern the label as a whole,
    # so they are evaluated once rather than once per RandALCat character.
    randal_chars = [c for c in label if stringprep.in_table_d1(c)]
    if randal_chars:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked above.
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        lcat_chars = [c for c in label if stringprep.in_table_d2(c)]
        if lcat_chars:
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a RandALCat
        #    character MUST be the first character of the string, and a
        #    RandALCat character MUST be the last character of the string.
        if not (stringprep.in_table_d1(label[0]) and
                stringprep.in_table_d1(label[-1])):
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
60
def ToASCII(label):
    """Convert a single label to its ASCII form (IDNA ToASCII operation).

    UseSTD3ASCIIRules is treated as false, so steps 3 of the algorithm is
    a no-op.

    label -- the label to convert.

    Returns the label as an ASCII byte string of 1 to 63 characters; labels
    that are not plain ASCII after nameprep are Punycode-encoded and get
    the ACE prefix prepended.

    Raises UnicodeError if the label already starts with the ACE prefix
    while still needing Punycode, if the result is empty, or if the result
    is longer than 63 characters.  Errors raised by nameprep propagate.
    """
    # Step 1: try ASCII
    try:
        ascii_label = label.encode("ascii")
    except UnicodeError:
        ascii_label = None

    if ascii_label is None:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            ascii_label = label.encode("ascii")
        except UnicodeError:
            ascii_label = None

    if ascii_label is None:
        # Step 5: Check ACE prefix
        if label.startswith(uace_prefix):
            raise UnicodeError("Label starts with ACE prefix")

        # Step 6: Encode with PUNYCODE
        # Step 7: Prepend ACE prefix
        ascii_label = ace_prefix + label.encode("punycode")

    # Step 8: Check size.  All paths end here, so the limit is enforced in
    # one place, and an empty label is reported as empty instead of being
    # mislabelled as "too long".
    if not ascii_label:
        raise UnicodeError("label empty")
    if len(ascii_label) >= 64:
        raise UnicodeError("label too long")
    return ascii_label
103
def ToUnicode(label):
    """Convert a single label to Unicode (IDNA ToUnicode operation).

    label -- the label, either a byte string or a Unicode string.

    Returns a Unicode string.  A label without the ACE prefix is returned
    as its ASCII text; a label with the prefix is Punycode-decoded and the
    decoded text is returned after a round-trip check through ToASCII.

    Raises UnicodeError if the label is not ASCII even after nameprep, or
    if re-encoding the decoded text does not give back the input label.
    """
    # Step 1: Check for ASCII.  A byte string is taken to be ASCII as is;
    # anything else must survive an ASCII encode to skip nameprep.
    needs_nameprep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3.
    # The re-encoded label will already be in lower case.
    if label.lower() == reencoded:
        # Step 8: return the result of step 5
        return decoded
    raise UnicodeError("IDNA does not round-trip", label, reencoded)
Tim Peters0eadaac2003-04-24 16:02:54 +0000142
Martin v. Löwis2548c732003-04-18 10:39:54 +0000143### Codec APIs
144
class Codec(codecs.Codec):
    """IDNA codec that converts dotted names one label at a time.

    Only the 'strict' error handler is accepted by either direction.
    """

    def encode(self, input, errors='strict'):
        """Encode a dotted name; return (encoded name, length of input).

        The name is split on any of the separators matched by `dots`, each
        label is passed through ToASCII, and the results are joined with
        U+002E.  A single trailing separator is kept as a trailing '.'.
        Raises UnicodeError for any `errors` value other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling " + errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        trailing_dot = ''
        if labels and not labels[-1]:
            # The name ended with a separator: remember it, drop the
            # empty final label.
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a dotted name; return (Unicode name, length of input).

        Unicode input is split on the separators matched by `dots`; any
        other input is converted with str(), must be ASCII, and is split
        on '.'.  Each label is passed through ToUnicode.  A single trailing
        separator is kept as a trailing u'.'.
        Raises UnicodeError for any `errors` value other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling " + errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Result discarded: this call only verifies the input is ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]
        return u".".join(decoded) + trailing_dot, len(input)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000195
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter; adds nothing."""
198
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader; adds nothing."""
201
202### encodings module API
203
def getregentry():
    """Return the tuple (encoder, decoder, StreamReader, StreamWriter).

    The encoder and decoder are bound methods of two separate, freshly
    created Codec instances.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)