| # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) |
| |
| import stringprep, unicodedata, re, codecs |
| |
# Label separators accepted by IDNA (section 3.1): U+002E full stop,
# U+3002 ideographic full stop, U+FF0E fullwidth full stop and
# U+FF61 halfwidth ideographic full stop.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# ACE prefix (IDNA section 5), once as a byte string and once as a
# unicode string.  The two literals must always spell the same prefix.
ace_prefix = "xn--"
uace_prefix = u"xn--"
| |
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is processed in four stages: map, normalize, prohibit and
    bidi check.  Unassigned code points are not rejected.

    label   -- unicode string holding one domain-name label.

    Returns the prepared unicode string.

    Raises UnicodeError if the prepared label contains a prohibited
    character or violates one of the bidirectional requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything else
    # goes through the table B.2 case-folding map.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi.  The requirements only depend on whether *any* RandALCat
    # character is present, so they are evaluated once for the whole label
    # instead of once per RandALCat character.  Plain list comprehensions
    # are used so the code stays valid on old Python 2 interpreters.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if [flag for flag in RandAL if flag]:
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if [c for c in label if stringprep.in_table_d2(c)]:
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
| |
def _check_label_size(label):
    # ToASCII step 8: an encoded label must be 1 to 63 characters long.
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")


def ToASCII(label):
    """Convert one label to its ASCII form (the IDNA ToASCII operation).

    label   -- a single label; it must support .encode("ascii").

    Returns the ASCII label.  A label that is already ASCII (before or
    after nameprep) is returned as is; anything else is punycode encoded
    and prefixed with the ACE prefix.

    Raises UnicodeError if the result is empty or longer than 63
    characters, if nameprep rejects the label, or if a non-ASCII label
    already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        return _check_label_size(label)

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        return _check_label_size(label)

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    return _check_label_size(label)
| |
def ToUnicode(label):
    """Convert one label to its Unicode form (the IDNA ToUnicode operation).

    label   -- a single label, either a byte string or a unicode string.

    Returns a unicode string.  Labels without the ACE prefix are returned
    as their ASCII text; labels with the prefix are punycode decoded.

    Raises UnicodeError if the label cannot be reduced to ASCII, if
    punycode decoding fails, or if re-encoding the decoded label with
    ToASCII does not give back the (lower-cased) input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        # Byte strings are taken to be ASCII without further checks here.
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix.  IDNA section 5 defines the prefix as
    # "xn--" or any capitalization thereof, so compare case-insensitively.
    if not label.lower().startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
| |
| ### Codec APIs |
| |
class Codec(codecs.Codec):
    """Stateless IDNA codec operating on dot-separated names."""

    def encode(self,input,errors='strict'):
        """Encode a name label by label with ToASCII.

        Returns (encoded name, len(input)).  Only errors='strict' is
        accepted; anything else raises UnicodeError.
        """
        # IDNA is quite clear that implementations must be strict
        if errors != 'strict':
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        parts = dots.split(input)
        trailing_dot = ''
        if parts and not parts[-1]:
            # An empty last label means the name ended in a separator;
            # drop it here and put a dot back on the result.
            trailing_dot = '.'
            parts.pop()

        encoded = [ToASCII(part) for part in parts]
        # Join with U+002E
        return ".".join(encoded)+trailing_dot, len(input)

    def decode(self,input,errors='strict'):
        """Decode a name label by label with ToUnicode.

        Returns (decoded unicode name, len(input)).  Only errors='strict'
        is accepted; anything else raises UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, unicode):
            # Must be ASCII string
            input = str(input)
            # Conversion is done only for its validating side effect.
            unicode(input, "ascii")
            parts = input.split(".")
        else:
            parts = dots.split(input)

        trailing_dot = u''
        if parts and not parts[-1]:
            trailing_dot = u'.'
            parts.pop()

        decoded = [ToUnicode(part) for part in parts]
        return u".".join(decoded)+trailing_dot, len(input)
| |
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer that takes its encode/decode methods from Codec."""
| |
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader that takes its encode/decode methods from Codec."""
| |
| ### encodings module API |
| |
def getregentry():
    """Return the (encoder, decoder, stream reader, stream writer) tuple.

    The encoder and decoder are bound methods of two separate Codec
    instances; the reader and writer are returned as classes.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)