Blame - Lib/encodings/idna.py - platform/external/python/cpython3

blob: fde710b395f4d9b97aee290d0b7e3886f149ddba [file] [log] [blame]

Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	1	# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
				2
Martin v. Löwis	480f1bb	2006-03-09 23:38:20 +0000	[diff] [blame^]	3	import stringprep, re, codecs
				4	from unicodedata import db_3_2_0 as unicodedata
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	5
				6	# IDNA section 3.1
				7	dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
				8
				9	# IDNA section 5
				10	ace_prefix = "xn--"
				11	uace_prefix = unicode(ace_prefix, "ascii")
				12
				13	# This assumes query strings, so AllowUnassigned is true
				14	def nameprep(label):
				15	# Map
				16	newlabel = []
				17	for c in label:
				18	if stringprep.in_table_b1(c):
				19	# Map to nothing
				20	continue
				21	newlabel.append(stringprep.map_table_b2(c))
				22	label = u"".join(newlabel)
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	23
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	24	# Normalize
				25	label = unicodedata.normalize("NFKC", label)
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	26
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	27	# Prohibit
				28	for c in label:
				29	if stringprep.in_table_c12(c) or \
				30	stringprep.in_table_c22(c) or \
				31	stringprep.in_table_c3(c) or \
				32	stringprep.in_table_c4(c) or \
				33	stringprep.in_table_c5(c) or \
				34	stringprep.in_table_c6(c) or \
				35	stringprep.in_table_c7(c) or \
				36	stringprep.in_table_c8(c) or \
				37	stringprep.in_table_c9(c):
				38	raise UnicodeError, "Invalid character %s" % repr(c)
				39
				40	# Check bidi
				41	RandAL = map(stringprep.in_table_d1, label)
				42	for c in RandAL:
				43	if c:
				44	# There is a RandAL char in the string. Must perform further
				45	# tests:
				46	# 1) The characters in section 5.8 MUST be prohibited.
				47	# This is table C.8, which was already checked
				48	# 2) If a string contains any RandALCat character, the string
				49	# MUST NOT contain any LCat character.
				50	if filter(stringprep.in_table_d2, label):
				51	raise UnicodeError, "Violation of BIDI requirement 2"
				52
				53	# 3) If a string contains any RandALCat character, a
				54	# RandALCat character MUST be the first character of the
				55	# string, and a RandALCat character MUST be the last
				56	# character of the string.
				57	if not RandAL[0] or not RandAL[-1]:
				58	raise UnicodeError, "Violation of BIDI requirement 3"
				59
				60	return label
				61
				62	def ToASCII(label):
				63	try:
				64	# Step 1: try ASCII
				65	label = label.encode("ascii")
				66	except UnicodeError:
				67	pass
				68	else:
				69	# Skip to step 3: UseSTD3ASCIIRules is false, so
				70	# Skip to step 8.
				71	if 0 < len(label) < 64:
				72	return label
				73	raise UnicodeError, "label too long"
				74
				75	# Step 2: nameprep
				76	label = nameprep(label)
				77
				78	# Step 3: UseSTD3ASCIIRules is false
				79	# Step 4: try ASCII
				80	try:
				81	label = label.encode("ascii")
				82	except UnicodeError:
				83	pass
				84	else:
				85	# Skip to step 8.
				86	if 0 < len(label) < 64:
				87	return label
				88	raise UnicodeError, "label too long"
				89
				90	# Step 5: Check ACE prefix
				91	if label.startswith(uace_prefix):
				92	raise UnicodeError, "Label starts with ACE prefix"
				93
				94	# Step 6: Encode with PUNYCODE
				95	label = label.encode("punycode")
				96
				97	# Step 7: Prepend ACE prefix
				98	label = ace_prefix + label
				99
				100	# Step 8: Check size
				101	if 0 < len(label) < 64:
				102	return label
				103	raise UnicodeError, "label too long"
				104
				105	def ToUnicode(label):
				106	# Step 1: Check for ASCII
				107	if isinstance(label, str):
				108	pure_ascii = True
				109	else:
				110	try:
				111	label = label.encode("ascii")
				112	pure_ascii = True
				113	except UnicodeError:
				114	pure_ascii = False
				115	if not pure_ascii:
				116	# Step 2: Perform nameprep
				117	label = nameprep(label)
				118	# It doesn't say this, but apparently, it should be ASCII now
				119	try:
				120	label = label.encode("ascii")
				121	except UnicodeError:
				122	raise UnicodeError, "Invalid character in IDN label"
				123	# Step 3: Check for ACE prefix
				124	if not label.startswith(ace_prefix):
				125	return unicode(label, "ascii")
				126
				127	# Step 4: Remove ACE prefix
				128	label1 = label[len(ace_prefix):]
				129
				130	# Step 5: Decode using PUNYCODE
				131	result = label1.decode("punycode")
				132
				133	# Step 6: Apply ToASCII
				134	label2 = ToASCII(result)
				135
				136	# Step 7: Compare the result of step 6 with the one of step 3
				137	# label2 will already be in lower case.
				138	if label.lower() != label2:
				139	raise UnicodeError, ("IDNA does not round-trip", label, label2)
				140
				141	# Step 8: return the result of step 5
				142	return result
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	143
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	144	### Codec APIs
				145
				146	class Codec(codecs.Codec):
				147	def encode(self,input,errors='strict'):
				148
				149	if errors != 'strict':
				150	# IDNA is quite clear that implementations must be strict
				151	raise UnicodeError, "unsupported error handling "+errors
				152
Martin v. Löwis	8b59514	2005-08-25 11:03:38 +0000	[diff] [blame]	153	if not input:
				154	return "", 0
				155
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	156	result = []
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	157	labels = dots.split(input)
				158	if labels and len(labels[-1])==0:
				159	trailing_dot = '.'
				160	del labels[-1]
				161	else:
				162	trailing_dot = ''
				163	for label in labels:
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	164	result.append(ToASCII(label))
				165	# Join with U+002E
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	166	return ".".join(result)+trailing_dot, len(input)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	167
				168	def decode(self,input,errors='strict'):
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	169
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	170	if errors != 'strict':
				171	raise UnicodeError, "Unsupported error handling "+errors
				172
Martin v. Löwis	8b59514	2005-08-25 11:03:38 +0000	[diff] [blame]	173	if not input:
				174	return u"", 0
				175
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	176	# IDNA allows decoding to operate on Unicode strings, too.
				177	if isinstance(input, unicode):
				178	labels = dots.split(input)
				179	else:
				180	# Must be ASCII string
Martin v. Löwis	708b4da	2004-03-23 23:40:36 +0000	[diff] [blame]	181	input = str(input)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	182	unicode(input, "ascii")
				183	labels = input.split(".")
				184
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	185	if labels and len(labels[-1]) == 0:
				186	trailing_dot = u'.'
				187	del labels[-1]
				188	else:
				189	trailing_dot = u''
				190
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	191	result = []
				192	for label in labels:
				193	result.append(ToUnicode(label))
				194
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	195	return u".".join(result)+trailing_dot, len(input)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	196
				197	class StreamWriter(Codec,codecs.StreamWriter):
				198	pass
				199
				200	class StreamReader(Codec,codecs.StreamReader):
				201	pass
				202
				203	### encodings module API
				204
				205	def getregentry():
				206
				207	return (Codec().encode,Codec().decode,StreamReader,StreamWriter)