Blame - Lib/encodings/idna.py - platform/external/python/cpython2

blob: b226d22fda56a591cc8dfef44512c7cc3476164e [file] [log] [blame]

Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	1	# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
				2
Martin v. Löwis	480f1bb	2006-03-09 23:38:20 +0000	[diff] [blame]	3	import stringprep, re, codecs
Martin v. Löwis	5bd7c02	2006-03-10 11:20:04 +0000	[diff] [blame]	4	from unicodedata import ucd_3_2_0 as unicodedata
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	5
# IDNA section 3.1: the code points recognized as label separators
# (U+002E full stop, U+3002 ideographic full stop, U+FF0E fullwidth
# full stop, U+FF61 halfwidth ideographic full stop).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: prefix marking a label as ASCII Compatible Encoding,
# i.e. the Punycode form of a non-ASCII label.
ace_prefix = "xn--"
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	11
				12	# This assumes query strings, so AllowUnassigned is true
				13	def nameprep(label):
				14	# Map
				15	newlabel = []
				16	for c in label:
				17	if stringprep.in_table_b1(c):
				18	# Map to nothing
				19	continue
				20	newlabel.append(stringprep.map_table_b2(c))
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	21	label = "".join(newlabel)
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	22
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	23	# Normalize
				24	label = unicodedata.normalize("NFKC", label)
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	25
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	26	# Prohibit
				27	for c in label:
				28	if stringprep.in_table_c12(c) or \
				29	stringprep.in_table_c22(c) or \
				30	stringprep.in_table_c3(c) or \
				31	stringprep.in_table_c4(c) or \
				32	stringprep.in_table_c5(c) or \
				33	stringprep.in_table_c6(c) or \
				34	stringprep.in_table_c7(c) or \
				35	stringprep.in_table_c8(c) or \
				36	stringprep.in_table_c9(c):
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	37	raise UnicodeError("Invalid character %r" % c)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	38
				39	# Check bidi
				40	RandAL = map(stringprep.in_table_d1, label)
				41	for c in RandAL:
				42	if c:
				43	# There is a RandAL char in the string. Must perform further
				44	# tests:
				45	# 1) The characters in section 5.8 MUST be prohibited.
				46	# This is table C.8, which was already checked
				47	# 2) If a string contains any RandALCat character, the string
				48	# MUST NOT contain any LCat character.
				49	if filter(stringprep.in_table_d2, label):
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	50	raise UnicodeError("Violation of BIDI requirement 2")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	51
				52	# 3) If a string contains any RandALCat character, a
				53	# RandALCat character MUST be the first character of the
				54	# string, and a RandALCat character MUST be the last
				55	# character of the string.
				56	if not RandAL[0] or not RandAL[-1]:
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	57	raise UnicodeError("Violation of BIDI requirement 3")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	58
				59	return label
				60
				61	def ToASCII(label):
				62	try:
				63	# Step 1: try ASCII
				64	label = label.encode("ascii")
				65	except UnicodeError:
				66	pass
				67	else:
				68	# Skip to step 3: UseSTD3ASCIIRules is false, so
				69	# Skip to step 8.
				70	if 0 < len(label) < 64:
				71	return label
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	72	raise UnicodeError("label empty or too long")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	73
				74	# Step 2: nameprep
				75	label = nameprep(label)
				76
				77	# Step 3: UseSTD3ASCIIRules is false
				78	# Step 4: try ASCII
				79	try:
				80	label = label.encode("ascii")
				81	except UnicodeError:
				82	pass
				83	else:
				84	# Skip to step 8.
				85	if 0 < len(label) < 64:
				86	return label
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	87	raise UnicodeError("label empty or too long")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	88
				89	# Step 5: Check ACE prefix
				90	if label.startswith(uace_prefix):
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	91	raise UnicodeError("Label starts with ACE prefix")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	92
				93	# Step 6: Encode with PUNYCODE
				94	label = label.encode("punycode")
				95
				96	# Step 7: Prepend ACE prefix
				97	label = ace_prefix + label
				98
				99	# Step 8: Check size
				100	if 0 < len(label) < 64:
				101	return label
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	102	raise UnicodeError("label empty or too long")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	103
				104	def ToUnicode(label):
				105	# Step 1: Check for ASCII
				106	if isinstance(label, str):
				107	pure_ascii = True
				108	else:
				109	try:
				110	label = label.encode("ascii")
				111	pure_ascii = True
				112	except UnicodeError:
				113	pure_ascii = False
				114	if not pure_ascii:
				115	# Step 2: Perform nameprep
				116	label = nameprep(label)
				117	# It doesn't say this, but apparently, it should be ASCII now
				118	try:
				119	label = label.encode("ascii")
				120	except UnicodeError:
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	121	raise UnicodeError("Invalid character in IDN label")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	122	# Step 3: Check for ACE prefix
				123	if not label.startswith(ace_prefix):
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	124	return str(label, "ascii")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	125
				126	# Step 4: Remove ACE prefix
				127	label1 = label[len(ace_prefix):]
				128
				129	# Step 5: Decode using PUNYCODE
				130	result = label1.decode("punycode")
				131
				132	# Step 6: Apply ToASCII
				133	label2 = ToASCII(result)
				134
				135	# Step 7: Compare the result of step 6 with the one of step 3
				136	# label2 will already be in lower case.
				137	if label.lower() != label2:
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	138	raise UnicodeError("IDNA does not round-trip", label, label2)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	139
				140	# Step 8: return the result of step 5
				141	return result
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	142
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	143	### Codec APIs
				144
				145	class Codec(codecs.Codec):
				146	def encode(self,input,errors='strict'):
				147
				148	if errors != 'strict':
				149	# IDNA is quite clear that implementations must be strict
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	150	raise UnicodeError("unsupported error handling "+errors)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	151
Martin v. Löwis	8b59514	2005-08-25 11:03:38 +0000	[diff] [blame]	152	if not input:
				153	return "", 0
				154
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	155	result = []
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	156	labels = dots.split(input)
				157	if labels and len(labels[-1])==0:
				158	trailing_dot = '.'
				159	del labels[-1]
				160	else:
				161	trailing_dot = ''
				162	for label in labels:
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	163	result.append(ToASCII(label))
				164	# Join with U+002E
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	165	return ".".join(result)+trailing_dot, len(input)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	166
				167	def decode(self,input,errors='strict'):
Tim Peters	0eadaac	2003-04-24 16:02:54 +0000	[diff] [blame]	168
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	169	if errors != 'strict':
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	170	raise UnicodeError("Unsupported error handling "+errors)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	171
Martin v. Löwis	8b59514	2005-08-25 11:03:38 +0000	[diff] [blame]	172	if not input:
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	173	return "", 0
Martin v. Löwis	8b59514	2005-08-25 11:03:38 +0000	[diff] [blame]	174
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	175	# IDNA allows decoding to operate on Unicode strings, too.
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	176	if isinstance(input, str):
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	177	labels = dots.split(input)
				178	else:
				179	# Must be ASCII string
Martin v. Löwis	708b4da	2004-03-23 23:40:36 +0000	[diff] [blame]	180	input = str(input)
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	181	str(input, "ascii")
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	182	labels = input.split(".")
				183
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	184	if labels and len(labels[-1]) == 0:
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	185	trailing_dot = '.'
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	186	del labels[-1]
				187	else:
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	188	trailing_dot = ''
Martin v. Löwis	0d8e16c	2003-08-05 06:19:47 +0000	[diff] [blame]	189
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	190	result = []
				191	for label in labels:
				192	result.append(ToUnicode(label))
				193
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	194	return ".".join(result)+trailing_dot, len(input)
Martin v. Löwis	2548c73	2003-04-18 10:39:54 +0000	[diff] [blame]	195
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        input  -- the text to encode, as a str.
        errors -- must be 'strict'.
        final  -- when false, the last label is not encoded because it
                  might still be continued by later input.
        Returns (encoded_bytes, number_of_characters_consumed).
        Raises UnicodeError for any other error handler or when a label
        is rejected by ToASCII().
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b"", 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator in front of this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = b".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
				229
				230	class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
				231	def _buffer_decode(self, input, errors, final):
				232	if errors != 'strict':
				233	raise UnicodeError("Unsupported error handling "+errors)
				234
				235	if not input:
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	236	return ("", 0)
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	237
				238	# IDNA allows decoding to operate on Unicode strings, too.
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	239	if isinstance(input, str):
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	240	labels = dots.split(input)
				241	else:
				242	# Must be ASCII string
				243	input = str(input)
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	244	str(input, "ascii")
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	245	labels = input.split(".")
				246
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	247	trailing_dot = ''
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	248	if labels:
				249	if not labels[-1]:
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	250	trailing_dot = '.'
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	251	del labels[-1]
				252	elif not final:
				253	# Keep potentially unfinished label until the next call
				254	del labels[-1]
				255	if labels:
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	256	trailing_dot = '.'
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	257
				258	result = []
				259	size = 0
				260	for label in labels:
				261	result.append(ToUnicode(label))
				262	if size:
				263	size += 1
				264	size += len(label)
				265
Guido van Rossum	ef87d6e	2007-05-02 19:09:54 +0000	[diff] [blame]	266	result = ".".join(result) + trailing_dot
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	267	size += len(trailing_dot)
				268	return (result, size)
Thomas Wouters	a977329	2006-04-21 09:43:23 +0000	[diff] [blame]	269
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining the IDNA Codec with codecs.StreamWriter."""
				272
				273	class StreamReader(Codec,codecs.StreamReader):
				274	pass
				275
				276	### encodings module API
				277
				278	def getregentry():
Thomas Wouters	a977329	2006-04-21 09:43:23 +0000	[diff] [blame]	279	return codecs.CodecInfo(
				280	name='idna',
				281	encode=Codec().encode,
				282	decode=Codec().decode,
				283	incrementalencoder=IncrementalEncoder,
				284	incrementaldecoder=IncrementalDecoder,
				285	streamwriter=StreamWriter,
				286	streamreader=StreamReader,
				287	)