# Blame view of Lib/html/__init__.py (platform/external/python/cpython3)
#
# blob: da0a0a3ce70eede6a373f22d0cb9e303464c1e56

"""
General functions for HTML manipulation.
"""
				4
import re as _re
from html.entities import html5 as _html5
				7
				8
				9	__all__ = ['escape', 'unescape']
				10
Georg Brandl	1f7fffb	2010-10-15 15:57:45 +0000	[diff] [blame]	11
				12	def escape(s, quote=True):
				13	"""
				14	Replace special characters "&", "<" and ">" to HTML-safe sequences.
				15	If the optional flag quote is true (the default), the quotation mark
Senthil Kumaran	d71bbf9	2011-09-13 07:14:13 +0800	[diff] [blame]	16	characters, both double quote (") and single quote (') characters are also
				17	translated.
Georg Brandl	1f7fffb	2010-10-15 15:57:45 +0000	[diff] [blame]	18	"""
Ezio Melotti	4603487	2013-07-07 11:11:24 +0200	[diff] [blame]	19	s = s.replace("&", "&") # Must be done first!
				20	s = s.replace("<", "<")
				21	s = s.replace(">", ">")
Georg Brandl	1f7fffb	2010-10-15 15:57:45 +0000	[diff] [blame]	22	if quote:
Ezio Melotti	4603487	2013-07-07 11:11:24 +0200	[diff] [blame]	23	s = s.replace('"', """)
				24	s = s.replace('\'', "'")
				25	return s
Ezio Melotti	4a9ee26	2013-11-19 20:28:45 +0200	[diff] [blame]	26
				27
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
				29
				30	_invalid_charrefs = {
				31	0x00: '\ufffd', # REPLACEMENT CHARACTER
				32	0x0d: '\r', # CARRIAGE RETURN
				33	0x80: '\u20ac', # EURO SIGN
				34	0x81: '\x81', # <control>
				35	0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK
				36	0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK
				37	0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK
				38	0x85: '\u2026', # HORIZONTAL ELLIPSIS
				39	0x86: '\u2020', # DAGGER
				40	0x87: '\u2021', # DOUBLE DAGGER
				41	0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT
				42	0x89: '\u2030', # PER MILLE SIGN
				43	0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON
				44	0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
				45	0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE
				46	0x8d: '\x8d', # <control>
				47	0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON
				48	0x8f: '\x8f', # <control>
				49	0x90: '\x90', # <control>
				50	0x91: '\u2018', # LEFT SINGLE QUOTATION MARK
				51	0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK
				52	0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK
				53	0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK
				54	0x95: '\u2022', # BULLET
				55	0x96: '\u2013', # EN DASH
				56	0x97: '\u2014', # EM DASH
				57	0x98: '\u02dc', # SMALL TILDE
				58	0x99: '\u2122', # TRADE MARK SIGN
				59	0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON
				60	0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
				61	0x9c: '\u0153', # LATIN SMALL LIGATURE OE
				62	0x9d: '\x9d', # <control>
				63	0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON
				64	0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS
				65	}
				66
				67	_invalid_codepoints = {
				68	# 0x0001 to 0x0008
				69	0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
				70	# 0x000E to 0x001F
				71	0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
				72	0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
				73	# 0x007F to 0x009F
				74	0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
				75	0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
				76	0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
				77	# 0xFDD0 to 0xFDEF
				78	0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
				79	0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
				80	0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
				81	0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
				82	# others
				83	0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
				84	0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
				85	0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
				86	0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
				87	0x10fffe, 0x10ffff
				88	}
				89
				90
				91	def _replace_charref(s):
				92	s = s.group(1)
				93	if s[0] == '#':
				94	# numeric charref
				95	if s[1] in 'xX':
				96	num = int(s[2:].rstrip(';'), 16)
				97	else:
				98	num = int(s[1:].rstrip(';'))
				99	if num in _invalid_charrefs:
				100	return _invalid_charrefs[num]
				101	if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
				102	return '\uFFFD'
				103	if num in _invalid_codepoints:
				104	return ''
				105	return chr(num)
				106	else:
				107	# named charref
				108	if s in _html5:
				109	return _html5[s]
				110	# find the longest matching name (as defined by the standard)
				111	for x in range(len(s)-1, 1, -1):
				112	if s[:x] in _html5:
				113	return _html5[s[:x]] + s[x:]
				114	else:
				115	return '&' + s
				116
				117
				118	_charref = _re.compile(r'&(#[0-9]+;?'
				119	r'\|#[xX][0-9a-fA-F]+;?'
				120	r'\|[^\t\n\f <&#;]{1,32};?)')
				121
				122	def unescape(s):
				123	"""
				124	Convert all named and numeric character references (e.g. >, >,
				125	&x3e;) in the string s to the corresponding unicode characters.
				126	This function uses the rules defined by the HTML 5 standard
				127	for both valid and invalid character references, and the list of
				128	HTML 5 named character references defined in html.entities.html5.
				129	"""
				130	if '&' not in s:
				131	return s
				132	return _charref.sub(_replace_charref, s)