blob: da0a0a3ce70eede6a373f22d0cb9e303464c1e56 [file] [log] [blame]
"""
General functions for HTML manipulation.
"""
4
Ezio Melotti4a9ee262013-11-19 20:28:45 +02005import re as _re
6from html.entities import html5 as _html5
7
8
# Public API of this module; everything else is an implementation detail.
__all__ = ['escape', 'unescape']
10
Georg Brandl1f7fffb2010-10-15 15:57:45 +000011
def escape(s, quote=True):
    """
    Replace special characters "&", "<" and ">" with HTML-safe sequences.

    If the optional flag quote is true (the default), the quotation mark
    characters, both double quote (") and single quote (') characters are
    also translated.

    Returns the escaped copy of s.
    """
    # "&" must be handled first; otherwise the ampersands introduced by the
    # later replacements would themselves be escaped a second time.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    if quote:
        s = s.replace('"', "&quot;")
        s = s.replace('\'', "&#x27;")
    return s
Ezio Melotti4a9ee262013-11-19 20:28:45 +020026
27
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
29
# Numeric character references whose value is not used as-is: each key is
# the number written in the reference and each value is the text it is
# replaced with.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
66
# Code points for which a numeric character reference produces no output
# at all (see the membership test in _replace_charref).
_invalid_codepoints = {
    # 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    # 0x000E to 0x001F
    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    # 0x007F to 0x009F
    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    # 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    # others
    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
    0x10fffe, 0x10ffff
}
89
90
def _replace_charref(s):
    """
    Return the replacement text for a single character reference.

    s is a match object whose group(1) holds the reference without the
    leading "&" (e.g. "#62;", "#x3e" or "gt;").  Numeric references are
    mapped through _invalid_charrefs / _invalid_codepoints before falling
    back to chr(); named references are looked up in the html5 entity
    table, trying the longest known prefix when the full name is unknown.
    Unrecognized named references are returned unchanged, "&" included.
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base = s[2:], 16
        else:
            digits, base = s[1:], 10
        # Leading zeros carry no value; once they are gone the length alone
        # tells us whether the number can still be a valid code point.
        digits = digits.rstrip(';').lstrip('0')
        if len(digits) > 7:
            # More than 7 significant digits is above 0x10FFFF in either
            # base, which is answered with U+FFFD below anyway.  Deciding
            # here avoids int() on arbitrarily long input, which raises
            # ValueError on interpreters that cap str -> int conversions.
            return '\uFFFD'
        num = int(digits or '0', base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)

    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard)
    for x in range(len(s)-1, 1, -1):
        prefix = s[:x]
        if prefix in _html5:
            return _html5[prefix] + s[x:]
    # No known entity name starts this reference: leave the text untouched.
    return '&' + s
116
117
# Matches one character reference; group 1 is the text after the "&".
# Alternatives, in order: decimal numeric reference, hexadecimal numeric
# reference, and a candidate entity name of 1 to 32 characters.  The
# trailing ";" is optional in all three.
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')
121
def unescape(s):
    """
    Convert all named and numeric character references (e.g. &gt;, &#62;,
    &#x3e;) in the string s to the corresponding unicode characters.
    This function uses the rules defined by the HTML 5 standard
    for both valid and invalid character references, and the list of
    HTML 5 named character references defined in html.entities.html5.

    Returns s itself when it contains no "&" at all.
    """
    # Fast path: without an ampersand there is nothing to substitute, so
    # the regex machinery is skipped entirely.
    if '&' not in s:
        return s
    return _charref.sub(_replace_charref, s)
132 return _charref.sub(_replace_charref, s)