Blame - Lib/email/Header.py - platform/external/python/cpython3

2002-04-10 21:01:31 +0000

[diff] [blame]

1

2

# Author: che@debian.org (Ben Gertzfield)

3

4

"""Header encoding and decoding functionality."""

5

6

import re

7

import email.quopriMIME

8

import email.base64MIME

9

from email.Charset import Charset

10

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

11

try:

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

12

from email._compat22 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

13

except SyntaxError:

14

# Python 2.1 spells integer division differently

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

15

from email._compat21 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

16

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

17

# Line-ending and whitespace building blocks.  CRLFSPACE and CRLF are not
# referenced in the portion of the module visible here.
CRLFSPACE = '\r\n '
CRLF = '\r\n'
# Joined with the continuation whitespace between encoded chunks.
NL = '\n'
# Each hard tab is counted as eight columns when measuring line widths.
SPACE8 = ' ' * 8
EMPTYSTRING = ''

# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 76

# NOTE(review): ENCODE and DECODE are not referenced in the visible part of
# this module; presumably direction flags kept for callers -- confirm.
ENCODE = 1
DECODE = 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
#
# The pattern is compiled with re.VERBOSE, so only the tokens below may
# appear inside the literal: any stray character (for example a pasted line
# number) would become a required literal part of the match.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Helpers

# Module-level alias for the helper of the same name in email.quopriMIME.
# Header._encode_chunks calls it with a list of output chunks, a piece of
# header text and a maximum line length (plus an optional ' ' separator).
# NOTE(review): presumably it appends to the last chunk when the result
# still fits within the limit and starts a new chunk otherwise -- confirm
# against email.quopriMIME.
_max_append = email.quopriMIME._max_append

def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    The header is processed one physical line at a time.  Consecutive
    non-encoded pieces are merged into a single pair (separated by one
    space), and consecutive encoded pieces that name the same charset are
    concatenated into a single pair.
    """
    header = str(header)
    # If no encoding, just return the header
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() yields a flat sequence of
        # [unencoded, charset, encoding, encoded, unencoded, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Extend the previous non-encoded piece with *this*
                    # non-encoded text.  unenc has been stripped, so a single
                    # space keeps the two pieces from running together.
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # ecre only admits q/b, so this is a defensive fallback:
                    # pass the payload through untouched.
                    dec = encoded
                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the charset/encoding/encoded triple just consumed (a no-op
            # once the list is already empty).
            del parts[0:3]
    return decoded

class Header:

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame^]

97

def __init__(self, s, charset=None, maxlinelen=None, header_name=None,
             continuation_ws=' '):
    """Create a MIME-compliant header that can contain many languages.

    s is the initial header value and charset is the Charset object
    describing its character set; when charset is None a default Charset
    instance is created and used.

    More text can be added later with append(s, charset).  The charset
    passed there may differ from the one given here, and when it is
    omitted the charset given to the constructor is used.

    maxlinelen sets the maximum line length explicitly; it defaults to 76.
    To make the first line shorter (to leave room for the field name,
    e.g. `Subject', which is not part of s) pass that field name as
    header_name.

    continuation_ws must be RFC 2822 compliant folding whitespace (usually
    either a space or a hard tab); it is prepended to continuation lines.
    """
    default_charset = charset
    if default_charset is None:
        default_charset = Charset()
    self._charset = default_charset
    self._continuation_ws = continuation_ws
    # Width in columns of the continuation prefix, counting each hard tab
    # as eight columns.
    ws_columns = len(continuation_ws.replace('\t', SPACE8))
    # BAW: I believe `chunks' and `maxlinelen' should be non-public.
    self._chunks = []
    self.append(s, default_charset)
    limit = maxlinelen
    if limit is None:
        limit = MAXLINELEN
    # Without a field name the first line gets the same room as the
    # others; with one, reserve space for the name plus 2 extra for the
    # colon and space.
    first_limit = limit
    if header_name is not None:
        first_limit = limit - len(header_name) - 2
    self._firstlinelen = first_limit
    # Second and subsequent lines lose the columns taken up by the
    # continuation whitespace prefix.
    self._maxlinelen = limit - ws_columns

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

140

141

def __str__(self):

142

"""A synonym for self.encode()."""

143

return self.encode()

144

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

145

def append(self, s, charset=None):

146

"""Append string s with Charset charset to the MIME header.

147

148

charset defaults to the one given in the class constructor.

149

"""

150

if charset is None:

151

charset = self._charset

152

self._chunks.append((s, charset))

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

153

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame^]

154

def _split(self, s, charset, firstline=0):

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

155

# Split up a header safely for use with encode_chunks. BAW: this

156

# appears to be a private convenience method.

157

splittable = charset.to_splittable(s)

158

encoded = charset.from_splittable(splittable)

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

159

elen = charset.encoded_header_len(encoded)

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

160

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

161

if elen <= self._maxlinelen:

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

162

return [(encoded, charset)]

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame^]

163

# BAW: I'm not sure what the right test here is. What we're trying to

164

# do is be faithful to RFC 2822's recommendation that ($2.2.3):

165

#

166

# "Note: Though structured field bodies are defined in such a way that

167

# folding can take place between many of the lexical tokens (and even

168

# within some of the lexical tokens), folding SHOULD be limited to

169

# placing the CRLF at higher-level syntactic breaks."

170

#

171

# For now, I can only imagine doing this when the charset is us-ascii,

172

# although it's possible that other charsets may also benefit from the

173

# higher-level syntactic breaks.

174

#

175

elif charset == 'us-ascii':

176

return self._ascii_split(s, charset, firstline)

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

177

# BAW: should we use encoded?

178

elif elen == len(s):

179

# We can split on _maxlinelen boundaries because we know that the

180

# encoding won't change the size of the string

181

splitpnt = self._maxlinelen

182

first = charset.from_splittable(splittable[:splitpnt], 0)

183

last = charset.from_splittable(splittable[splitpnt:], 0)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

184

else:

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

185

# Divide and conquer.

186

halfway = _floordiv(len(splittable), 2)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

187

first = charset.from_splittable(splittable[:halfway], 0)

188

last = charset.from_splittable(splittable[halfway:], 0)

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame^]

189

# Do the split

190

return self._split(first, charset, firstline) + \

191

self._split(last, charset)

192

193

def _ascii_split(self, s, charset, firstline):
    # Attempt to split the line at the highest-level syntactic break
    # possible.  Note that we don't have a lot of smarts about field
    # syntax; we just try to break on semi-colons, then whitespace.
    #
    # s is the text to fold and charset is passed through unchanged into
    # the result.  When firstline is true, the first physical line of s is
    # measured against self._firstlinelen; every other line is left-stripped
    # and measured against self._maxlinelen.
    #
    # Returns a list of (chunk, charset) pairs, one per output line.
    rtn = []
    lines = s.splitlines()
    while lines:
        line = lines.pop(0)
        if firstline:
            maxlinelen = self._firstlinelen
            firstline = 0
        else:
            line = line.lstrip()
            maxlinelen = self._maxlinelen
        # Short lines can remain unchanged
        if len(line.replace('\t', SPACE8)) <= maxlinelen:
            rtn.append(line)
        else:
            oldlen = len(line)
            # Try to break the line on semicolons, but if that doesn't
            # work, try to split on folding whitespace.
            while len(line) > maxlinelen:
                i = line.rfind(';', 0, maxlinelen)
                if i < 0:
                    break
                rtn.append(line[:i] + ';')
                line = line[i+1:]
            # Is the remaining stuff still longer than maxlinelen?
            if len(line) <= maxlinelen:
                # Splitting on semis worked
                rtn.append(line)
                continue
            # Splitting on semis didn't finish the job.  If it did any
            # work at all, stick the remaining junk on the front of the
            # `lines' sequence and let the next pass do its thing.
            if len(line) != oldlen:
                lines.insert(0, line)
                continue
            # Otherwise, splitting on semis didn't help at all.  The
            # capturing group keeps the whitespace runs, so parts
            # alternates text, whitespace, text, ... and has odd length.
            parts = re.split(r'(\s+)', line)
            if len(parts) == 1 or (len(parts) == 3 and
                                   parts[0].endswith(':')):
                # This line can't be split on whitespace.  There's now
                # little we can do to get this into maxlinelen.  BAW:
                # We're still potentially breaking the RFC by possibly
                # allowing lines longer than the absolute maximum of 998
                # characters.  For now, let it slide.
                #
                # len(parts) will be 1 if this line has no `Field: '
                # prefix, otherwise it will be len(3).
                rtn.append(line)
                continue
            # There is whitespace we can split on.
            first = parts.pop(0)
            sublines = [first]
            acc = len(first)
            while parts:
                # parts now holds (whitespace, text) pairs.
                len0 = len(parts[0])
                len1 = len(parts[1])
                if acc + len0 + len1 <= maxlinelen:
                    sublines.append(parts.pop(0))
                    sublines.append(parts.pop(0))
                    acc += len0 + len1
                else:
                    # Split it here, but don't forget to ignore the
                    # next whitespace-only part
                    if first != '':
                        rtn.append(EMPTYSTRING.join(sublines))
                    del parts[0]
                    first = parts.pop(0)
                    sublines = [first]
                    acc = len(first)
            rtn.append(EMPTYSTRING.join(sublines))
    return [(chunk, charset) for chunk in rtn]

267

268

def _encode_chunks(self):
    """MIME-encode the accumulated chunks into a single header string.

    Walks the (string, charset) pairs held in self._chunks.  A chunk whose
    charset is None, or whose charset has no header_encoding, is passed to
    _max_append as-is; every other chunk is first run through its
    charset's header_encode() and passed to _max_append with a single
    space as the extra argument.

    The resulting pieces are joined with a newline followed by the
    continuation whitespace, so the returned string looks like:

        =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=
         =?charset2?b?SvxyZ2VuIEL2aW5n?=
    """
    pieces = []
    for text, cset in self._chunks:
        has_encoding = cset is not None and cset.header_encoding is not None
        if has_encoding:
            _max_append(pieces, cset.header_encode(text, 0),
                        self._maxlinelen, ' ')
        else:
            # There's no encoding for this chunk's charsets
            _max_append(pieces, text, self._maxlinelen)
    return (NL + self._continuation_ws).join(pieces)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

297

298

def encode(self):

299

"""Encode a message header, possibly converting charset and encoding.

300

301

There are many issues involved in converting a given string for use in

302

an email header. Only certain character sets are readable in most

303

email clients, and as header strings can only contain a subset of

304

7-bit ASCII, care must be taken to properly convert and encode (with

305

Base64 or quoted-printable) header strings. In addition, there is a

306

75-character length limit on any given encoded header field, so

307

line-wrapping must be performed, even with double-byte character sets.

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

308

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

309

This method will do its best to convert the string to the correct

310

character set used in email, and encode and line wrap it safely with

311

the appropriate scheme for that character set.

312

313

If the given charset is not known or an error occurs during

314

conversion, this function will return the header untouched.

315

"""

316

newchunks = []

317

for s, charset in self._chunks:

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame^]

318

newchunks += self._split(s, charset, 1)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

319

self._chunks = newchunks

Barry Warsaw