Blame - Lib/email/header.py - platform/external/python/cpython3

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Author: Ben Gertzfield, Barry Warsaw

3

# Contact: email-sig@python.org

4

5

"""Header encoding and decoding functionality."""

__all__ = [

'Header',

'decode_header',

'make_header',

]

import re

import binascii

import email.quoprimime

17

import email.base64mime

18

19

from email.errors import HeaderParseError

20

from email.charset import Charset

# String/bytes building blocks shared by the header encoder and decoder.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''

# Default maximum line length used by Header when none is supplied.
MAXLINELEN = 78

# Charset instances used as defaults when encoding headers.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

31

32

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal 041)
# to tilde (octal 176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Helpers
_max_append = email.quoprimime._max_append

def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (string, charset) pairs containing each of the decoded
    parts of the header.  Charset is None for non-encoded parts of the header,
    otherwise a lower-case string containing the name of the character set
    specified in the encoded string.

    When the header contains no encoded word at all, the single pair
    (header, None) is returned with the header left untouched.  Otherwise
    every part is converted to bytes and consecutive parts sharing the same
    charset are merged into one pair.

    An email.errors.HeaderParseError may be raised when certain decoding error
    occurs (e.g. a base64 decoding exception).
    """
    # If no encoding, just return the header with no charset.
    if not ecre.search(header):
        return [(header, None)]
    # First step is to parse all the encoded parts into triplets of the form
    # (encoded_string, encoding, charset).  For unencoded strings, the last
    # two parts will be None.
    words = []
    for line in header.splitlines():
        # ecre has three capturing groups, so split() yields
        # [text, charset, encoding, encoded, text, charset, ...].  Walking
        # the list in strides of four avoids repeated O(n) pop(0) calls.
        parts = ecre.split(line)
        for start in range(0, len(parts), 4):
            unencoded = parts[start].strip()
            if unencoded:
                words.append((unencoded, None, None))
            if start + 1 < len(parts):
                charset, encoding, encoded = parts[start + 1:start + 4]
                words.append((encoded, encoding.lower(), charset.lower()))
    # The next step is to decode each encoded word by applying the reverse
    # base64 or quopri transformation.  decoded_words is now a list of the
    # form (decoded_word, charset).
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # This is an unencoded word.
            decoded_words.append((encoded_string, charset))
        elif encoding == 'q':
            word = email.quoprimime.header_decode(encoded_string)
            decoded_words.append((word, charset))
        elif encoding == 'b':
            # Postel's law: add missing padding
            paderr = len(encoded_string) % 4
            if paderr:
                encoded_string += '==='[:4 - paderr]
            try:
                word = email.base64mime.decode(encoded_string)
            except binascii.Error as err:
                # Keep the low-level error attached as the explicit cause.
                raise HeaderParseError('Base64 decoding error') from err
            else:
                decoded_words.append((word, charset))
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
    # Now convert all words to bytes and collapse consecutive runs of
    # similarly encoded words.
    collapsed = []
    last_word = last_charset = None
    for word, charset in decoded_words:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if last_word is None:
            last_word = word
            last_charset = charset
        elif charset != last_charset:
            collapsed.append((last_word, last_charset))
            last_word = word
            last_charset = charset
        elif last_charset is None:
            # Adjacent unencoded words are re-joined with a single space.
            last_word += BSPACE + word
        else:
            # Adjacent encoded words of one charset are joined directly.
            last_word += word
    collapsed.append((last_word, last_charset))
    return collapsed

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Create a Header from a sequence of pairs as returned by decode_header()

    decode_header() takes a header value string and returns a sequence of
    pairs of the format (decoded_string, charset) where charset is the string
    name of the character set.

    This function takes one of those sequence of pairs and returns a Header
    instance.  Optional maxlinelen, header_name, and continuation_ws are as in
    the Header constructor.
    """
    h = Header(maxlinelen=maxlinelen, header_name=header_name,
               continuation_ws=continuation_ws)
    for s, charset in decoded_seq:
        # None means us-ascii but we can simply pass it on to h.append()
        if charset is not None and not isinstance(charset, Charset):
            charset = Charset(charset)
        h.append(s, charset)
    return h

class Header:
    """A header value built from (string, Charset) chunks."""

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 78 as
        recommended by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset's input codec.  If s is a Unicode
        string, it is stored as is once it has been checked against the
        charset's output codec.

        In both cases the text must be encodable with the output codec.  When
        that codec is us-ascii and the text is not, the chunk is recorded with
        the utf-8 charset instead; for any other output codec the
        UnicodeError propagates.

        Optional `errors' is passed as the errors argument to the decode and
        encode calls.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            # Bytes arrive in the charset's *input* encoding, so that is the
            # codec that must turn them into text.
            input_charset = charset.input_codec or 'us-ascii'
            s = s.decode(input_charset, errors)
        # Ensure that the text we're storing can be encoded to the output
        # character set, otherwise an early error is raised.
        output_charset = charset.output_codec or 'us-ascii'
        try:
            s.encode(output_charset, errors)
        except UnicodeEncodeError:
            if output_charset != 'us-ascii':
                raise
            # Non-ASCII text under the us-ascii charset: fall back to utf-8.
            charset = UTF8
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Each stored chunk is fed line by line to a _ValueFormatter together
        with its charset; the formatter's string value is returned.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Optional maxlinelen overrides the maximum line length given to the
        constructor.  A value of 0 means don't wrap.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            lines = string.splitlines()
            for line in lines:
                formatter.feed(line, charset)
                if len(lines) > 1:
                    formatter.newline()
            formatter.add_transition()
        return str(formatter)

    def _normalize(self):
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string.
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks

class _ValueFormatter:
    """Accumulate fed header text into a list of folded output lines."""

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        # maxlen: longest line this formatter tries to produce.
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        # A tab in the continuation whitespace is counted as eight columns.
        self._continuation_ws_len = len(continuation_ws.replace('\t', SPACE8))
        self._splitchars = splitchars
        # Completed lines, and the line currently being built.  headerlen is
        # handed to the accumulator as the first line's initial size.
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def __str__(self):
        # Flush whatever is pending before joining the finished lines.
        self.newline()
        return NL.join(self._lines)

    def newline(self):
        """Finish the current line and start a fresh, empty one."""
        # A trailing None (pushed by add_transition) is dropped here; any
        # other trailing item is put straight back.
        end_of_line = self._current_line.pop()
        if end_of_line is not None:
            self._current_line.push(end_of_line)
        if len(self._current_line) > 0:
            self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Push a None marker onto the current line."""
        self._current_line.push(None)

    def feed(self, string, charset):
        """Add string, encoded per charset, folding onto new lines as needed."""
        # If the string itself fits on the current line in its encoded format,
        # then add it now and be done with it.
        encoded_string = charset.header_encode(string)
        if len(encoded_string) + len(self._current_line) <= self._maxlen:
            self._current_line.push(encoded_string)
            return
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            for ch in self._splitchars:
                if ch in string:
                    break
            else:
                ch = None
            # If there's no available split character then regardless of
            # whether the string fits on the line, we have to put it on a line
            # by itself.
            if ch is None:
                if not self._current_line.is_onlyws():
                    self._lines.append(str(self._current_line))
                    self._current_line.reset(self._continuation_ws)
                self._current_line.push(encoded_string)
            else:
                self._ascii_split(string, ch)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._current_line.push(first_line)
        self._lines.append(str(self._current_line))
        self._current_line.reset(self._continuation_ws)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        self._current_line.push(last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the room left on the current line, then on each later line."""
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, string, ch):
        """Fold unencoded string across lines, breaking on character ch."""
        holding = _Accumulator()
        # Split the line on the split character, preserving it.  If the split
        # character is whitespace RFC 2822 $2.2.3 requires us to fold on the
        # whitespace, so that the line leads with the original whitespace we
        # split on.  However, if a higher syntactic break is used instead
        # (e.g. comma or semicolon), the folding should happen after the split
        # character.  But then in that case, we need to add our own
        # continuation whitespace -- although won't that break unfolding?
        for part, splitpart, nextpart in _spliterator(ch, string):
            if not splitpart:
                # No splitpart means this is the last chunk.  Put this part
                # either on the current line or the next line depending on
                # whether it fits.
                holding.push(part)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    # It fits, but we're done.
                    self._current_line.push(str(holding))
                else:
                    # It doesn't fit, but we're done.  Before pushing a new
                    # line, watch out for the current line containing only
                    # whitespace.
                    holding.pop()
                    if self._current_line.is_onlyws() and holding.is_onlyws():
                        # Don't start a new line.
                        holding.push(part)
                        part = None
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    if part is None:
                        self._current_line.reset()
                    else:
                        holding.reset(part)
                        self._current_line.reset(str(holding))
                return
            elif not nextpart:
                # There must be some trailing split characters because we
                # found a split character but no next part.  In this case we
                # must treat the thing to fit as the part + splitpart because
                # if splitpart is whitespace it's not allowed to be the only
                # thing on the line, and if it's not whitespace we must split
                # after the syntactic break.  In either case, we're done.
                holding_prelen = len(holding)
                holding.push(part + splitpart)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    self._current_line.push(str(holding))
                elif holding_prelen == 0:
                    # This is the only chunk left so it has to go on the
                    # current line.
                    self._current_line.push(str(holding))
                else:
                    save_part = holding.pop()
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    holding.reset(save_part)
                    self._current_line.reset(str(holding))
                return
            elif not part:
                # We're leading with a split character.  See if the splitpart
                # and nextpart fits on the current line.
                holding.push(splitpart + nextpart)
                holding_len = len(holding)
                # We know we're not leaving the nextpart on the stack.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(splitpart)
                else:
                    # It doesn't fit.  Since there's no current part really
                    # the best we can do is start a new line and push the
                    # split part onto it.
                    self._current_line.push(str(holding))
                    holding.reset()
                    if len(self._current_line) > 0 and self._lines:
                        self._lines.append(str(self._current_line))
                        self._current_line.reset()
                    holding.push(splitpart)
            else:
                # All three parts are present.  First let's see if all three
                # parts will fit on the current line.  If so, we don't need to
                # split it.
                holding.push(part + splitpart + nextpart)
                holding_len = len(holding)
                # Pop the part because we'll push nextpart on the next
                # iteration through the loop.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(part + splitpart)
                else:
                    # The entire thing doesn't fit.  See if we need to split
                    # before or after the split characters.
                    if splitpart.isspace():
                        # Split before whitespace.  Remember that the
                        # whitespace becomes the continuation whitespace of
                        # the next line so it goes to current_line not holding.
                        holding.push(part)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        self._current_line.reset(splitpart)
                    else:
                        # Split after non-whitespace.  The continuation
                        # whitespace comes from the instance variable.
                        holding.push(part + splitpart)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        if nextpart[0].isspace():
                            self._current_line.reset()
                        else:
                            self._current_line.reset(self._continuation_ws)
        # Get the last of the holding part
        self._current_line.push(str(holding))

def _spliterator(character, string):
    """Yield (part, splitpart, nextpart) triples from splitting string.

    string is split on character with the separators kept.  Each triple holds
    a piece of text, the separator that follows it and the piece after that;
    entries that do not exist are None.  nextpart is yielded again as the
    part of the following triple, so consecutive triples overlap.
    """
    # Escape the separator so that a regex metacharacter (e.g. '.', '|' or
    # '+') is matched literally instead of being interpreted as a pattern.
    pattern = '(%s)' % re.escape(character)
    # Reversed so that pop() hands the pieces back in their original order.
    parts = list(reversed(re.split(pattern, string)))
    while parts:
        part = parts.pop()
        splitparts = (parts.pop() if parts else None)
        nextpart = (parts.pop() if parts else None)
        yield (part, splitparts, nextpart)
        if nextpart is not None:
            parts.append(nextpart)

548

549

550

class _Accumulator:

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

551

def __init__(self, initial_size=0):
    # initial_size is added to the combined length of the pushed items
    # whenever the accumulator's length is taken.
    self._initial_size = initial_size
    # Items pushed so far, oldest first.
    self._current = []

554

555

def push(self, string):
    """Append string (or a None marker) to the accumulated items."""
    self._current.append(string)

557

558

def pop(self):
    """Remove and return the most recently pushed item, or None if empty."""
    if not self._current:
        return None
    return self._current.pop()

562

563

def __len__(self):
    # Every None item counts as one character; the initial size is the
    # starting value of the sum.
    return sum(((1 if string is None else len(string))
                for string in self._current),
               self._initial_size)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

567

568

def __str__(self):
    # A trailing None is discarded (this mutates the accumulator); every
    # remaining None is rendered as a single space.
    if self._current and self._current[-1] is None:
        self._current.pop()
    return EMPTYSTRING.join((' ' if string is None else string)
                            for string in self._current)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

573

574

def reset(self, string=None):
    """Discard all items and the initial size; optionally push string."""
    self._current = []
    self._initial_size = 0
    if string is not None:
        self.push(string)

Guido van Rossum