blob: bb26d5a556dbbea3f22e99a23cdf70c6d57e2d79 [file] [log] [blame]
R David Murray0b6f6c82012-05-25 18:42:14 -04001"""Header value parser implementing various email-related RFC parsing rules.
2
The parsing methods defined in this module implement various email related
parsing rules.  Principal among them is RFC 5322, which is the follow-on
to RFC 2822 and primarily a clarification of the former.  It also implements
RFC 2047 encoded word decoding.
7
8RFC 5322 goes to considerable trouble to maintain backward compatibility with
9RFC 822 in the parse phase, while cleaning up the structure on the generation
10phase. This parser supports correct RFC 5322 generation by tagging white space
11as folding white space only when folding is allowed in the non-obsolete rule
12sets. Actually, the parser is even more generous when accepting input than RFC
135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
14Where possible deviations from the standard are annotated on the 'defects'
15attribute of tokens that deviate.
16
17The general structure of the parser follows RFC 5322, and uses its terminology
18where there is a direct correspondence. Where the implementation requires a
19somewhat different structure than that used by the formal grammar, new terms
20that mimic the closest existing terms are used. Thus, it really helps to have
21a copy of RFC 5322 handy when studying this code.
22
23Input to the parser is a string that has already been unfolded according to
24RFC 5322 rules. According to the RFC this unfolding is the very first step, and
25this parser leaves the unfolding step to a higher level message parser, which
26will have already detected the line breaks that need unfolding while
27determining the beginning and end of each header.
28
29The output of the parser is a TokenList object, which is a list subclass. A
30TokenList is a recursive data structure. The terminal nodes of the structure
31are Terminal objects, which are subclasses of str. These do not correspond
32directly to terminal objects in the formal grammar, but are instead more
33practical higher level combinations of true terminals.
34
35All TokenList and Terminal objects have a 'value' attribute, which produces the
36semantically meaningful value of that part of the parse subtree. The value of
37all whitespace tokens (no matter how many sub-tokens they may contain) is a
38single space, as per the RFC rules. This includes 'CFWS', which is herein
39included in the general class of whitespace tokens. There is one exception to
40the rule that whitespace tokens are collapsed into single spaces in values: in
41the value of a 'bare-quoted-string' (a quoted-string with no leading or
42trailing whitespace), any whitespace that appeared between the quotation marks
43is preserved in the returned value. Note that in all Terminal strings quoted
44pairs are turned into their unquoted values.
45
46All TokenList and Terminal objects also have a string value, which attempts to
47be a "canonical" representation of the RFC-compliant form of the substring that
48produced the parsed subtree, including minimal use of quoted pair quoting.
49Whitespace runs are not collapsed.
50
51Comment tokens also have a 'content' attribute providing the string found
52between the parens (including any nested comments) with whitespace preserved.
53
All TokenList and Terminal objects have a 'defects' attribute, which is a
possibly empty list of all the defects found while creating the token.  Defects
may appear on any token in the tree, and a composite list of all defects in the
subtree is available through the 'all_defects' attribute of any node.  (For
Terminal nodes x.defects == x.all_defects.)
59
60Each object in a parse tree is called a 'token', and each has a 'token_type'
61attribute that gives the name from the RFC 5322 grammar that it represents.
62Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
63may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
64It is returned in place of lists of (ctext/quoted-pair) and
65(qtext/quoted-pair).
66
67XXX: provide complete list of token types.
68"""
69
70import re
R David Murray97f43c02012-06-24 05:03:27 -040071import urllib # For urllib.parse.unquote
R David Murray65171b22013-07-11 15:52:57 -040072from string import hexdigits
R David Murray7d0325d2015-03-29 21:53:05 -040073from operator import itemgetter
R David Murray0b6f6c82012-05-25 18:42:14 -040074from email import _encoded_words as _ew
75from email import errors
76from email import utils
77
78#
79# Useful constants and functions
80#
81
# Character sets consulted by the parser to decide where a token ends.
WSP = set(' \t')
CFWS_LEADER = WSP.union('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS.union(WSP)
DOT_ATOM_ENDS = ATOM_ENDS.difference('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS.difference('."(')
TSPECIALS = SPECIALS.union('/?=').difference('.')
TOKEN_ENDS = TSPECIALS.union(WSP)
ASPECIALS = TSPECIALS.union("*'%")
ATTRIBUTE_ENDS = ASPECIALS.union(WSP)
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS.difference('%')
R David Murray0b6f6c82012-05-25 18:42:14 -040094
def quote_string(value):
    """Return str(value) enclosed in double quotes.

    Backslashes are doubled first, then every double quote is preceded by a
    backslash, so the result can be read back as a quoted-string.
    """
    escaped = str(value).replace('\\', '\\\\')
    escaped = escaped.replace('"', '\\"')
    return '"{}"'.format(escaped)
97
98#
R David Murray0b6f6c82012-05-25 18:42:14 -040099# TokenList and its subclasses
100#
101
class TokenList(list):
    """A parse-tree node: a list whose items are tokens.

    Items are expected to expose the same attributes this class does
    ('value', 'all_defects', 'comments', 'as_ew_allowed', '_pp', ...); most
    members here simply aggregate the same-named member of every item.
    """

    token_type = None
    syntactic_break = True
    ew_combine_allowed = True

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        # Defects registered on this node itself; items carry their own.
        self.defects = []

    def __str__(self):
        """Return the concatenation of str() of every item."""
        return ''.join(str(x) for x in self)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                               super().__repr__())

    @property
    def value(self):
        """Concatenation of the 'value' of every item whose value is truthy."""
        return ''.join(x.value for x in self if x.value)

    @property
    def all_defects(self):
        """Return a new list: this node's defects, then each item's all_defects.

        A fresh list is always returned, so mutating the result never alters
        self.defects (previously an item-less node handed back the live
        list).  Building with extend() also avoids the repeated list copying
        of sum() over lists.
        """
        defects = list(self.defects)
        for token in self:
            defects.extend(token.all_defects)
        return defects

    def startswith_fws(self):
        """Delegate to the first item; raises IndexError on an empty list."""
        return self[0].startswith_fws()

    @property
    def as_ew_allowed(self):
        """True if all top level tokens of this part may be RFC2047 encoded."""
        return all(part.as_ew_allowed for part in self)

    @property
    def comments(self):
        """Flat list of the 'comments' of every item, in order."""
        comments = []
        for token in self:
            comments.extend(token.comments)
        return comments

    def fold(self, *, policy):
        """Return the result of _refold_parse_tree(self, policy=policy)."""
        return _refold_parse_tree(self, policy=policy)

    def pprint(self, indent=''):
        """Print the text produced by ppstr()."""
        print(self.ppstr(indent=indent))

    def ppstr(self, indent=''):
        """Return the lines produced by _pp() joined with newlines."""
        return '\n'.join(self._pp(indent=indent))

    def _pp(self, indent=''):
        """Yield the lines of an indented dump of this node and its items."""
        yield '{}{}/{}('.format(
            indent,
            self.__class__.__name__,
            self.token_type)
        for token in self:
            # Anything without _pp is not a token; report it instead of
            # failing so a corrupted tree can still be inspected.
            if not hasattr(token, '_pp'):
                yield (indent + ' !! invalid element in token '
                       'list: {!r}'.format(token))
            else:
                yield from token._pp(indent+' ')
        if self.defects:
            extra = ' Defects: {}'.format(self.defects)
        else:
            extra = ''
        yield '{}){}'.format(indent, extra)
167
168
class WhiteSpaceTokenList(TokenList):
    """Token list whose 'value' is always a single space."""

    @property
    def value(self):
        return ' '

    @property
    def comments(self):
        """Return the 'content' of each direct item of type 'comment'."""
        found = []
        for tok in self:
            if tok.token_type == 'comment':
                found.append(tok.content)
        return found
177 return [x.content for x in self if x.token_type=='comment']
178
179
class UnstructuredTokenList(TokenList):
    """TokenList tagged with token_type 'unstructured'."""

    token_type = 'unstructured'
183
R David Murray0b6f6c82012-05-25 18:42:14 -0400184
class Phrase(TokenList):
    """TokenList tagged with token_type 'phrase'."""

    token_type = 'phrase'
188
class Word(TokenList):
    """TokenList tagged with token_type 'word'."""

    token_type = 'word'
192
193
class CFWSList(WhiteSpaceTokenList):
    """Whitespace token list tagged with token_type 'cfws'."""

    token_type = 'cfws'
197
R David Murray0b6f6c82012-05-25 18:42:14 -0400198
class Atom(TokenList):
    """TokenList tagged with token_type 'atom'."""

    token_type = 'atom'
202
203
class Token(TokenList):
    """TokenList tagged with token_type 'token'."""

    token_type = 'token'
    # NOTE(review): presumably consulted when folding; no reader is visible
    # in this part of the file.
    encode_as_ew = False
R David Murray97f43c02012-06-24 05:03:27 -0400208
209
class EncodedWord(TokenList):
    """TokenList tagged with token_type 'encoded-word'.

    The class-level defaults below are overwritten per instance by
    get_encoded_word().
    """

    token_type = 'encoded-word'
    # NOTE: get_encoded_word() stores the raw input string it was handed
    # here, not just the transfer-encoding letter.
    cte = None
    charset = None
    lang = None
216
R David Murray0b6f6c82012-05-25 18:42:14 -0400217
class QuotedString(TokenList):
    """A quoted-string node, normally wrapping one 'bare-quoted-string'."""

    token_type = 'quoted-string'

    @property
    def content(self):
        """Value of the first 'bare-quoted-string' item, or None if absent."""
        bare = next(
            (tok for tok in self if tok.token_type == 'bare-quoted-string'),
            None)
        return None if bare is None else bare.value

    @property
    def quoted_value(self):
        """Join str() of bare quoted strings with the value of other items."""
        return ''.join(
            str(tok) if tok.token_type == 'bare-quoted-string' else tok.value
            for tok in self)

    @property
    def stripped_value(self):
        """Value of the first 'bare-quoted-string' item, or None if absent."""
        bare = next(
            (tok for tok in self if tok.token_type == 'bare-quoted-string'),
            None)
        return None if bare is None else bare.value
243
R David Murray0b6f6c82012-05-25 18:42:14 -0400244
class BareQuotedString(QuotedString):
    """The text between the quotation marks of a quoted-string."""

    token_type = 'bare-quoted-string'

    def __str__(self):
        """Return the joined items re-quoted with quote_string()."""
        inner = ''.join(map(str, self))
        return quote_string(inner)

    @property
    def value(self):
        """Return str() of every item joined, without surrounding quotes."""
        return ''.join(map(str, self))
255
256
class Comment(WhiteSpaceTokenList):
    """A parenthesised comment; items may themselves be Comment nodes."""

    token_type = 'comment'

    def __str__(self):
        """Return '(' + the quoted form of every item + ')'."""
        pieces = ['(']
        pieces.extend(self.quote(tok) for tok in self)
        pieces.append(')')
        return ''.join(pieces)

    def quote(self, value):
        """Return str(value), backslash-escaping '\\', '(' and ')'.

        Nested comments are returned unescaped, since their own __str__
        already supplies the parentheses.
        """
        if value.token_type == 'comment':
            return str(value)
        text = str(value)
        # Backslash must be handled first so the escapes added for the
        # parentheses are not doubled.
        for char in ('\\', '(', ')'):
            text = text.replace(char, '\\' + char)
        return text

    @property
    def content(self):
        """Return str() of every item joined, without the outer parens."""
        return ''.join(map(str, self))

    @property
    def comments(self):
        return [self.content]
282
class AddressList(TokenList):
    """An address-list node; only items of type 'address' are significant."""

    token_type = 'address-list'

    @property
    def addresses(self):
        """List of the direct items whose token_type is 'address'."""
        return [tok for tok in self if tok.token_type == 'address']

    @property
    def mailboxes(self):
        """Concatenated 'mailboxes' lists of every 'address' item."""
        collected = []
        for tok in self:
            if tok.token_type == 'address':
                collected = collected + tok.mailboxes
        return collected

    @property
    def all_mailboxes(self):
        """Concatenated 'all_mailboxes' lists of every 'address' item."""
        collected = []
        for tok in self:
            if tok.token_type == 'address':
                collected = collected + tok.all_mailboxes
        return collected
300
301
class Address(TokenList):
    """An address node; every property inspects only the first item."""

    token_type = 'address'

    @property
    def display_name(self):
        """Display name of a leading 'group' item, otherwise None."""
        head = self[0]
        if head.token_type != 'group':
            return None
        return head.display_name

    @property
    def mailboxes(self):
        """Valid mailboxes: [mailbox], [] for an invalid one, else delegate."""
        head = self[0]
        kind = head.token_type
        if kind == 'mailbox':
            return [head]
        if kind == 'invalid-mailbox':
            return []
        return head.mailboxes

    @property
    def all_mailboxes(self):
        """Like 'mailboxes', but an invalid mailbox is included."""
        head = self[0]
        if head.token_type in ('mailbox', 'invalid-mailbox'):
            return [head]
        return head.all_mailboxes
326
class MailboxList(TokenList):
    """A mailbox-list node."""

    token_type = 'mailbox-list'

    @property
    def mailboxes(self):
        """Direct items whose token_type is 'mailbox'."""
        valid = []
        for tok in self:
            if tok.token_type == 'mailbox':
                valid.append(tok)
        return valid

    @property
    def all_mailboxes(self):
        """Direct items that are either 'mailbox' or 'invalid-mailbox'."""
        wanted = ('mailbox', 'invalid-mailbox')
        return [tok for tok in self if tok.token_type in wanted]
339
340
class GroupList(TokenList):
    """A group-list node; mailboxes come from a leading 'mailbox-list'."""

    token_type = 'group-list'

    @property
    def mailboxes(self):
        """'mailboxes' of a leading mailbox-list item, else an empty list."""
        if self and self[0].token_type == 'mailbox-list':
            return self[0].mailboxes
        return []

    @property
    def all_mailboxes(self):
        """'all_mailboxes' of a leading mailbox-list item, else []."""
        if self and self[0].token_type == 'mailbox-list':
            return self[0].all_mailboxes
        return []
356
357
class Group(TokenList):
    """A group node: item 0 supplies the name, item 2 the group-list."""

    token_type = "group"

    @property
    def mailboxes(self):
        """'mailboxes' of item 2 when it is a 'group-list', else []."""
        members = self[2]
        if members.token_type == 'group-list':
            return members.mailboxes
        return []

    @property
    def all_mailboxes(self):
        """'all_mailboxes' of item 2 when it is a 'group-list', else []."""
        members = self[2]
        if members.token_type == 'group-list':
            return members.all_mailboxes
        return []

    @property
    def display_name(self):
        name_token = self[0]
        return name_token.display_name
377
378
class NameAddr(TokenList):
    """A name-addr node: optional display name, then the address item."""

    token_type = 'name-addr'

    @property
    def display_name(self):
        """Display name of the first item, or None for a one-item node."""
        return self[0].display_name if len(self) != 1 else None

    # The remaining properties all read from the last item.

    @property
    def local_part(self):
        address = self[-1]
        return address.local_part

    @property
    def domain(self):
        address = self[-1]
        return address.domain

    @property
    def route(self):
        address = self[-1]
        return address.route

    @property
    def addr_spec(self):
        address = self[-1]
        return address.addr_spec
404
405
class AngleAddr(TokenList):
    """An angle-addr node ('<' ... '>')."""

    token_type = 'angle-addr'

    @property
    def local_part(self):
        """local_part of the first 'addr-spec' item, or None if absent."""
        spec = next(
            (tok for tok in self if tok.token_type == 'addr-spec'), None)
        return None if spec is None else spec.local_part

    @property
    def domain(self):
        """domain of the first 'addr-spec' item, or None if absent."""
        spec = next(
            (tok for tok in self if tok.token_type == 'addr-spec'), None)
        return None if spec is None else spec.domain

    @property
    def route(self):
        """'domains' of the first 'obs-route' item, or None if absent."""
        obs = next(
            (tok for tok in self if tok.token_type == 'obs-route'), None)
        return None if obs is None else obs.domains

    @property
    def addr_spec(self):
        """addr_spec of the first 'addr-spec' item, or '<>' if there is none.

        When that item's local part is falsy, the quoted (empty) local part
        is prepended to its addr_spec.
        """
        for tok in self:
            if tok.token_type != 'addr-spec':
                continue
            if tok.local_part:
                return tok.addr_spec
            return quote_string(tok.local_part) + tok.addr_spec
        return '<>'
R David Murray0b6f6c82012-05-25 18:42:14 -0400438
439
class ObsRoute(TokenList):
    """An obs-route node."""

    token_type = 'obs-route'

    @property
    def domains(self):
        """The 'domain' attribute of each direct item of type 'domain'."""
        names = []
        for tok in self:
            if tok.token_type == 'domain':
                names.append(tok.domain)
        return names
447
448
class Mailbox(TokenList):
    """A mailbox node; every property is read from the first item."""

    token_type = 'mailbox'

    @property
    def display_name(self):
        """Display name when the first item is a 'name-addr', else None."""
        head = self[0]
        return head.display_name if head.token_type == 'name-addr' else None

    @property
    def local_part(self):
        head = self[0]
        return head.local_part

    @property
    def domain(self):
        head = self[0]
        return head.domain

    @property
    def route(self):
        """Route when the first item is a 'name-addr', else None."""
        head = self[0]
        return head.route if head.token_type == 'name-addr' else None

    @property
    def addr_spec(self):
        head = self[0]
        return head.addr_spec
474
475
class InvalidMailbox(TokenList):
    """A mailbox that failed to parse; every address attribute is None."""

    token_type = 'invalid-mailbox'

    @property
    def display_name(self):
        return None

    # All of these share the one property object defined above.
    local_part = display_name
    domain = display_name
    route = display_name
    addr_spec = display_name
485
486
class Domain(TokenList):
    """A domain node."""

    token_type = 'domain'
    as_ew_allowed = False

    @property
    def domain(self):
        """The inherited 'value' with all whitespace removed."""
        pieces = super().value.split()
        return ''.join(pieces)
495
496
class DotAtom(TokenList):
    """TokenList tagged with token_type 'dot-atom'."""

    token_type = 'dot-atom'
500
501
class DotAtomText(TokenList):
    """TokenList tagged with token_type 'dot-atom-text'."""

    token_type = 'dot-atom-text'
    # Overrides the computed TokenList.as_ew_allowed property with a constant.
    as_ew_allowed = True
R David Murray0b6f6c82012-05-25 18:42:14 -0400506
507
class AddrSpec(TokenList):
    """An addr-spec node: local part, optionally followed by '@' and domain."""

    token_type = 'addr-spec'
    as_ew_allowed = False

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        """Domain of the last item, or None when fewer than three items."""
        return self[-1].domain if len(self) >= 3 else None

    @property
    def value(self):
        """First item's value alone, or the three parts joined tightly.

        With three or more items, whitespace is stripped from the inner
        edges of the first and third values.
        """
        if len(self) >= 3:
            left = self[0].value.rstrip()
            right = self[2].value.lstrip()
            return left + self[1].value + right
        return self[0].value

    @property
    def addr_spec(self):
        """Return local part (quoted if needed) plus '@domain' when present.

        The local part is quoted when it contains any character from
        DOT_ATOM_ENDS.
        """
        chars = set(self.local_part)
        if chars.isdisjoint(DOT_ATOM_ENDS):
            local = self.local_part
        else:
            local = quote_string(self.local_part)
        if self.domain is None:
            return local
        return local + '@' + self.domain
539
540
class ObsLocalPart(TokenList):
    """TokenList tagged with token_type 'obs-local-part'."""

    token_type = 'obs-local-part'
    # Overrides the computed TokenList.as_ew_allowed property with a constant.
    as_ew_allowed = False
R David Murray0b6f6c82012-05-25 18:42:14 -0400545
546
class DisplayName(Phrase):
    """A display-name node."""

    token_type = 'display-name'
    ew_combine_allowed = False

    @property
    def display_name(self):
        """Return the value with leading and trailing CFWS removed.

        CFWS is dropped whether it is a direct first/last item or the
        first/last element nested inside the first/last item.  Works on a
        copy, so the node itself is not modified.
        """
        res = TokenList(self)
        if res[0].token_type == 'cfws':
            res.pop(0)
        elif res[0][0].token_type == 'cfws':
            res[0] = TokenList(res[0][1:])
        if res[-1].token_type == 'cfws':
            res.pop()
        elif res[-1][-1].token_type == 'cfws':
            res[-1] = TokenList(res[-1][:-1])
        return res.value

    @property
    def value(self):
        """Return the inherited value, or a re-quoted form when needed.

        Quoting is used when this node has defects or contains a
        'quoted-string' item; a single space is kept on either side where
        the original had leading/trailing CFWS.
        """
        needs_quoting = bool(self.defects)
        if not needs_quoting:
            needs_quoting = any(
                tok.token_type == 'quoted-string' for tok in self)
        if not needs_quoting:
            return super().value
        first, last = self[0], self[-1]
        pre = ''
        if first.token_type == 'cfws' or first[0].token_type == 'cfws':
            pre = ' '
        post = ''
        if last.token_type == 'cfws' or last[-1].token_type == 'cfws':
            post = ' '
        return pre + quote_string(self.display_name) + post
585
586
class LocalPart(TokenList):
    """A local-part node; only the first item is consulted."""

    token_type = 'local-part'
    as_ew_allowed = False

    @property
    def value(self):
        """quoted_value of a leading quoted-string, else the item's value."""
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        """Return the first item's value with CFWS around dots removed."""
        # Strip whitespace from front, back, and around dots.
        # A DOT sentinel is placed at both ends so the front and back are
        # handled by the same "next to a dot" rules; both are sliced off
        # again before the value is taken.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            if tok.token_type == 'cfws':
                continue
            # A dot follows a token list ending in CFWS: replace the stored
            # copy with one lacking that trailing CFWS.
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            # A token list starting with CFWS follows a dot: store a copy
            # without the leading CFWS.
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        res = TokenList(res[1:-1])
        return res.value
621
622
class DomainLiteral(TokenList):
    """A domain-literal node."""

    token_type = 'domain-literal'
    as_ew_allowed = False

    @property
    def domain(self):
        """The inherited 'value' with all whitespace removed."""
        pieces = super().value.split()
        return ''.join(pieces)

    @property
    def ip(self):
        """Value of the first 'ptext' item, or None if there is none."""
        ptext = next((tok for tok in self if tok.token_type == 'ptext'), None)
        return None if ptext is None else ptext.value
637
638
class MIMEVersion(TokenList):
    """TokenList tagged with token_type 'mime-version'."""

    token_type = 'mime-version'
    # Class-level defaults; NOTE(review): presumably set per instance by the
    # parser, which is not visible in this part of the file.
    major = None
    minor = None
644
645
class Parameter(TokenList):
    """A single MIME parameter (attribute, optional section, value)."""

    token_type = 'parameter'
    sectioned = False
    extended = False
    charset = 'us-ascii'

    @property
    def section_number(self):
        """Number of the section item, or 0 when the parameter is unsectioned."""
        # Because the first token, the attribute (name) eats CFWS, the second
        # token is always the section if there is one.
        if not self.sectioned:
            return 0
        return self[1].number

    @property
    def param_value(self):
        """Return the stripped value of the first 'value' token found.

        A 'value' is looked for among the direct items and, inside any
        'quoted-string' item, within its 'bare-quoted-string' children.
        Returns '' when none is found.
        """
        # This is part of the "handle quoted extended parameters" hack.
        for tok in self:
            if tok.token_type == 'value':
                return tok.stripped_value
            if tok.token_type != 'quoted-string':
                continue
            for inner in tok:
                if inner.token_type != 'bare-quoted-string':
                    continue
                for leaf in inner:
                    if leaf.token_type == 'value':
                        return leaf.stripped_value
        return ''
672
673
class InvalidParameter(Parameter):
    """Parameter tagged with token_type 'invalid-parameter'."""

    token_type = 'invalid-parameter'
677
678
class Attribute(TokenList):
    """An attribute (parameter name) node."""

    token_type = 'attribute'

    @property
    def stripped_value(self):
        """Value of the first item whose type ends in 'attrtext', or None."""
        text = next(
            (tok for tok in self if tok.token_type.endswith('attrtext')),
            None)
        return None if text is None else text.value
688
class Section(TokenList):
    """TokenList tagged with token_type 'section'."""

    token_type = 'section'
    # Read by Parameter.section_number; None until assigned on an instance.
    number = None
693
694
class Value(TokenList):
    """A parameter value node."""

    token_type = 'value'

    @property
    def stripped_value(self):
        """Return the significant token's stripped value, or self.value.

        The significant token is the first item, or the second when the
        first is CFWS.  Its own stripped_value is used when its type ends in
        'quoted-string', 'attribute' or 'extended-attribute'.
        """
        lead = self[0]
        if lead.token_type == 'cfws':
            lead = self[1]
        delegating = ('quoted-string', 'attribute', 'extended-attribute')
        if not lead.token_type.endswith(delegating):
            return self.value
        return lead.stripped_value
708
709
class MimeParameters(TokenList):
    """The parameter section of a parameterized header value."""

    token_type = 'mime-parameters'
    syntactic_break = False

    @property
    def params(self):
        """Yield (name, value) pairs, reassembling RFC 2231 sections.

        This is a generator.  As a side effect it appends defects to the
        parameter tokens it finds problems with (duplicates, inconsistent
        section numbering, undecodable bytes).
        """
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = {}  # Using order preserving dict from Python 3.7+
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            # Stable sort: parts with equal section numbers keep the order
            # in which they appeared.
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            # Next section number expected.
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        # Unquoting produced bytes: decode them with the
                        # charset taken from the first section.
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        """Return ' name="value"; ...' built from params, or '' if empty.

        A parameter whose value is empty is rendered as the bare name.
        """
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''
794
795
class ParameterizedHeaderValue(TokenList):
    """Base for header values that may end with a 'mime-parameters' item."""

    # Set this false so that the value doesn't wind up on a new line even
    # if it and the parameters would fit there but not on the first line.
    syntactic_break = False

    @property
    def params(self):
        """'params' of the last 'mime-parameters' item, or {} if none."""
        # Scan from the end of the list.
        for token in reversed(self):
            if token.token_type != 'mime-parameters':
                continue
            return token.params
        return {}
808
R David Murray97f43c02012-06-24 05:03:27 -0400809
class ContentType(ParameterizedHeaderValue):
    """Parameterized header value tagged with token_type 'content-type'."""

    token_type = 'content-type'
    as_ew_allowed = False
    # Class-level defaults for the media type.
    maintype = 'text'
    subtype = 'plain'
816
817
class ContentDisposition(ParameterizedHeaderValue):
    """Parameterized header value tagged 'content-disposition'."""

    token_type = 'content-disposition'
    as_ew_allowed = False
    # Class-level default; None until assigned on an instance.
    content_disposition = None
823
824
class ContentTransferEncoding(TokenList):
    """TokenList tagged with token_type 'content-transfer-encoding'."""

    token_type = 'content-transfer-encoding'
    as_ew_allowed = False
    # Class-level default encoding name.
    cte = '7bit'
830
831
class HeaderLabel(TokenList):
    """TokenList tagged with token_type 'header-label'."""

    token_type = 'header-label'
    # Overrides the computed TokenList.as_ew_allowed property with a constant.
    as_ew_allowed = False
R David Murray0b6f6c82012-05-25 18:42:14 -0400836
837
class Header(TokenList):
    """TokenList tagged with token_type 'header'."""

    token_type = 'header'
841
R David Murray0b6f6c82012-05-25 18:42:14 -0400842
843#
844# Terminal classes and instances
845#
846
class Terminal(str):
    """A leaf token: a str that also carries a token_type and defects."""

    as_ew_allowed = True
    ew_combine_allowed = True
    syntactic_break = True

    def __new__(cls, value, token_type):
        instance = super().__new__(cls, value)
        instance.token_type = token_type
        instance.defects = []
        return instance

    def __repr__(self):
        return f"{type(self).__name__}({super().__repr__()})"

    def pprint(self):
        """Print 'ClassName/token_type'."""
        print(type(self).__name__ + '/' + self.token_type)

    @property
    def all_defects(self):
        """A copy of this token's defects list."""
        return list(self.defects)

    def _pp(self, indent=''):
        """Return a one-element list holding this token's dump line."""
        suffix = ' {}'.format(self.defects) if self.defects else ''
        line = "{}{}/{}({}){}".format(
            indent,
            type(self).__name__,
            self.token_type,
            super().__repr__(),
            suffix,
        )
        return [line]

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def __getnewargs__(self):
        """Arguments for __new__, so copies and pickles can be rebuilt."""
        return str(self), self.token_type
888
889
class WhiteSpaceTerminal(Terminal):
    """Terminal holding whitespace; its value is always a single space."""

    @property
    def value(self):
        return ' '

    def startswith_fws(self):
        return True
898
R David Murray0b6f6c82012-05-25 18:42:14 -0400899
class ValueTerminal(Terminal):
    """Terminal whose value is the token itself."""

    @property
    def value(self):
        return self

    def startswith_fws(self):
        return False
908
R David Murray0b6f6c82012-05-25 18:42:14 -0400909
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """Whitespace terminal that contributes nothing to value or str()."""

    @property
    def value(self):
        return ''

    def __str__(self):
        return ''
918
R David Murray0b6f6c82012-05-25 18:42:14 -0400919
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
# Shared module-level punctuation tokens; the same object is reused wherever
# one is needed (e.g. LocalPart.local_part uses DOT as a sentinel).
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
926
927#
928# Parser
929#
930
Victor Stinner765531d2013-03-26 01:11:54 +0100931# Parse strings according to RFC822/2047/2822/5322 rules.
932#
933# This is a stateless parser. Each get_XXX function accepts a string and
934# returns either a Terminal or a TokenList representing the RFC object named
935# by the method and a string containing the remaining unparsed characters
936# from the input. Thus a parser method consumes the next syntactic construct
937# of a given type and returns a token representing the construct plus the
938# unparsed remainder of the input string.
939#
940# For example, if the first element of a structured header is a 'phrase',
941# then:
942#
943# phrase, value = get_phrase(value)
944#
945# returns the complete phrase from the start of the string value, plus any
946# characters left in the string after the phrase is removed.
R David Murray0b6f6c82012-05-25 18:42:14 -0400947
# Pre-bound regex helpers.  The splitter breaks a string on runs of WSP
# (keeping the runs); each matcher matches a leading run of characters that
# are not in the corresponding *_ENDS set.
_wsp_splitter = re.compile('([' + ''.join(WSP) + ']+)').split
_non_atom_end_matcher = re.compile(
    '[^' + re.escape(''.join(ATOM_ENDS)) + ']+').match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(
    '[^' + re.escape(''.join(TOKEN_ENDS)) + ']+').match
_non_attribute_end_matcher = re.compile(
    '[^' + re.escape(''.join(ATTRIBUTE_ENDS)) + ']+').match
_non_extended_attribute_end_matcher = re.compile(
    '[^' + re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)) + ']+').match
R David Murray0b6f6c82012-05-25 18:42:14 -0400958
def _validate_xtext(xtext):
    """Record defects on xtext for non-printable or surrogate characters.

    Appends a NonPrintableDefect listing any characters matched by
    _non_printable_finder, and an UndecodableBytesDefect when
    utils._has_surrogates() reports surrogates.  Returns None.
    """
    offending = _non_printable_finder(xtext)
    if offending:
        xtext.defects.append(errors.NonPrintableDefect(offending))
    if not utils._has_surrogates(xtext):
        return
    xtext.defects.append(errors.UndecodableBytesDefect(
        "Non-ASCII characters found in header token"))
968
def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext-with-quoted-printables into a single string by unquoting any
    quoted printables.  Only the text up to the first run of whitespace is
    scanned.

    Returns a 3-tuple (ptext, remaining_value, had_qp).  remaining_value
    starts at the unescaped end character that stopped the scan, or at the
    whitespace if none was found.

    NOTE(review): had_qp is only set when an escaped backslash is seen, not
    for other escaped characters -- confirm whether callers rely on that.

    An empty value, or one that starts with whitespace, yields an empty
    ptext instead of raising UnboundLocalError.
    """
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False
    had_qp = False
    # Index of the first unconsumed character.  Defaults to "everything
    # consumed", which is also correct when fragment is empty and the loop
    # body never runs.
    end = len(fragment)
    for pos, char in enumerate(fragment):
        if char == '\\':
            if escape:
                escape = False
                had_qp = True
            else:
                # Start of a quoted pair: drop the backslash itself.
                escape = True
                continue
        if escape:
            # Escaped character: taken literally, even if it is an endchar.
            escape = False
        elif char in endchars:
            end = pos
            break
        vchars.append(char)
    return ''.join(vchars), ''.join([fragment[end:]] + remainder), had_qp
998
def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens where
    folding can be done, but when we are parsing the *un*folding has already
    been done so we don't need to watch out for CRLF.

    Returns (fws_token, rest) where fws_token holds the leading whitespace
    of value and rest is value with that whitespace removed.
    """
    rest = value.lstrip()
    ws_length = len(value) - len(rest)
    return WhiteSpaceTerminal(value[:ws_length], 'fws'), rest
1010
def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    Parse one RFC 2047 encoded word from the start of value.

    Returns (ew, rest): an EncodedWord whose items are the decoded text split
    into 'fws' and 'vtext' terminals, and the unparsed remainder of value.

    Raises errors.HeaderParseError if value does not start with '=?', has no
    closing '?=', or cannot be decoded -- including when the encoding letter
    is not one the decoder knows (previously that case escaped as a
    KeyError).
    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        # No '?=' terminator anywhere in the input.
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    # Keep the raw input for the error message below.
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except (ValueError, KeyError):
        # ValueError: wrong number of '?'-separated fields.
        # KeyError: the encoding letter has no registered decoder.
        raise errors.HeaderParseError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    # Split the decoded text into alternating whitespace and text tokens.
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    return ew, value
1052
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

       obs-NO-WS-CTL is control characters except WSP/CR/LF.

    The value is parsed into whitespace ('fws') tokens, RFC 2047 encoded
    words, and 'vtext' tokens for every other whitespace-delimited run.
    Non-printable characters are not split out of the runs they occur in.

    An unstructured value is by definition the whole value, so only the
    parsed TokenList is returned; there is no remaining value.

    """
    # XXX: bare CR and LF are not treated specially.  They might signal the
    # start or end of an encoded word, but the current callers never pass
    # strings containing them.

    tokens = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            ws, value = get_fws(value)
            tokens.append(ws)
            continue
        if value.startswith('=?'):
            try:
                word, value = get_encoded_word(value)
            except errors.HeaderParseError:
                # XXX: no defect is registered here yet; the text simply
                # falls through and is parsed as ordinary vtext below.
                pass
            else:
                preceded_by_ws = True
                if len(tokens) > 0 and tokens[-1].token_type != 'fws':
                    tokens.defects.append(errors.InvalidHeaderDefect(
                        "missing whitespace before encoded word"))
                    preceded_by_ws = False
                if (preceded_by_ws and len(tokens) > 1
                        and tokens[-2].token_type == 'encoded-word'):
                    # Whitespace between two encoded words gets its own
                    # terminal type.
                    tokens[-1] = EWWhiteSpaceTerminal(tokens[-1], 'fws')
                tokens.append(word)
                continue
        run, *remainder = _wsp_splitter(value, 1)
        text_token = ValueTerminal(run, 'vtext')
        _validate_xtext(text_token)
        tokens.append(text_token)
        value = ''.join(remainder)
    return tokens
1108
def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext: nested comments are handled by the caller and
    quoted-pairs are unquoted here.  Everything up to the next '(' or ')' is
    accepted and then passed through _validate_xtext, which records defects
    on the token.  Because quoted pairs are converted to their unquoted
    values, the result is a 'ptext' token, here a WhiteSpaceTerminal.

    Returns ``(ptext_token, remaining_value)``.

    """
    text, rest, _had_qp = _get_ptext_to_endchars(value, '()')
    token = WhiteSpaceTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, rest
1125
def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    Everything up to the next DQUOTE is accepted and then passed through
    _validate_xtext, which records defects on the token.  Quoted pairs are
    converted to their unquoted values, so the result is a 'ptext' token,
    here a ValueTerminal.

    Returns ``(ptext_token, remaining_value)``.

    """
    text, rest, _had_qp = _get_ptext_to_endchars(value, '"')
    token = ValueTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, rest
1140
def get_atext(value):
    """atext = <matches _atext_matcher>

    Whatever _non_atom_end_matcher matches at the start of value is accepted
    as atext and then passed through _validate_xtext, which records defects
    on the token.

    Returns ``(atext_token, remaining_value)``.  Raises HeaderParseError if
    nothing matches.
    """
    match = _non_atom_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    matched = match.group()
    rest = value[len(matched):]
    token = ValueTerminal(matched, 'atext')
    _validate_xtext(token)
    return token, rest
1156
def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.

    Returns ``(BareQuotedString, remaining_value)``.  Raises
    HeaderParseError if value is empty or does not start with DQUOTE.  If
    the closing DQUOTE is missing, an InvalidHeaderDefect is recorded and
    everything up to the end of value is taken as the string content.
    """
    if not value or value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    if value and value[0] == '"':
        # Empty quoted string: record an empty ptext token so the parse tree
        # still has content between the quotes.
        token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[:2] == '=?':
            try:
                token, value = get_encoded_word(value)
                bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
                    "encoded word inside quoted string"))
            except errors.HeaderParseError:
                # Not a decodable encoded word; treat it as ordinary content.
                token, value = get_qcontent(value)
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    if not value:
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]
1190
def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    Nested comments are parsed recursively; quoted-pairs are handled by
    get_qp_ctext.  Returns ``(Comment, remaining_value)``.  If the closing
    ')' is missing, an InvalidHeaderDefect is recorded on the comment.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    result = Comment()
    remaining = value[1:]
    while remaining and remaining[0] != ")":
        lead = remaining[0]
        if lead in WSP:
            piece, remaining = get_fws(remaining)
        elif lead == '(':
            piece, remaining = get_comment(remaining)
        else:
            piece, remaining = get_qp_ctext(remaining)
        result.append(piece)
    if remaining:
        # Drop the closing parenthesis.
        return result, remaining[1:]
    result.defects.append(errors.InvalidHeaderDefect(
        "end of header inside comment"))
    return result, remaining
1215
def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    Collect every leading run of whitespace and every leading comment into
    a CFWSList.  Returns ``(CFWSList, remaining_value)``.

    """
    collected = CFWSList()
    while value and value[0] in CFWS_LEADER:
        parse = get_fws if value[0] in WSP else get_comment
        piece, value = parse(value)
        collected.append(piece)
    return collected, value
1228
def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is an intermediate class defined by this
    parser and not by the RFC grammar.  It is the quoted string
    without any attached CFWS.

    Returns ``(QuotedString, remaining_value)``.
    """
    result = QuotedString()
    if value and value[0] in CFWS_LEADER:
        leading, value = get_cfws(value)
        result.append(leading)
    body, value = get_bare_quoted_string(value)
    result.append(body)
    if value and value[0] in CFWS_LEADER:
        trailing, value = get_cfws(value)
        result.append(trailing)
    return result, value
1246
def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]

    An atom could be an rfc2047 encoded word; if the text starts with '=?'
    but does not parse as one, it is parsed as plain atext instead.

    Returns ``(Atom, remaining_value)``.  Raises HeaderParseError if the
    text after any leading CFWS starts with an ATOM_ENDS character.
    """
    result = Atom()
    if value and value[0] in CFWS_LEADER:
        leading, value = get_cfws(value)
        result.append(leading)
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    if not value.startswith('=?'):
        core, value = get_atext(value)
    else:
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core, value = get_atext(value)
    result.append(core)
    if value and value[0] in CFWS_LEADER:
        trailing, value = get_cfws(value)
        result.append(trailing)
    return result, value
1273
def get_dot_atom_text(value):
    """ dot-text = 1*atext *("." 1*atext)

    Returns ``(DotAtomText, remaining_value)``.  Raises HeaderParseError if
    value is empty, starts with an ATOM_ENDS character, or the parsed text
    ends with a '.'.

    """
    result = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at a start of "
            "dot-atom-text but found '{}'".format(value))
    while value and value[0] not in ATOM_ENDS:
        piece, value = get_atext(value)
        result.append(piece)
        if value and value[0] == '.':
            result.append(DOT)
            value = value[1:]
    if result[-1] is DOT:
        # The final '.' was consumed but no atext followed it.
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return result, value
1292
def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    Any place we can have a dot atom, we could instead have an rfc2047
    encoded word; text starting with '=?' that does not parse as one is
    parsed as dot-atom-text instead.

    Returns ``(DotAtom, remaining_value)``.
    """
    result = DotAtom()
    if value[0] in CFWS_LEADER:
        leading, value = get_cfws(value)
        result.append(leading)
    if not value.startswith('=?'):
        core, value = get_dot_atom_text(value)
    else:
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core, value = get_dot_atom_text(value)
    result.append(core)
    if value and value[0] in CFWS_LEADER:
        trailing, value = get_cfws(value)
        result.append(trailing)
    return result, value
1317
def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off this
    CFWS first to determine which type of word to parse.  Afterward we splice
    the leading CFWS, if any, into the parsed sub-token.

    If neither an atom or a quoted-string is found before the next special, a
    HeaderParseError is raised.  The same error is raised when value is empty
    or consists only of CFWS.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in the
    parse tree; this is because having that extra layer when manipulating the
    parse tree is more confusing than it is helpful.

    """
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        # Nothing (or only CFWS) to parse: report it as a parse error so
        # callers that recover from HeaderParseError can do so.
        raise errors.HeaderParseError(
            "Expected 'atom' or 'quoted-string' but found nothing.")
    if value[0] == '"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    if leader is not None:
        token[:0] = [leader]
    return token, value
1348
def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    A phrase is a sequence of words, periods and CFWS that should start
    with a word.  Periods and stand-alone CFWS are accepted but flagged with
    an ObsoleteHeaderDefect.  A phrase that does not start with a word is
    also accepted and flagged with an InvalidHeaderDefect, since even the
    obsolete grammar does not allow it.

    Returns ``(Phrase, remaining_value)``; parsing stops at the first
    PHRASE_ENDS character.

    """
    result = Phrase()
    try:
        first, value = get_word(value)
    except errors.HeaderParseError:
        result.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    else:
        result.append(first)
    while value and value[0] not in PHRASE_ENDS:
        if value[0] == '.':
            result.append(DOT)
            result.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
            continue
        try:
            piece, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            # CFWS that is not attached to any word.
            piece, value = get_cfws(value)
            result.defects.append(errors.ObsoleteHeaderDefect(
                "comment found without atom"))
        result.append(piece)
    return result, value
1386
def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    Returns ``(LocalPart, remaining_value)``.  A dot-atom is tried first,
    then a word; if text that does not end the local part still follows,
    everything is re-parsed as an obs-local-part and a defect is recorded
    (Obsolete if the obs form is valid, Invalid otherwise).  Non-ASCII
    content adds a NonASCIILocalPartDefect.

    Raises HeaderParseError if value is empty, consists only of CFWS, or
    starts with a character that cannot begin a local part.

    """
    local_part = LocalPart()
    leader = None
    # Guard the index so an empty value reaches the error below instead of
    # failing with IndexError.
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            # Let the obs-local-part parser below deal with what follows.
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        # Re-parse what was consumed so far plus the rest as obs-local-part.
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters)"))
    return local_part, value
1425
def get_obs_local_part(value):
    """ obs-local-part = word *("." word)

    Returns ``(ObsLocalPart, remaining_value)``.  Parsing is lenient:
    repeated, leading or trailing dots, missing dots between words, and
    stray backslashes are all accepted but recorded as InvalidHeaderDefects.
    If any defect was recorded the token_type becomes
    'invalid-obs-local-part'.

    Raises HeaderParseError if nothing at all could be parsed, or if text is
    found that is neither a word nor CFWS.
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    if not obs_local_part:
        # Nothing was consumed; the index checks below would fail.
        raise errors.HeaderParseError(
            "expected obs-local-part but found '{}'".format(value))
    # The length guards keep a lone CFWS token from indexing out of range.
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type=='cfws' and
            len(obs_local_part) > 1 and
            obs_local_part[1].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type=='cfws' and
            len(obs_local_part) > 1 and
            obs_local_part[-2].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value
1472
def get_dtext(value):
    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    Everything up to the next '[' or ']' is accepted and then passed through
    _validate_xtext, which records defects on the token.  Quoted pairs are
    converted to their unquoted values, so the result is a 'ptext' token,
    here a ValueTerminal.  If _get_ptext_to_endchars reports quoted pairs,
    an ObsoleteHeaderDefect is added to the returned token's defect list.

    Returns ``(ptext_token, remaining_value)``.

    """
    text, rest, had_qp = _get_ptext_to_endchars(value, '[]')
    token = ValueTerminal(text, 'ptext')
    if had_qp:
        token.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(token)
    return token, rest
1492
def _check_for_early_dl_end(value, domain_literal):
    """Close domain_literal if the input ran out before its ']'.

    Returns False, doing nothing, when value is non-empty.  Otherwise it
    records an InvalidHeaderDefect on domain_literal, appends the missing
    'domain-literal-end' token, and returns True.
    """
    if value:
        return False
    # The defect belongs in .defects; appending it to the token list itself
    # would put an exception object in the parse tree.
    domain_literal.defects.append(errors.InvalidHeaderDefect(
        "end of input inside domain-literal"))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    return True
1500
def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    Returns ``(DomainLiteral, remaining_value)``.  If the input ends before
    the closing ']', the literal is closed by _check_for_early_dl_end and
    returned as parsed so far.

    Raises HeaderParseError if, after optional CFWS, the value is empty or
    does not start with '[', or if something other than ']' follows the
    dtext.

    """
    domain_literal = DomainLiteral()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
                "but found '{}'".format(value))
    value = value[1:]
    # Record the opening bracket before any early-end check so the parse
    # tree keeps the '[' that was consumed.
    domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
        # Input may end right after the whitespace ("[ "); there is no dtext
        # left to parse in that case.
        if _check_for_early_dl_end(value, domain_literal):
            return domain_literal, value
    token, value = get_dtext(value)
    domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
                "but found '{}'".format(value))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    return domain_literal, value
1539
def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom))

    Returns ``(Domain, remaining_value)``.  A domain that needs the obsolete
    form (atoms separated by '.' with CFWS in between) is accepted and
    flagged with an ObsoleteHeaderDefect.

    Raises HeaderParseError if value is empty or only CFWS, if no domain can
    be parsed, or if the parsed domain is immediately followed by '@'.

    """
    domain = Domain()
    leader = None
    # Guard the index so an empty value reaches the error below instead of
    # failing with IndexError (e.g. an addr-spec ending in '@').
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    if value[0] == '[':
        token, value = get_domain_literal(value)
        if leader is not None:
            token[:0] = [leader]
        domain.append(token)
        return domain, value
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        token, value = get_atom(value)
    if value and value[0] == '@':
        # A second '@' means this was not really the domain; accepting it
        # would silently truncate addresses such as "a@b@c".
        raise errors.HeaderParseError('Invalid Domain')
    if leader is not None:
        token[:0] = [leader]
    domain.append(token)
    if value and value[0] == '.':
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            # Flatten so the obsolete parts sit next to the pieces already
            # parsed.
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            token, value = get_atom(value[1:])
            domain.append(token)
    return domain, value
1575
def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    Returns ``(AddrSpec, remaining_value)``.  If no '@' follows the local
    part, an InvalidHeaderDefect is recorded and the addr-spec consists of
    the local part alone.

    """
    spec = AddrSpec()
    local, value = get_local_part(value)
    spec.append(local)
    if value and value[0] == '@':
        spec.append(ValueTerminal('@', 'address-at-symbol'))
        domain, value = get_domain(value[1:])
        spec.append(domain)
        return spec, value
    spec.defects.append(errors.InvalidHeaderDefect(
        "add-spec local part with no domain"))
    return spec, value
1591
def get_obs_route(value):
    """ obs-route = obs-domain-list ":"
        obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])

        Returns an obs-route token with the appropriate sub-tokens (that is,
        there is no obs-domain-list in the parse tree), together with the
        remaining value after the terminating ':'.

        Raises HeaderParseError if the first route domain is missing, if the
        input ends before the ':', or if something other than ':' follows
        the domain list.
    """
    obs_route = ObsRoute()
    while value and (value[0]==',' or value[0] in CFWS_LEADER):
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        elif value[0] == ',':
            obs_route.append(ListSeparator)
            value = value[1:]
    if not value or value[0] != '@':
        raise errors.HeaderParseError(
            "expected obs-route domain but found '{}'".format(value))
    obs_route.append(RouteComponentMarker)
    token, value = get_domain(value[1:])
    obs_route.append(token)
    while value and value[0]==',':
        obs_route.append(ListSeparator)
        value = value[1:]
        if not value:
            break
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        if not value:
            # The CFWS was the last thing in the input; fall through to the
            # "end of header" error below.
            break
        if value[0] == '@':
            obs_route.append(RouteComponentMarker)
            token, value = get_domain(value[1:])
            obs_route.append(token)
    if not value:
        raise errors.HeaderParseError("end of header while parsing obs-route")
    if value[0] != ':':
        raise errors.HeaderParseError( "expected ':' marking end of "
            "obs-route but found '{}'".format(value))
    obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
    return obs_route, value[1:]
1632
def get_angle_addr(value):
    """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
        obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]

    Returns ``(AngleAddr, remaining_value)``.  A null address ('<>') and a
    missing closing '>' are accepted and flagged with InvalidHeaderDefects;
    an obs-route is accepted and flagged with an ObsoleteHeaderDefect.

    Raises HeaderParseError if no '<' is found, if the input ends right
    after the '<', or if neither an addr-spec nor an obs-route follows it.

    """
    angle_addr = AngleAddr()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    if not value or value[0] != '<':
        raise errors.HeaderParseError(
            "expected angle-addr but found '{}'".format(value))
    angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
    value = value[1:]
    if not value:
        # Input ended right after '<'.  Raise the parse error callers
        # already handle rather than failing on value[0] below.
        raise errors.HeaderParseError(
            "expected addr-spec or obs-route but found '{}'".format(value))
    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
    # circumstances.
    if value[0] == '>':
        angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "null addr-spec in angle-addr"))
        value = value[1:]
        return angle_addr, value
    try:
        token, value = get_addr_spec(value)
    except errors.HeaderParseError:
        try:
            token, value = get_obs_route(value)
            angle_addr.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete route specification in angle-addr"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected addr-spec or obs-route but found '{}'".format(value))
        angle_addr.append(token)
        token, value = get_addr_spec(value)
    angle_addr.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on angle-addr"))
    # The end token is appended even when '>' was missing.
    angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    return angle_addr, value
1678
def get_display_name(value):
    """ display-name = phrase

    Because this is simply a name-rule, the result is not a display-name
    token wrapping a phrase; it is a display-name token holding the
    phrase's own sub-tokens and a copy of its defects.

    Returns ``(DisplayName, remaining_value)``.

    """
    phrase, value = get_phrase(value)
    result = DisplayName()
    result.extend(phrase[:])
    result.defects = phrase.defects[:]
    return result, value
1692
1693
def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    Returns ``(NameAddr, remaining_value)``.  Raises HeaderParseError if
    the input ends before an angle-addr is reached or if the text cannot
    start a display name.

    """
    result = NameAddr()
    # Both the optional display name and the angle-addr can start with cfws,
    # so peel it off first and splice it into whichever token comes next.
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        name, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(name))
        if leader is not None:
            # The leader goes inside the first sub-token of the name.
            name[0][:0] = [leader]
            leader = None
        result.append(name)
    addr, value = get_angle_addr(value)
    if leader is not None:
        addr[:0] = [leader]
    result.append(addr)
    return result, value
1723
def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    Returns ``(Mailbox, remaining_value)``.  If the parsed token carries any
    InvalidHeaderDefect, the mailbox's token_type is set to
    'invalid-mailbox'.  Raises HeaderParseError if neither form parses.

    """
    # The only way to tell a name-addr from an addr-spec is to try parsing
    # each one in turn.
    result = Mailbox()
    try:
        inner, value = get_name_addr(value)
    except errors.HeaderParseError:
        try:
            inner, value = get_addr_spec(value)
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected mailbox but found '{}'".format(value))
    for defect in inner.all_defects:
        if isinstance(defect, errors.InvalidHeaderDefect):
            result.token_type = 'invalid-mailbox'
            break
    result.append(inner)
    return result, value
1744
def get_invalid_mailbox(value, endchars):
    """ Read everything up to one of the chars in endchars.

    This is outside the formal grammar.  The text is collected into an
    InvalidMailbox TokenList made of phrases and 'misplaced-special'
    terminals.

    Returns ``(InvalidMailbox, remaining_value)``.

    """
    result = InvalidMailbox()
    while value and value[0] not in endchars:
        head = value[0]
        if head not in PHRASE_ENDS:
            piece, value = get_phrase(value)
        else:
            # A special that get_phrase would stop at; consume it as-is.
            piece = ValueTerminal(head, 'misplaced-special')
            value = value[1:]
        result.append(piece)
    return result, value
1762
def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])

    This routine goes outside the formal grammar to improve error handling.
    The list ends only at the end of value or at a ';' (the group
    terminator), so invalid mailboxes become InvalidMailbox tokens and
    parsing continues with whatever follows.  Every entry may be null; that
    case is left to the caller.

    Returns ``(MailboxList, remaining_value)``.

    """
    result = MailboxList()
    while value and value[0] != ';':
        try:
            entry, value = get_mailbox(value)
            result.append(entry)
        except errors.HeaderParseError:
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                empty_entry = not value or value[0] in ',;'
                if empty_entry:
                    # Keep the CFWS of the empty element in the parse tree.
                    result.append(leader)
            else:
                empty_entry = value[0] == ','
            if empty_entry:
                result.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                bad, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    bad[:0] = [leader]
                result.append(bad)
                result.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            last = result[-1]
            last.token_type = 'invalid-mailbox'
            extra, value = get_invalid_mailbox(value, ',;')
            last.extend(extra)
            result.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            result.append(ListSeparator)
            value = value[1:]
    return result, value
1818
1819
def get_group_list(value):
    """ group-list = mailbox-list / CFWS / obs-group-list
        obs-group-list = 1*([CFWS] ",") [CFWS]

    Returns ``(GroupList, remaining_value)``.  A list with no mailboxes at
    all is flattened into the GroupList and flagged with an
    ObsoleteHeaderDefect.

    """
    result = GroupList()
    if not value:
        result.defects.append(errors.InvalidHeaderDefect(
            "end of header before group-list"))
        return result, value
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            # This should never happen in email parsing, since CFWS-only is a
            # legal alternative to group-list in a group, which is the only
            # place group-list appears.
            result.defects.append(errors.InvalidHeaderDefect(
                "end of header in group-list"))
            result.append(leader)
            return result, value
        if value[0] == ';':
            # CFWS-only group list.
            result.append(leader)
            return result, value
    mailboxes, value = get_mailbox_list(value)
    if len(mailboxes.all_mailboxes) != 0:
        if leader is not None:
            mailboxes[:0] = [leader]
        result.append(mailboxes)
        return result, value
    # Only empty entries: keep the separators/CFWS directly in the list.
    if leader is not None:
        result.append(leader)
    result.extend(mailboxes)
    result.defects.append(errors.ObsoleteHeaderDefect(
        "group-list with empty entries"))
    return result, value
1856
def get_group(value):
    """ group = display-name ":" [group-list] ";" [CFWS]

    Returns ``(Group, remaining_value)``.  If the input ends before the
    terminating ';', an InvalidHeaderDefect is recorded and the terminator
    token is appended anyway.  Raises HeaderParseError if the display name
    is not followed by ':' or the group list by something other than ';'.

    """
    result = Group()
    name, value = get_display_name(value)
    if not value or value[0] != ':':
        raise errors.HeaderParseError("expected ':' at end of group "
            "display name but found '{}'".format(value))
    result.append(name)
    result.append(ValueTerminal(':', 'group-display-name-terminator'))
    value = value[1:]
    if value and value[0] == ';':
        # Empty group: no group-list at all.
        result.append(ValueTerminal(';', 'group-terminator'))
        return result, value[1:]
    members, value = get_group_list(value)
    result.append(members)
    if not value:
        result.defects.append(errors.InvalidHeaderDefect(
            "end of header in group"))
    elif value[0] != ';':
        raise errors.HeaderParseError(
            "expected ';' at end of group but found {}".format(value))
    result.append(ValueTerminal(';', 'group-terminator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        trailing, value = get_cfws(value)
        result.append(trailing)
    return result, value
1886
def get_address(value):
    """ address = mailbox / group

    Note that counter-intuitively, an address can be either a single address or
    a list of addresses (a group).  This is why the returned Address object has
    a 'mailboxes' attribute which treats a single address as a list of length
    one.  When you need to differentiate between the two cases, extract the
    single element, which is either a mailbox or a group token.

    Return an (Address, unparsed remainder) tuple, or raise HeaderParseError
    if value parses as neither a group nor a mailbox.

    """
    # The formal grammar isn't very helpful when parsing an address.  mailbox
    # and group, especially when allowing for obsolete forms, start off very
    # similarly.  It is only when you reach one of @, <, or : that you know
    # what you've got.  So, we try each one in turn: first group, then
    # mailbox.  We could perhaps make this more efficient by looking for a
    # phrase and then branching based on the next character, but that would
    # be a premature optimization.
    address = Address()
    try:
        parsed, remainder = get_group(value)
    except errors.HeaderParseError:
        try:
            parsed, remainder = get_mailbox(value)
        except errors.HeaderParseError:
            message = "expected address but found '{}'".format(value)
            raise errors.HeaderParseError(message)
    address.append(parsed)
    return address, remainder
1915
def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.

    Return an (AddressList, unparsed remainder) tuple.

    """
    addresses = AddressList()
    while value:
        try:
            address, value = get_address(value)
        except errors.HeaderParseError:
            # Not an address.  Decide whether the entry is merely empty
            # (obsolete syntax) or real junk that must be captured as an
            # invalid mailbox.
            cfws = None
            is_junk = False
            if value[0] in CFWS_LEADER:
                cfws, value = get_cfws(value)
                if not value or value[0] == ',':
                    addresses.append(cfws)
                    addresses.defects.append(errors.ObsoleteHeaderDefect(
                        "address-list entry with no content"))
                else:
                    is_junk = True
            elif value[0] == ',':
                addresses.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in address-list"))
            else:
                is_junk = True
            if is_junk:
                invalid, value = get_invalid_mailbox(value, ',')
                if cfws is not None:
                    invalid[:0] = [cfws]
                addresses.append(Address([invalid]))
                addresses.defects.append(errors.InvalidHeaderDefect(
                    "invalid address in address-list"))
        else:
            addresses.append(address)
        if value and value[0] != ',':
            # Crap after address; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = addresses[-1][0]
            mailbox.token_type = 'invalid-mailbox'
            junk, value = get_invalid_mailbox(value, ',')
            mailbox.extend(junk)
            addresses.defects.append(errors.InvalidHeaderDefect(
                "invalid address in address-list"))
        if value:
            # Must be a , at this point.
            addresses.append(ValueTerminal(',', 'list-separator'))
            value = value[1:]
    return addresses, value
R David Murray97f43c02012-06-24 05:03:27 -04001969
#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two levels of parser routines: the get_XXX methods that get a token in
# the grammar, and parse_XXX methods that parse an entire field value.  So
# get_address_list above should really be a parse_ method, as probably should
# be get_unstructured.
#
1977
def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    Parse a complete MIME-Version field value and return a MIMEVersion token
    list.  The 'major' and 'minor' attributes are set only when the
    corresponding number could be parsed; every problem found is recorded on
    the 'defects' list instead of being raised.

    """
    # The [CFWS] is implicit in the RFC 2045 BNF.
    # XXX: This routine is a bit verbose, should factor out a get_int method.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
        if not value:
            mime_version.defects.append(errors.HeaderMissingRequiredValue(
                "Expected MIME version number but found only CFWS"))
    # Major version: everything up to the '.' or the next CFWS.
    digits = ''
    while value and value[0] != '.' and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    # Use isdecimal() rather than isdigit(): isdigit() is also true for
    # characters such as superscript digits, which int() refuses to convert,
    # so garbage input would raise ValueError instead of yielding a defect.
    if not digits.isdecimal():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME major version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.major = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value or value[0] != '.':
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        if value:
            # Keep whatever is left so the token list round-trips.
            mime_version.append(ValueTerminal(value, 'xtext'))
        return mime_version
    mime_version.append(ValueTerminal('.', 'version-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    # Minor version: everything up to the next CFWS.
    digits = ''
    while value and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    # See the note on isdecimal() above.
    if not digits.isdecimal():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME minor version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.minor = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version
2045
def get_invalid_parameter(value):
    """ Read everything up to the next ';'.

    This is outside the formal grammar.  The InvalidParameter TokenList that is
    returned acts like a Parameter, but the data attributes are None.

    Return an (InvalidParameter, unparsed remainder) tuple; the remainder is
    either empty or starts with ';'.

    """
    junk = InvalidParameter()
    while value:
        first = value[0]
        if first == ';':
            break
        if first in PHRASE_ENDS:
            # A special that get_phrase would not consume; keep it as-is.
            junk.append(ValueTerminal(first, 'misplaced-special'))
            value = value[1:]
            continue
        phrase, value = get_phrase(value)
        junk.append(phrase)
    return junk, value
2063
def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
    defects list if we find non-ttext characters.  We also register defects for
    *any* non-printables even though the RFC doesn't exclude all of them,
    because we follow the spirit of RFC 5322.

    Return a ('ttext' ValueTerminal, unparsed remainder) tuple, or raise
    HeaderParseError if nothing matches at the start of value.

    """
    match = _non_token_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    text = match.group()
    terminal = ValueTerminal(text, 'ttext')
    _validate_xtext(terminal)
    return terminal, value[len(text):]
2082
def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    tspecials.  We also exclude tabs even though the RFC doesn't.

    The RFC implies the CFWS but is not explicit about it in the BNF.

    Return a (Token, unparsed remainder) tuple, or raise HeaderParseError if
    no ttext is found after the optional leading CFWS.

    """
    result = Token()
    if value and value[0] in CFWS_LEADER:
        leading, value = get_cfws(value)
        result.append(leading)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_ttext(value)
    result.append(text)
    if value and value[0] in CFWS_LEADER:
        trailing, value = get_cfws(value)
        result.append(trailing)
    return result, value
2105
def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
    token's defects list if we find non-attrtext characters.  We also register
    defects for *any* non-printables even though the RFC doesn't exclude all of
    them, because we follow the spirit of RFC 5322.

    Return an ('attrtext' ValueTerminal, unparsed remainder) tuple, or raise
    HeaderParseError if nothing matches at the start of value.

    """
    match = _non_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    text = match.group()
    terminal = ValueTerminal(text, 'attrtext')
    _validate_xtext(terminal)
    return terminal, value[len(text):]
2124
def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and as usual we use a
    value terminal for the actual run of characters.  The RFC equivalent of
    attrtext is the token characters, with the subtraction of '*', "'", and '%'.
    We include tab in the excluded set just as we do for token.

    Return an (Attribute, unparsed remainder) tuple, or raise
    HeaderParseError if no attrtext follows the optional leading CFWS.

    """
    result = Attribute()
    if value and value[0] in CFWS_LEADER:
        leading, value = get_cfws(value)
        result.append(leading)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_attrtext(value)
    result.append(text)
    if value and value[0] in CFWS_LEADER:
        trailing, value = get_cfws(value)
        result.append(trailing)
    return result, value
2147
def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    This is a special parsing routine so that we get a value that
    includes % escapes as a single string (which we decode as a single
    string later).

    Return an ('extended-attrtext' ValueTerminal, unparsed remainder) tuple,
    or raise HeaderParseError if nothing matches at the start of value.

    """
    match = _non_extended_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    text = match.group()
    terminal = ValueTerminal(text, 'extended-attrtext')
    _validate_xtext(terminal)
    return terminal, value[len(text):]
2165
def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    This is like the non-extended version except we allow % characters, so that
    we can pick up an encoded value as a single string.

    Return an (Attribute, unparsed remainder) tuple, or raise
    HeaderParseError if no extended attrtext follows the optional CFWS.

    """
    # XXX: should we have an ExtendedAttribute TokenList?
    result = Attribute()
    if value and value[0] in CFWS_LEADER:
        leading, value = get_cfws(value)
        result.append(leading)
    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_extended_attrtext(value)
    result.append(text)
    if value and value[0] in CFWS_LEADER:
        trailing, value = get_cfws(value)
        result.append(trailing)
    return result, value
2187
def get_section(value):
    """ '*' digits

    The formal BNF is more complicated because leading 0s are not allowed.  We
    check for that and add a defect.  We also assume no CFWS is allowed between
    the '*' and the digits, though the RFC is not crystal clear on that.
    The caller should already have dealt with leading CFWS.

    Return a (Section, unparsed remainder) tuple; the parsed number is stored
    on the Section's 'number' attribute.  Raise HeaderParseError if value
    does not start with '*' followed by at least one digit.

    """
    section = Section()
    if not value or value[0] != '*':
        raise errors.HeaderParseError("Expected section but found {}".format(
                                        value))
    section.append(ValueTerminal('*', 'section-marker'))
    value = value[1:]
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # (e.g. superscript digits) that int() below cannot convert.
    if not value or not value[0].isdecimal():
        raise errors.HeaderParseError("Expected section number but "
                                      "found {}".format(value))
    digits = ''
    while value and value[0].isdecimal():
        digits += value[0]
        value = value[1:]
    if digits[0] == '0' and digits != '0':
        # A leading zero is tolerated but flagged as a defect.
        section.defects.append(errors.InvalidHeaderDefect(
            "section number has an invalid leading 0"))
    section.number = int(digits)
    section.append(ValueTerminal(digits, 'digits'))
    return section, value
2216
2217
def get_value(value):
    """ quoted-string / attribute

    Return a (Value, unparsed remainder) tuple.  Leading CFWS, if any, is
    stored at the front of the parsed quoted-string/attribute token.  Raise
    HeaderParseError if value is empty or contains only CFWS.

    """
    result = Value()
    if not value:
        raise errors.HeaderParseError("Expected value but found end of string")
    cfws = None
    if value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError("Expected value but found "
                                      "only {}".format(cfws))
    # A double quote selects the quoted-string form; anything else is parsed
    # as an (extended) attribute.
    parser = get_quoted_string if value[0] == '"' else get_extended_attribute
    content, value = parser(value)
    if cfws is not None:
        content[:0] = [cfws]
    result.append(content)
    return result, value
2239
def get_parameter(value):
    """ attribute [section] ["*"] [CFWS] "=" value

    The CFWS is implied by the RFC but not made explicit in the BNF.  This
    simplified form of the BNF from the RFC is made to conform with the RFC BNF
    through some extra checks.  We do it this way because it makes both error
    recovery and working with the resulting parse tree easier.

    Return a (Parameter, unparsed remainder) tuple.  Recoverable problems are
    recorded on the Parameter's 'defects' list; HeaderParseError is raised
    when the input cannot be interpreted as a parameter at all (for example
    a missing '=' or missing RFC 2231 charset/lang delimiters).
    """
    # It is possible CFWS would also be implicitly allowed between the section
    # and the 'extended-attribute' marker (the '*') , but we've never seen that
    # in the wild and we will therefore ignore the possibility.
    param = Parameter()
    token, value = get_attribute(value)
    param.append(token)
    if not value or value[0] == ';':
        param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
            "name ({}) but no value".format(token)))
        return param, value
    if value[0] == '*':
        # Either a section number ("*0"), the extended marker ("*"), or both.
        try:
            token, value = get_section(value)
            param.sectioned = True
            param.append(token)
        except errors.HeaderParseError:
            # Not a section; the '*' may still be the extended marker.
            pass
        if not value:
            raise errors.HeaderParseError("Incomplete parameter")
        if value[0] == '*':
            param.append(ValueTerminal('*', 'extended-parameter-marker'))
            value = value[1:]
            param.extended = True
    if value[0] != '=':
        raise errors.HeaderParseError("Parameter not followed by '='")
    param.append(ValueTerminal('=', 'parameter-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        param.append(token)
    # 'remainder' is non-None only while we are parsing the *inside* of a
    # quoted string; it holds the text following the closing quote.
    # 'appendto' is where the value tokens go: normally the parameter itself,
    # but the emptied bare-quoted-string when handling a quoted extended value.
    remainder = None
    appendto = param
    if param.extended and value and value[0] == '"':
        # Now for some serious hackery to handle the common invalid case of
        # double quotes around an extended value.  We also accept (with defect)
        # a value marked as encoded that isn't really.
        qstring, remainder = get_quoted_string(value)
        inner_value = qstring.stripped_value
        semi_valid = False
        if param.section_number == 0:
            if inner_value and inner_value[0] == "'":
                semi_valid = True
            else:
                token, rest = get_attrtext(inner_value)
                if rest and rest[0] == "'":
                    semi_valid = True
        else:
            # Only a parse failure means "not extended text"; anything else
            # should not be silently swallowed.
            try:
                token, rest = get_extended_attrtext(inner_value)
            except errors.HeaderParseError:
                pass
            else:
                if not rest:
                    semi_valid = True
        if semi_valid:
            param.defects.append(errors.InvalidHeaderDefect(
                "Quoted string value for extended parameter is invalid"))
            param.append(qstring)
            for t in qstring:
                if t.token_type == 'bare-quoted-string':
                    # Empty it out; it is refilled with the parsed pieces.
                    t[:] = []
                    appendto = t
                    break
            value = inner_value
        else:
            remainder = None
            param.defects.append(errors.InvalidHeaderDefect(
                "Parameter marked as extended but appears to have a "
                "quoted string value that is non-encoded"))
    if value and value[0] == "'":
        token = None
    else:
        token, value = get_value(value)
    if not param.extended or param.section_number > 0:
        if not value or value[0] != "'":
            appendto.append(token)
            if remainder is not None:
                assert not value, value
                value = remainder
            return param, value
        param.defects.append(errors.InvalidHeaderDefect(
            "Apparent initial-extended-value but attribute "
            "was not marked as extended or was not initial section"))
    if not value:
        # Assume the charset/lang is missing and the token is the value.
        param.defects.append(errors.InvalidHeaderDefect(
            "Missing required charset/lang delimiters"))
        appendto.append(token)
        if remainder is None:
            return param, value
    else:
        if token is not None:
            for t in token:
                if t.token_type == 'extended-attrtext':
                    break
            # NOTE(review): the next line is a comparison whose result is
            # discarded, so it has no effect; it looks like an assignment
            # was intended.  Left unchanged here - confirm the intended
            # token type before altering the parse tree.
            t.token_type == 'attrtext'
            appendto.append(t)
            param.charset = t.value
        if value[0] != "'":
            raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                          "delimiter, but found {!r}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
        if value and value[0] != "'":
            token, value = get_attrtext(value)
            appendto.append(token)
            param.lang = token.value
            if not value or value[0] != "'":
                raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                  "delimiter, but found {}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
    if remainder is not None:
        # Treat the rest of value as bare quoted string content.
        v = Value()
        while value:
            if value[0] in WSP:
                token, value = get_fws(value)
            else:
                token, value = get_qcontent(value)
            v.append(token)
        token = v
    else:
        token, value = get_value(value)
    appendto.append(token)
    if remainder is not None:
        assert not value, value
        value = remainder
    return param, value
2378
def parse_mime_parameters(value):
    """ parameter *( ";" parameter )

    That BNF is meant to indicate this routine should only be called after
    finding and handling the leading ';'.  There is no corresponding rule in
    the formal RFC grammar, but it is more convenient for us for the set of
    parameters to be treated as its own TokenList.

    This is a 'parse' routine because it consumes the remaining value, but it
    would never be called to parse a full header.  Instead it is called to
    parse everything after the non-parameter value of a specific MIME header.

    Return a MimeParameters token list.

    """
    params = MimeParameters()
    while value:
        try:
            parameter, value = get_parameter(value)
        except errors.HeaderParseError:
            cfws = None
            if value[0] in CFWS_LEADER:
                cfws, value = get_cfws(value)
            if not value:
                # Nothing but CFWS was left.
                params.append(cfws)
                return params
            if value[0] == ';':
                if cfws is not None:
                    params.append(cfws)
                params.defects.append(errors.InvalidHeaderDefect(
                    "parameter entry with no content"))
            else:
                invalid, value = get_invalid_parameter(value)
                if cfws:
                    invalid[:0] = [cfws]
                params.append(invalid)
                params.defects.append(errors.InvalidHeaderDefect(
                    "invalid parameter {!r}".format(invalid)))
        else:
            params.append(parameter)
        if value and value[0] != ';':
            # Junk after the otherwise valid parameter.  Mark it as
            # invalid, but it will have a value.
            last = params[-1]
            last.token_type = 'invalid-parameter'
            junk, value = get_invalid_parameter(value)
            last.extend(junk)
            params.defects.append(errors.InvalidHeaderDefect(
                "parameter with invalid trailing text {!r}".format(junk)))
        if value:
            # Must be a ';' at this point.
            params.append(ValueTerminal(';', 'parameter-separator'))
            value = value[1:]
    return params
2430
def _find_mime_parameters(tokenlist, value):
    """Do our best to find the parameters in an invalid MIME header

    Everything up to the first ';' is appended to tokenlist as phrases and
    misplaced specials.  If a ';' is found, a separator and the parsed
    parameters that follow it are appended as well.  Returns None.

    """
    while value:
        first = value[0]
        if first == ';':
            break
        if first in PHRASE_ENDS:
            tokenlist.append(ValueTerminal(first, 'misplaced-special'))
            value = value[1:]
            continue
        phrase, value = get_phrase(value)
        tokenlist.append(phrase)
    if value:
        # value[0] is the ';' that stopped the loop above.
        tokenlist.append(ValueTerminal(';', 'parameter-separator'))
        tokenlist.append(parse_mime_parameters(value[1:]))
2446
def parse_content_type_header(value):
    """ maintype "/" subtype *( ";" parameter )

    The maintype and subtype are tokens.  Theoretically they could
    be checked against the official IANA list + x-token, but we
    don't do that.

    Return a ContentType token list.  Problems are recorded on its 'defects'
    list; 'maintype' and 'subtype' are set only as far as parsing succeeded.
    """
    ctype = ContentType()
    if not value:
        ctype.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content type specification"))
        return ctype
    try:
        main_token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content maintype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(main_token)
    # XXX: If we really want to follow the formal grammar we should make
    # maintype and subtype specialized TokenLists here.  Probably not worth it.
    if not (value and value[0] == '/'):
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Invalid content type"))
        if value:
            _find_mime_parameters(ctype, value)
        return ctype
    ctype.maintype = main_token.value.strip().lower()
    ctype.append(ValueTerminal('/', 'content-type-separator'))
    value = value[1:]
    try:
        sub_token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content subtype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(sub_token)
    ctype.subtype = sub_token.value.strip().lower()
    if not value:
        return ctype
    if value[0] == ';':
        ctype.append(ValueTerminal(';', 'parameter-separator'))
        ctype.append(parse_mime_parameters(value[1:]))
        return ctype
    ctype.defects.append(errors.InvalidHeaderDefect(
        "Only parameters are valid after content type, but "
        "found {!r}".format(value)))
    # The RFC requires that a syntactically invalid content-type be treated
    # as text/plain.  Perhaps we should postel this, but we should probably
    # only do that if we were checking the subtype value against IANA.
    del ctype.maintype, ctype.subtype
    _find_mime_parameters(ctype, value)
    return ctype
2503
def parse_content_disposition_header(value):
    """ disposition-type *( ";" parameter )

    Return a ContentDisposition token list.  Problems are recorded on its
    'defects' list; 'content_disposition' is set when the disposition type
    token could be parsed.

    """
    header = ContentDisposition()
    if not value:
        header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content disposition"))
        return header
    try:
        disposition, value = get_token(value)
    except errors.HeaderParseError:
        header.defects.append(errors.InvalidHeaderDefect(
            "Expected content disposition but found {!r}".format(value)))
        _find_mime_parameters(header, value)
        return header
    header.append(disposition)
    header.content_disposition = disposition.value.strip().lower()
    if not value:
        return header
    if value[0] == ';':
        header.append(ValueTerminal(';', 'parameter-separator'))
        header.append(parse_mime_parameters(value[1:]))
        return header
    header.defects.append(errors.InvalidHeaderDefect(
        "Only parameters are valid after content disposition, but "
        "found {!r}".format(value)))
    _find_mime_parameters(header, value)
    return header
2533
def parse_content_transfer_encoding_header(value):
    """ mechanism

    Return a ContentTransferEncoding token list.  The 'cte' attribute is set
    when a mechanism token could be parsed; anything after it is kept in the
    token list and flagged with defects.

    """
    # We should probably validate the values, since the list is fixed.
    header = ContentTransferEncoding()
    if not value:
        header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content transfer encoding"))
        return header
    try:
        mechanism, value = get_token(value)
    except errors.HeaderParseError:
        header.defects.append(errors.InvalidHeaderDefect(
            "Expected content transfer encoding but found {!r}".format(value)))
    else:
        header.append(mechanism)
        header.cte = mechanism.value.strip().lower()
    # Whatever remains is junk; one defect is recorded per chunk consumed.
    while value:
        header.defects.append(errors.InvalidHeaderDefect(
            "Extra text after content transfer encoding"))
        first = value[0]
        if first in PHRASE_ENDS:
            header.append(ValueTerminal(first, 'misplaced-special'))
            value = value[1:]
        else:
            phrase, value = get_phrase(value)
            header.append(phrase)
    return header
R. David Murray85d5c182017-12-03 18:51:41 -05002564
2565
#
# Header folding
#
# Header folding is complex, with lots of rules and corner cases.  The
# following code does its best to obey the rules and handle the corner
# cases, but you can be sure there are a few bugs:)
#
# This folder generally canonicalizes as it goes, preferring the stringified
# version of each token.  The tokens contain information that supports the
# folder, including which tokens can be encoded in which ways.
#
# Folded text is accumulated in a simple list of strings ('lines'), each
# one of which should be less than policy.max_line_length ('maxlen').
#
2580
2581def _steal_trailing_WSP_if_exists(lines):
2582 wsp = ''
2583 if lines and lines[-1] and lines[-1][-1] in WSP:
2584 wsp = lines[-1][-1]
2585 lines[-1] = lines[-1][:-1]
2586 return wsp
2587
def _refold_parse_tree(parse_tree, *, policy):
    """Return string of contents of parse_tree folded according to RFC rules.

    parse_tree is iterated to obtain the top-level parts; non-terminal parts
    are expanded into their sub-parts as needed.  From policy this routine
    reads 'max_line_length', 'utf8' and 'linesep'.  The folded lines are
    joined with policy.linesep, and a final policy.linesep is appended.

    """
    # max_line_length 0/None means no limit, ie: infinitely long.
    maxlen = policy.max_line_length or float("+inf")
    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
    # 'lines' accumulates the output; the last entry is the line being built.
    lines = ['']
    # Value returned by the most recent _fold_as_ew call (None when the next
    # encoded word must not be combined with a previous one).
    last_ew = None
    # Count of enclosing parts, currently being unpacked, that forbid
    # encoded words.
    wrap_as_ew_blocked = 0
    want_encoding = False
    # Sentinel pushed after the sub-parts of a part that forbids encoded
    # words; popping it marks the end of that part.
    end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
    parts = list(parse_tree)
    while parts:
        part = parts.pop(0)
        if part is end_ew_not_allowed:
            wrap_as_ew_blocked -= 1
            continue
        tstr = str(part)
        try:
            # Encoding is attempted only to find out whether it is possible.
            tstr.encode(encoding)
            charset = encoding
        except UnicodeEncodeError:
            if any(isinstance(x, errors.UndecodableBytesDefect)
                   for x in part.all_defects):
                charset = 'unknown-8bit'
            else:
                # If policy.utf8 is false this should really be taken from a
                # 'charset' property on the policy.
                charset = 'utf-8'
            want_encoding = True
        if part.token_type == 'mime-parameters':
            # Mime parameter folding (using RFC2231) is extra special.
            _fold_mime_parameters(part, lines, maxlen, encoding)
            continue
        if want_encoding and not wrap_as_ew_blocked:
            if not part.as_ew_allowed:
                want_encoding = False
                last_ew = None
                if part.syntactic_break:
                    # Let the part fold itself, then drop the trailing
                    # linesep from the result.
                    encoded_part = part.fold(policy=policy)[:-len(policy.linesep)]
                    if policy.linesep not in encoded_part:
                        # It fits on a single line
                        if len(encoded_part) > maxlen - len(lines[-1]):
                            # But not on this one, so start a new one.
                            newline = _steal_trailing_WSP_if_exists(lines)
                            # XXX what if encoded_part has no leading FWS?
                            lines.append(newline)
                        lines[-1] += encoded_part
                        continue
                # Either this is not a major syntactic break, so we don't
                # want it on a line by itself even if it fits, or it
                # doesn't fit on a line by itself.  Either way, fall through
                # to unpacking the subparts and wrapping them.
            if not hasattr(part, 'encode'):
                # It's not a Terminal, do each piece individually.
                parts = list(part) + parts
            else:
                # It's a terminal, wrap it as an encoded word, possibly
                # combining it with previously encoded words if allowed.
                last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
                                      part.ew_combine_allowed, charset)
            want_encoding = False
            continue
        if len(tstr) <= maxlen - len(lines[-1]):
            # Fits on the current line as-is.
            lines[-1] += tstr
            continue
        # This part is too long to fit.  The RFC wants us to break at
        # "major syntactic breaks", so unless we don't consider this
        # to be one, check if it will fit on the next line by itself.
        if (part.syntactic_break and
                len(tstr) + 1 <= maxlen):
            newline = _steal_trailing_WSP_if_exists(lines)
            if newline or part.startswith_fws():
                lines.append(newline + tstr)
                continue
        if not hasattr(part, 'encode'):
            # It's not a terminal, try folding the subparts.
            newparts = list(part)
            if not part.as_ew_allowed:
                wrap_as_ew_blocked += 1
                newparts.append(end_ew_not_allowed)
            parts = newparts + parts
            continue
        if part.as_ew_allowed and not wrap_as_ew_blocked:
            # It doesn't need CTE encoding, but encode it anyway so we can
            # wrap it.
            parts.insert(0, part)
            want_encoding = True
            continue
        # We can't figure out how to wrap, it, so give up.
        newline = _steal_trailing_WSP_if_exists(lines)
        if newline or part.startswith_fws():
            lines.append(newline + tstr)
        else:
            # We can't fold it onto the next line either...
            lines[-1] += tstr
    return policy.linesep.join(lines) + policy.linesep
2686
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    """Fold string to_encode into lines as encoded word, combining if allowed.
    Return the new value for last_ew, or None if ew_combine_allowed is False.

    If there is already an encoded word in the last line of lines (indicated by
    a non-None value for last_ew) and ew_combine_allowed is true, decode the
    existing ew, combine it with to_encode, and re-encode.  Otherwise, encode
    to_encode.  In either case, split to_encode as necessary so that the
    encoded segments fit within maxlen.

    'lines' is modified in place.  'charset' names the charset used for the
    encoded words, except that 'us-ascii' is encoded as 'utf-8'.

    """
    if last_ew is not None and ew_combine_allowed:
        # Decode the encoded word already at the end of the last line and
        # merge it with the new text; it is re-encoded as a whole below.
        to_encode = str(
            get_unstructured(lines[-1][last_ew:] + to_encode))
        lines[-1] = lines[-1][:last_ew]
    if to_encode[0] in WSP:
        # We're joining this to non-encoded text, so don't encode
        # the leading blank.
        leading_wsp = to_encode[0]
        to_encode = to_encode[1:]
        if (len(lines[-1]) == maxlen):
            lines.append(_steal_trailing_WSP_if_exists(lines))
        lines[-1] += leading_wsp
    trailing_wsp = ''
    # to_encode may be empty here if it consisted of a single blank.
    if to_encode and to_encode[-1] in WSP:
        # Likewise for the trailing space.
        trailing_wsp = to_encode[-1]
        to_encode = to_encode[:-1]
    new_last_ew = len(lines[-1]) if last_ew is None else last_ew
    encode_as = 'utf-8' if charset == 'us-ascii' else charset
    while to_encode:
        remaining_space = maxlen - len(lines[-1])
        # The RFC2047 chrome takes up 7 characters plus the length
        # of the charset name.
        text_space = remaining_space - len(encode_as) - 7
        if text_space <= 0:
            lines.append(' ')
            # XXX We'll get an infinite loop here if maxlen is <= 7
            continue
        first_part = to_encode[:text_space]
        ew = _ew.encode(first_part, charset=encode_as)
        excess = len(ew) - remaining_space
        if excess > 0:
            # encode always chooses the shortest encoding, so this
            # is guaranteed to fit at this point.
            first_part = first_part[:-excess]
            # Re-encode with the same charset as the sizing pass so the
            # encoded word keeps the charset label chosen above.
            ew = _ew.encode(first_part, charset=encode_as)
        lines[-1] += ew
        to_encode = to_encode[len(first_part):]
        if to_encode:
            # More to come: continue on a fresh folded line.
            lines.append(' ')
            new_last_ew = len(lines[-1])
    lines[-1] += trailing_wsp
    return new_last_ew if ew_combine_allowed else None
2741
def _fold_mime_parameters(part, lines, maxlen, encoding):
    """Fold TokenList 'part' into the 'lines' list as mime parameters.

    Using the decoded list of parameters and values, format them according to
    the RFC rules, including using RFC2231 encoding if the value cannot be
    expressed in 'encoding' and/or the parameter+value is too long to fit
    within 'maxlen'.

    'part.params' is iterated as (name, value) pairs and 'lines' is modified
    in place; nothing is returned.

    """
    # Special case for RFC2231 encoding: start from decoded values and use
    # RFC2231 encoding iff needed.
    #
    # Note that the 1 and 2s being added to the length calculations are
    # accounting for the possibly-needed spaces and semicolons we'll be adding.
    #
    for name, value in part.params:
        # XXX What if this ';' puts us over maxlen the first time through the
        # loop?  We should split the header value onto a newline in that case,
        # but to do that we need to recognize the need earlier or reparse the
        # header, so I'm going to ignore that bug for now.  It'll only put us
        # one character over.
        if not lines[-1].rstrip().endswith(';'):
            lines[-1] += ';'
        charset = encoding
        error_handler = 'strict'
        try:
            value.encode(encoding)
            encoding_required = False
        except UnicodeEncodeError:
            encoding_required = True
            if utils._has_surrogates(value):
                # Surrogates can't be encoded strictly; label the charset
                # unknown-8bit and quote with surrogateescape.
                charset = 'unknown-8bit'
                error_handler = 'surrogateescape'
            else:
                charset = 'utf-8'
        if encoding_required:
            encoded_value = urllib.parse.quote(
                value, safe='', errors=error_handler)
            tstr = "{}*={}''{}".format(name, charset, encoded_value)
        else:
            tstr = '{}={}'.format(name, quote_string(value))
        if len(lines[-1]) + len(tstr) + 1 < maxlen:
            # Fits on the current line after a separating blank.
            lines[-1] = lines[-1] + ' ' + tstr
            continue
        elif len(tstr) + 2 <= maxlen:
            # Fits on a continuation line of its own.
            lines.append(' ' + tstr)
            continue
        # We need multiple sections.  We are allowed to mix encoded and
        # non-encoded sections, but we aren't going to.  We'll encode them all.
        section = 0
        # Only the first section carries the charset''language prefix.
        extra_chrome = charset + "''"
        while value:
            chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
            if maxlen <= chrome_len + 3:
                # We need room for the leading blank, the trailing semicolon,
                # and at least one character of the value.  If we don't
                # have that, we'd be stuck, so in that case fall back to
                # the RFC standard width.
                maxlen = 78
            maxchars = maxlen - chrome_len - 2
            # Start from at least one character: a zero or negative split
            # point would emit an empty section and leave 'value' untouched.
            splitpoint = max(maxchars, 1)
            while True:
                partial = value[:splitpoint]
                encoded_value = urllib.parse.quote(
                    partial, safe='', errors=error_handler)
                # Percent-encoding may expand a single character beyond
                # maxchars (a non-ASCII character takes 3 characters per
                # encoded byte).  Stop at one character even if it is too
                # long, since every section must consume some of the value
                # or this function would never return.
                if len(encoded_value) <= maxchars or splitpoint == 1:
                    break
                splitpoint -= 1
            lines.append(" {}*{}*={}{}".format(
                name, section, extra_chrome, encoded_value))
            extra_chrome = ''
            section += 1
            value = value[splitpoint:]
            if value:
                lines[-1] += ';'
2815 lines[-1] += ';'