"""Header value parser implementing various email-related RFC parsing rules.

The parsing methods defined in this module implement various email-related
parsing rules. Principal among them is RFC 5322, which is the follow-on
to RFC 2822 and primarily a clarification of the former. It also implements
RFC 2047 encoded word decoding.

RFC 5322 goes to considerable trouble to maintain backward compatibility with
RFC 822 in the parse phase, while cleaning up the structure in the generation
phase. This parser supports correct RFC 5322 generation by tagging white space
as folding white space only when folding is allowed in the non-obsolete rule
sets. Actually, the parser is even more generous when accepting input than RFC
5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
Where possible, deviations from the standard are annotated on the 'defects'
attribute of tokens that deviate.

The general structure of the parser follows RFC 5322, and uses its terminology
where there is a direct correspondence. Where the implementation requires a
somewhat different structure than that used by the formal grammar, new terms
that mimic the closest existing terms are used. Thus, it really helps to have
a copy of RFC 5322 handy when studying this code.

Input to the parser is a string that has already been unfolded according to
RFC 5322 rules. According to the RFC this unfolding is the very first step, and
this parser leaves the unfolding step to a higher level message parser, which
will have already detected the line breaks that need unfolding while
determining the beginning and end of each header.

The output of the parser is a TokenList object, which is a list subclass. A
TokenList is a recursive data structure. The terminal nodes of the structure
are Terminal objects, which are subclasses of str. These do not correspond
directly to terminal objects in the formal grammar, but are instead more
practical higher level combinations of true terminals.

All TokenList and Terminal objects have a 'value' attribute, which produces the
semantically meaningful value of that part of the parse subtree. The value of
all whitespace tokens (no matter how many sub-tokens they may contain) is a
single space, as per the RFC rules. This includes 'CFWS', which is herein
included in the general class of whitespace tokens. There is one exception to
the rule that whitespace tokens are collapsed into single spaces in values: in
the value of a 'bare-quoted-string' (a quoted-string with no leading or
trailing whitespace), any whitespace that appeared between the quotation marks
is preserved in the returned value. Note that in all Terminal strings quoted
pairs are turned into their unquoted values.

All TokenList and Terminal objects also have a string value, which attempts to
be a "canonical" representation of the RFC-compliant form of the substring that
produced the parsed subtree, including minimal use of quoted pair quoting.
Whitespace runs are not collapsed.

Comment tokens also have a 'content' attribute providing the string found
between the parens (including any nested comments) with whitespace preserved.

All TokenList and Terminal objects have a 'defects' attribute which is a
possibly empty list of all of the defects found while creating the token.
Defects may appear on any token in the tree, and a composite list of all
defects in the subtree is available through the 'all_defects' attribute of any
node. (For Terminal nodes, x.defects == x.all_defects.)

Each object in a parse tree is called a 'token', and each has a 'token_type'
attribute that gives the name from the RFC 5322 grammar that it represents.
Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
may be produced: 'ptext'. A 'ptext' is a string of printable ASCII characters.
It is returned in place of lists of (ctext/quoted-pair) and
(qtext/quoted-pair).

XXX: provide complete list of token types.
"""
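
# A rough usage sketch of the parse tree described above (illustrative only;
# the exact reprs shown are indicative):
#
#   >>> from email._header_value_parser import get_unstructured
#   >>> tl = get_unstructured('Hello  big\tworld')
#   >>> tl.token_type
#   'unstructured'
#   >>> str(tl)      # canonical string: whitespace runs are preserved
#   'Hello  big\tworld'
#   >>> tl.value     # semantic value: each whitespace run is a single space
#   'Hello big world'
#   >>> tl.all_defects
#   []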

import re
import urllib   # For urllib.parse.unquote
from string import hexdigits
from collections import OrderedDict
from operator import itemgetter
from email import _encoded_words as _ew
from email import errors
from email import utils

#
# Useful constants and functions
#

WSP = set(' \t')
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')

def quote_string(value):
    return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
98
99#
100# Accumulator for header folding
101#
102
103class _Folded:
104
105 def __init__(self, maxlen, policy):
106 self.maxlen = maxlen
107 self.policy = policy
108 self.lastlen = 0
109 self.stickyspace = None
110 self.firstline = True
111 self.done = []
112 self.current = []
113
114 def newline(self):
115 self.done.extend(self.current)
116 self.done.append(self.policy.linesep)
117 self.current.clear()
118 self.lastlen = 0
119
120 def finalize(self):
121 if self.current:
122 self.newline()
123
124 def __str__(self):
125 return ''.join(self.done)
126
127 def append(self, stoken):
128 self.current.append(stoken)
129
130 def append_if_fits(self, token, stoken=None):
131 if stoken is None:
132 stoken = str(token)
133 l = len(stoken)
134 if self.stickyspace is not None:
135 stickyspace_len = len(self.stickyspace)
136 if self.lastlen + stickyspace_len + l <= self.maxlen:
137 self.current.append(self.stickyspace)
138 self.lastlen += stickyspace_len
139 self.current.append(stoken)
140 self.lastlen += l
141 self.stickyspace = None
142 self.firstline = False
143 return True
144 if token.has_fws:
145 ws = token.pop_leading_fws()
146 if ws is not None:
147 self.stickyspace += str(ws)
148 stickyspace_len += len(ws)
149 token._fold(self)
150 return True
151 if stickyspace_len and l + 1 <= self.maxlen:
152 margin = self.maxlen - l
153 if 0 < margin < stickyspace_len:
154 trim = stickyspace_len - margin
155 self.current.append(self.stickyspace[:trim])
156 self.stickyspace = self.stickyspace[trim:]
157 stickyspace_len = trim
158 self.newline()
159 self.current.append(self.stickyspace)
160 self.current.append(stoken)
161 self.lastlen = l + stickyspace_len
162 self.stickyspace = None
163 self.firstline = False
164 return True
165 if not self.firstline:
166 self.newline()
167 self.current.append(self.stickyspace)
168 self.current.append(stoken)
169 self.stickyspace = None
170 self.firstline = False
171 return True
172 if self.lastlen + l <= self.maxlen:
173 self.current.append(stoken)
174 self.lastlen += l
175 return True
176 if l < self.maxlen:
177 self.newline()
178 self.current.append(stoken)
179 self.lastlen = l
180 return True
181 return False
182
183#
184# TokenList and its subclasses
185#
186
187class TokenList(list):
188
189 token_type = None
190
191 def __init__(self, *args, **kw):
192 super().__init__(*args, **kw)
193 self.defects = []
194
195 def __str__(self):
196 return ''.join(str(x) for x in self)
197
198 def __repr__(self):
199 return '{}({})'.format(self.__class__.__name__,
200 super().__repr__())
201
202 @property
203 def value(self):
204 return ''.join(x.value for x in self if x.value)
205
206 @property
207 def all_defects(self):
208 return sum((x.all_defects for x in self), self.defects)
209
    #
    # Folding API
    #
    # parts():
    #
    # return a list of objects that constitute the "higher level syntactic
    # objects" specified by the RFC as the best places to fold a header line.
    # The returned objects must include leading folding white space, even if
    # this means mutating the underlying parse tree of the object. Each object
    # is only responsible for returning *its* parts, and should not drill down
    # to any lower level except as required to meet the leading folding white
    # space constraint.
    #
    # _fold(folded):
    #
    # folded: the result accumulator. This is an instance of _Folded.
    #     (XXX: I haven't finished factoring this out yet, the folding code
    #     pretty much uses this as a state object.) When the folded.current
    #     contains as much text as will fit, the _fold method should call
    #     folded.newline.
    # folded.lastlen: the current length of the text stored in folded.current.
    # folded.maxlen: The maximum number of characters that may appear on a
    #     folded line. Differs from the policy setting in that "no limit" is
    #     represented by +inf, which means it can be used in the trivially
    #     logical fashion in comparisons.
    #
    # Currently no subclasses implement parts, and I think this will remain
    # true. A subclass only needs to implement _fold when the generic version
    # isn't sufficient. _fold will need to be implemented primarily when it is
    # possible for encoded words to appear in the specialized token-list, since
    # there is no generic algorithm that can know where exactly the encoded
    # words are allowed. A _fold implementation is responsible for filling
    # lines in the same general way that the top level _fold does. It may, and
    # should, call the _fold method of sub-objects in a similar fashion to that
    # of the top level _fold.
    #
    # XXX: I'm hoping it will be possible to factor the existing code further
    # to reduce redundancy and make the logic clearer.
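    #
    # A minimal sketch of how these pieces are driven (this mirrors the
    # fold() method below; 'policy' is assumed to be an email policy object
    # providing max_line_length and linesep):
    #
    #     folded = _Folded(policy.max_line_length or float('+inf'), policy)
    #     token_list._fold(folded)   # fill folded.current/folded.done
    #     folded.finalize()          # flush any remaining partial line
    #     text = str(folded)         # the folded header text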
248
249 @property
250 def parts(self):
251 klass = self.__class__
252 this = []
253 for token in self:
254 if token.startswith_fws():
255 if this:
256 yield this[0] if len(this)==1 else klass(this)
257 this.clear()
258 end_ws = token.pop_trailing_ws()
259 this.append(token)
260 if end_ws:
261 yield klass(this)
262 this = [end_ws]
263 if this:
264 yield this[0] if len(this)==1 else klass(this)
265
266 def startswith_fws(self):
267 return self[0].startswith_fws()
268
269 def pop_leading_fws(self):
270 if self[0].token_type == 'fws':
271 return self.pop(0)
272 return self[0].pop_leading_fws()
273
274 def pop_trailing_ws(self):
275 if self[-1].token_type == 'cfws':
276 return self.pop(-1)
277 return self[-1].pop_trailing_ws()
278
279 @property
280 def has_fws(self):
281 for part in self:
282 if part.has_fws:
283 return True
284 return False
285
286 def has_leading_comment(self):
287 return self[0].has_leading_comment()
288
289 @property
290 def comments(self):
291 comments = []
292 for token in self:
293 comments.extend(token.comments)
294 return comments
295
296 def fold(self, *, policy):
297 # max_line_length 0/None means no limit, ie: infinitely long.
298 maxlen = policy.max_line_length or float("+inf")
299 folded = _Folded(maxlen, policy)
300 self._fold(folded)
301 folded.finalize()
302 return str(folded)
303
304 def as_encoded_word(self, charset):
305 # This works only for things returned by 'parts', which include
306 # the leading fws, if any, that should be used.
307 res = []
308 ws = self.pop_leading_fws()
309 if ws:
310 res.append(ws)
311 trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
312 res.append(_ew.encode(str(self), charset))
313 res.append(trailer)
314 return ''.join(res)
315
316 def cte_encode(self, charset, policy):
317 res = []
318 for part in self:
319 res.append(part.cte_encode(charset, policy))
320 return ''.join(res)
321
322 def _fold(self, folded):
323 for part in self.parts:
324 tstr = str(part)
325 tlen = len(tstr)
326 try:
327 str(part).encode('us-ascii')
328 except UnicodeEncodeError:
329 if any(isinstance(x, errors.UndecodableBytesDefect)
330 for x in part.all_defects):
331 charset = 'unknown-8bit'
332 else:
333 # XXX: this should be a policy setting
334 charset = 'utf-8'
335 tstr = part.cte_encode(charset, folded.policy)
336 tlen = len(tstr)
337 if folded.append_if_fits(part, tstr):
338 continue
339 # Peel off the leading whitespace if any and make it sticky, to
340 # avoid infinite recursion.
341 ws = part.pop_leading_fws()
342 if ws is not None:
343 # Peel off the leading whitespace and make it sticky, to
344 # avoid infinite recursion.
345 folded.stickyspace = str(part.pop(0))
346 if folded.append_if_fits(part):
347 continue
348 if part.has_fws:
349 part._fold(folded)
350 continue
351 # There are no fold points in this one; it is too long for a single
352 # line and can't be split...we just have to put it on its own line.
353 folded.append(tstr)
354 folded.newline()
355
356 def pprint(self, indent=''):
357 print('\n'.join(self._pp(indent='')))
358
359 def ppstr(self, indent=''):
360 return '\n'.join(self._pp(indent=''))
361
362 def _pp(self, indent=''):
363 yield '{}{}/{}('.format(
364 indent,
365 self.__class__.__name__,
366 self.token_type)
367 for token in self:
R David Murray97f43c02012-06-24 05:03:27 -0400368 if not hasattr(token, '_pp'):
369 yield (indent + ' !! invalid element in token '
370 'list: {!r}'.format(token))
371 else:
Philip Jenvey4993cc02012-10-01 12:53:43 -0700372 yield from token._pp(indent+' ')
R David Murray0b6f6c82012-05-25 18:42:14 -0400373 if self.defects:
374 extra = ' Defects: {}'.format(self.defects)
375 else:
376 extra = ''
377 yield '{}){}'.format(indent, extra)
378
379
380class WhiteSpaceTokenList(TokenList):
381
382 @property
383 def value(self):
384 return ' '
385
386 @property
387 def comments(self):
388 return [x.content for x in self if x.token_type=='comment']
389
390
391class UnstructuredTokenList(TokenList):
392
393 token_type = 'unstructured'
394
395 def _fold(self, folded):
R David Murray0b6f6c82012-05-25 18:42:14 -0400396 last_ew = None
397 for part in self.parts:
398 tstr = str(part)
399 is_ew = False
400 try:
401 str(part).encode('us-ascii')
402 except UnicodeEncodeError:
403 if any(isinstance(x, errors.UndecodableBytesDefect)
404 for x in part.all_defects):
405 charset = 'unknown-8bit'
406 else:
407 charset = 'utf-8'
408 if last_ew is not None:
409 # We've already done an EW, combine this one with it
410 # if there's room.
411 chunk = get_unstructured(
412 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
413 oldlastlen = sum(len(x) for x in folded.current[:last_ew])
414 schunk = str(chunk)
415 lchunk = len(schunk)
416 if oldlastlen + lchunk <= folded.maxlen:
417 del folded.current[last_ew:]
418 folded.append(schunk)
419 folded.lastlen = oldlastlen + lchunk
420 continue
421 tstr = part.as_encoded_word(charset)
422 is_ew = True
423 if folded.append_if_fits(part, tstr):
424 if is_ew:
425 last_ew = len(folded.current) - 1
426 continue
427 if is_ew or last_ew:
428 # It's too big to fit on the line, but since we've
429 # got encoded words we can use encoded word folding.
430 part._fold_as_ew(folded)
431 continue
432 # Peel off the leading whitespace if any and make it sticky, to
433 # avoid infinite recursion.
434 ws = part.pop_leading_fws()
435 if ws is not None:
436 folded.stickyspace = str(ws)
437 if folded.append_if_fits(part):
438 continue
439 if part.has_fws:
                part._fold(folded)
441 continue
442 # It can't be split...we just have to put it on its own line.
443 folded.append(tstr)
444 folded.newline()
445 last_ew = None
446
447 def cte_encode(self, charset, policy):
448 res = []
449 last_ew = None
450 for part in self:
451 spart = str(part)
452 try:
453 spart.encode('us-ascii')
454 res.append(spart)
455 except UnicodeEncodeError:
456 if last_ew is None:
457 res.append(part.cte_encode(charset, policy))
458 last_ew = len(res)
459 else:
460 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    res.append(tl.as_encoded_word(charset))
462 return ''.join(res)
463
464
465class Phrase(TokenList):
466
467 token_type = 'phrase'
468
469 def _fold(self, folded):
470 # As with Unstructured, we can have pure ASCII with or without
471 # surrogateescape encoded bytes, or we could have unicode. But this
472 # case is more complicated, since we have to deal with the various
473 # sub-token types and how they can be composed in the face of
        # unicode-that-needs-CTE-encoding, and the fact that if a token has a
        # comment that becomes a barrier across which we can't compose encoded
        # words.
477 last_ew = None
478 for part in self.parts:
479 tstr = str(part)
480 tlen = len(tstr)
481 has_ew = False
482 try:
483 str(part).encode('us-ascii')
484 except UnicodeEncodeError:
485 if any(isinstance(x, errors.UndecodableBytesDefect)
486 for x in part.all_defects):
487 charset = 'unknown-8bit'
488 else:
489 charset = 'utf-8'
490 if last_ew is not None and not part.has_leading_comment():
491 # We've already done an EW, let's see if we can combine
492 # this one with it. The last_ew logic ensures that all we
493 # have at this point is atoms, no comments or quoted
494 # strings. So we can treat the text between the last
495 # encoded word and the content of this token as
496 # unstructured text, and things will work correctly. But
497 # we have to strip off any trailing comment on this token
498 # first, and if it is a quoted string we have to pull out
499 # the content (we're encoding it, so it no longer needs to
500 # be quoted).
501 if part[-1].token_type == 'cfws' and part.comments:
502 remainder = part.pop(-1)
503 else:
504 remainder = ''
505 for i, token in enumerate(part):
506 if token.token_type == 'bare-quoted-string':
507 part[i] = UnstructuredTokenList(token[:])
508 chunk = get_unstructured(
509 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
510 schunk = str(chunk)
511 lchunk = len(schunk)
512 if last_ew + lchunk <= folded.maxlen:
513 del folded.current[last_ew:]
514 folded.append(schunk)
515 folded.lastlen = sum(len(x) for x in folded.current)
516 continue
517 tstr = part.as_encoded_word(charset)
518 tlen = len(tstr)
519 has_ew = True
520 if folded.append_if_fits(part, tstr):
521 if has_ew and not part.comments:
522 last_ew = len(folded.current) - 1
523 elif part.comments or part.token_type == 'quoted-string':
524 # If a comment is involved we can't combine EWs. And if a
525 # quoted string is involved, it's not worth the effort to
526 # try to combine them.
527 last_ew = None
528 continue
529 part._fold(folded)
530
531 def cte_encode(self, charset, policy):
532 res = []
533 last_ew = None
534 is_ew = False
535 for part in self:
536 spart = str(part)
537 try:
538 spart.encode('us-ascii')
539 res.append(spart)
540 except UnicodeEncodeError:
541 is_ew = True
542 if last_ew is None:
543 if not part.comments:
544 last_ew = len(res)
545 res.append(part.cte_encode(charset, policy))
546 elif not part.has_leading_comment():
547 if part[-1].token_type == 'cfws' and part.comments:
548 remainder = part.pop(-1)
549 else:
550 remainder = ''
551 for i, token in enumerate(part):
552 if token.token_type == 'bare-quoted-string':
553 part[i] = UnstructuredTokenList(token[:])
554 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
555 res[last_ew:] = [tl.as_encoded_word(charset)]
556 if part.comments or (not is_ew and part.token_type == 'quoted-string'):
557 last_ew = None
558 return ''.join(res)
559
560class Word(TokenList):
561
562 token_type = 'word'
563
564
565class CFWSList(WhiteSpaceTokenList):
566
567 token_type = 'cfws'
568
569 def has_leading_comment(self):
570 return bool(self.comments)
571
572
573class Atom(TokenList):
574
575 token_type = 'atom'
576
577
R David Murray97f43c02012-06-24 05:03:27 -0400578class Token(TokenList):
579
580 token_type = 'token'
581
582
R David Murray0b6f6c82012-05-25 18:42:14 -0400583class EncodedWord(TokenList):
584
585 token_type = 'encoded-word'
586 cte = None
587 charset = None
588 lang = None
589
590 @property
591 def encoded(self):
592 if self.cte is not None:
593 return self.cte
        return _ew.encode(str(self), self.charset)
595
596
597
598class QuotedString(TokenList):
599
600 token_type = 'quoted-string'
601
602 @property
603 def content(self):
604 for x in self:
605 if x.token_type == 'bare-quoted-string':
606 return x.value
607
608 @property
609 def quoted_value(self):
610 res = []
611 for x in self:
612 if x.token_type == 'bare-quoted-string':
613 res.append(str(x))
614 else:
615 res.append(x.value)
616 return ''.join(res)
617
R David Murray97f43c02012-06-24 05:03:27 -0400618 @property
619 def stripped_value(self):
620 for token in self:
621 if token.token_type == 'bare-quoted-string':
622 return token.value
623
R David Murray0b6f6c82012-05-25 18:42:14 -0400624
625class BareQuotedString(QuotedString):
626
627 token_type = 'bare-quoted-string'
628
629 def __str__(self):
R David Murray97f43c02012-06-24 05:03:27 -0400630 return quote_string(''.join(str(x) for x in self))
R David Murray0b6f6c82012-05-25 18:42:14 -0400631
632 @property
633 def value(self):
634 return ''.join(str(x) for x in self)
635
636
637class Comment(WhiteSpaceTokenList):
638
639 token_type = 'comment'
640
641 def __str__(self):
642 return ''.join(sum([
643 ["("],
644 [self.quote(x) for x in self],
645 [")"],
646 ], []))
647
648 def quote(self, value):
649 if value.token_type == 'comment':
650 return str(value)
651 return str(value).replace('\\', '\\\\').replace(
652 '(', '\(').replace(
653 ')', '\)')
654
655 @property
656 def content(self):
657 return ''.join(str(x) for x in self)
658
659 @property
660 def comments(self):
661 return [self.content]
662
663class AddressList(TokenList):
664
665 token_type = 'address-list'
666
667 @property
668 def addresses(self):
669 return [x for x in self if x.token_type=='address']
670
671 @property
672 def mailboxes(self):
673 return sum((x.mailboxes
674 for x in self if x.token_type=='address'), [])
675
676 @property
677 def all_mailboxes(self):
678 return sum((x.all_mailboxes
679 for x in self if x.token_type=='address'), [])
680
681
682class Address(TokenList):
683
684 token_type = 'address'
685
686 @property
687 def display_name(self):
688 if self[0].token_type == 'group':
689 return self[0].display_name
690
691 @property
692 def mailboxes(self):
693 if self[0].token_type == 'mailbox':
694 return [self[0]]
695 elif self[0].token_type == 'invalid-mailbox':
696 return []
697 return self[0].mailboxes
698
699 @property
700 def all_mailboxes(self):
701 if self[0].token_type == 'mailbox':
702 return [self[0]]
703 elif self[0].token_type == 'invalid-mailbox':
704 return [self[0]]
705 return self[0].all_mailboxes
706
707class MailboxList(TokenList):
708
709 token_type = 'mailbox-list'
710
711 @property
712 def mailboxes(self):
713 return [x for x in self if x.token_type=='mailbox']
714
715 @property
716 def all_mailboxes(self):
717 return [x for x in self
718 if x.token_type in ('mailbox', 'invalid-mailbox')]
719
720
721class GroupList(TokenList):
722
723 token_type = 'group-list'
724
725 @property
726 def mailboxes(self):
727 if not self or self[0].token_type != 'mailbox-list':
728 return []
729 return self[0].mailboxes
730
731 @property
732 def all_mailboxes(self):
733 if not self or self[0].token_type != 'mailbox-list':
734 return []
735 return self[0].all_mailboxes
736
737
738class Group(TokenList):
739
740 token_type = "group"
741
742 @property
743 def mailboxes(self):
744 if self[2].token_type != 'group-list':
745 return []
746 return self[2].mailboxes
747
748 @property
749 def all_mailboxes(self):
750 if self[2].token_type != 'group-list':
751 return []
752 return self[2].all_mailboxes
753
754 @property
755 def display_name(self):
756 return self[0].display_name
757
758
759class NameAddr(TokenList):
760
761 token_type = 'name-addr'
762
763 @property
764 def display_name(self):
765 if len(self) == 1:
766 return None
767 return self[0].display_name
768
769 @property
770 def local_part(self):
771 return self[-1].local_part
772
773 @property
774 def domain(self):
775 return self[-1].domain
776
777 @property
778 def route(self):
779 return self[-1].route
780
781 @property
782 def addr_spec(self):
783 return self[-1].addr_spec
784
785
786class AngleAddr(TokenList):
787
788 token_type = 'angle-addr'
789
790 @property
791 def local_part(self):
792 for x in self:
793 if x.token_type == 'addr-spec':
794 return x.local_part
795
796 @property
797 def domain(self):
798 for x in self:
799 if x.token_type == 'addr-spec':
800 return x.domain
801
802 @property
803 def route(self):
804 for x in self:
805 if x.token_type == 'obs-route':
806 return x.domains
807
808 @property
809 def addr_spec(self):
810 for x in self:
811 if x.token_type == 'addr-spec':
812 return x.addr_spec
R David Murray032eed32012-05-26 14:31:12 -0400813 else:
814 return '<>'
R David Murray0b6f6c82012-05-25 18:42:14 -0400815
816
817class ObsRoute(TokenList):
818
819 token_type = 'obs-route'
820
821 @property
822 def domains(self):
823 return [x.domain for x in self if x.token_type == 'domain']
824
825
826class Mailbox(TokenList):
827
828 token_type = 'mailbox'
829
830 @property
831 def display_name(self):
832 if self[0].token_type == 'name-addr':
833 return self[0].display_name
834
835 @property
836 def local_part(self):
837 return self[0].local_part
838
839 @property
840 def domain(self):
841 return self[0].domain
842
843 @property
844 def route(self):
845 if self[0].token_type == 'name-addr':
846 return self[0].route
847
848 @property
849 def addr_spec(self):
850 return self[0].addr_spec
851
852
853class InvalidMailbox(TokenList):
854
855 token_type = 'invalid-mailbox'
856
857 @property
858 def display_name(self):
859 return None
860
861 local_part = domain = route = addr_spec = display_name
862
863
864class Domain(TokenList):
865
866 token_type = 'domain'
867
868 @property
869 def domain(self):
870 return ''.join(super().value.split())
871
872
873class DotAtom(TokenList):
874
875 token_type = 'dot-atom'
876
877
878class DotAtomText(TokenList):
879
880 token_type = 'dot-atom-text'
881
882
883class AddrSpec(TokenList):
884
885 token_type = 'addr-spec'
886
887 @property
888 def local_part(self):
889 return self[0].local_part
890
891 @property
892 def domain(self):
893 if len(self) < 3:
894 return None
895 return self[-1].domain
896
897 @property
898 def value(self):
899 if len(self) < 3:
900 return self[0].value
901 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
902
903 @property
904 def addr_spec(self):
905 nameset = set(self.local_part)
906 if len(nameset) > len(nameset-DOT_ATOM_ENDS):
907 lp = quote_string(self.local_part)
908 else:
909 lp = self.local_part
910 if self.domain is not None:
911 return lp + '@' + self.domain
912 return lp
913
914
915class ObsLocalPart(TokenList):
916
917 token_type = 'obs-local-part'
918
919
920class DisplayName(Phrase):
921
922 token_type = 'display-name'
923
924 @property
925 def display_name(self):
926 res = TokenList(self)
927 if res[0].token_type == 'cfws':
928 res.pop(0)
929 else:
930 if res[0][0].token_type == 'cfws':
931 res[0] = TokenList(res[0][1:])
932 if res[-1].token_type == 'cfws':
933 res.pop()
934 else:
935 if res[-1][-1].token_type == 'cfws':
936 res[-1] = TokenList(res[-1][:-1])
937 return res.value
938
939 @property
940 def value(self):
941 quote = False
942 if self.defects:
943 quote = True
944 else:
945 for x in self:
946 if x.token_type == 'quoted-string':
947 quote = True
948 if quote:
949 pre = post = ''
950 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
951 pre = ' '
952 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
953 post = ' '
954 return pre+quote_string(self.display_name)+post
955 else:
956 return super().value
957
958
959class LocalPart(TokenList):
960
961 token_type = 'local-part'
962
963 @property
964 def value(self):
965 if self[0].token_type == "quoted-string":
966 return self[0].quoted_value
967 else:
968 return self[0].value
969
970 @property
971 def local_part(self):
972 # Strip whitespace from front, back, and around dots.
973 res = [DOT]
974 last = DOT
975 last_is_tl = False
976 for tok in self[0] + [DOT]:
977 if tok.token_type == 'cfws':
978 continue
979 if (last_is_tl and tok.token_type == 'dot' and
980 last[-1].token_type == 'cfws'):
981 res[-1] = TokenList(last[:-1])
982 is_tl = isinstance(tok, TokenList)
983 if (is_tl and last.token_type == 'dot' and
984 tok[0].token_type == 'cfws'):
985 res.append(TokenList(tok[1:]))
986 else:
987 res.append(tok)
988 last = res[-1]
989 last_is_tl = is_tl
990 res = TokenList(res[1:-1])
991 return res.value
992
993
994class DomainLiteral(TokenList):
995
996 token_type = 'domain-literal'
997
998 @property
999 def domain(self):
1000 return ''.join(super().value.split())
1001
1002 @property
1003 def ip(self):
1004 for x in self:
1005 if x.token_type == 'ptext':
1006 return x.value
1007
1008
R David Murray97f43c02012-06-24 05:03:27 -04001009class MIMEVersion(TokenList):
1010
1011 token_type = 'mime-version'
1012 major = None
1013 minor = None
1014
1015
1016class Parameter(TokenList):
1017
1018 token_type = 'parameter'
1019 sectioned = False
1020 extended = False
1021 charset = 'us-ascii'
1022
1023 @property
1024 def section_number(self):
1025 # Because the first token, the attribute (name) eats CFWS, the second
1026 # token is always the section if there is one.
1027 return self[1].number if self.sectioned else 0
1028
1029 @property
1030 def param_value(self):
1031 # This is part of the "handle quoted extended parameters" hack.
1032 for token in self:
1033 if token.token_type == 'value':
1034 return token.stripped_value
1035 if token.token_type == 'quoted-string':
1036 for token in token:
1037 if token.token_type == 'bare-quoted-string':
1038 for token in token:
1039 if token.token_type == 'value':
1040 return token.stripped_value
1041 return ''
1042
1043
1044class InvalidParameter(Parameter):
1045
1046 token_type = 'invalid-parameter'
1047
1048
1049class Attribute(TokenList):
1050
1051 token_type = 'attribute'
1052
1053 @property
1054 def stripped_value(self):
1055 for token in self:
1056 if token.token_type.endswith('attrtext'):
1057 return token.value
1058
1059class Section(TokenList):
1060
1061 token_type = 'section'
1062 number = None
1063
1064
1065class Value(TokenList):
1066
1067 token_type = 'value'
1068
1069 @property
1070 def stripped_value(self):
1071 token = self[0]
1072 if token.token_type == 'cfws':
1073 token = self[1]
1074 if token.token_type.endswith(
1075 ('quoted-string', 'attribute', 'extended-attribute')):
1076 return token.stripped_value
1077 return self.value
1078
1079
1080class MimeParameters(TokenList):
1081
1082 token_type = 'mime-parameters'
1083
1084 @property
1085 def params(self):
1086 # The RFC specifically states that the ordering of parameters is not
1087 # guaranteed and may be reordered by the transport layer. So we have
1088 # to assume the RFC 2231 pieces can come in any order. However, we
1089 # output them in the order that we first see a given name, which gives
1090 # us a stable __str__.
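        #
        # For reference, the RFC 2231 sectioned/extended form reassembled
        # here looks roughly like this (illustrative header fragment):
        #
        #     Content-Disposition: attachment;
        #      filename*0*=utf-8''caf%C3%A9%20;
        #      filename*1*=menu.txt
        #
        # which yields a single 'filename' parameter whose decoded value is
        # 'café menu.txt'.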
1091 params = OrderedDict()
1092 for token in self:
1093 if not token.token_type.endswith('parameter'):
1094 continue
1095 if token[0].token_type != 'attribute':
1096 continue
1097 name = token[0].value.strip()
1098 if name not in params:
1099 params[name] = []
1100 params[name].append((token.section_number, token))
1101 for name, parts in params.items():
R David Murray7d0325d2015-03-29 21:53:05 -04001102 parts = sorted(parts, key=itemgetter(0))
1103 first_param = parts[0][1]
1104 charset = first_param.charset
1105 # Our arbitrary error recovery is to ignore duplicate parameters,
1106 # to use appearance order if there are duplicate rfc 2231 parts,
1107 # and to ignore gaps. This mimics the error recovery of get_param.
1108 if not first_param.extended and len(parts) > 1:
1109 if parts[1][0] == 0:
1110 parts[1][1].defects.append(errors.InvalidHeaderDefect(
1111 'duplicate parameter name; duplicate(s) ignored'))
1112 parts = parts[:1]
1113 # Else assume the *0* was missing...note that this is different
1114 # from get_param, but we registered a defect for this earlier.
R David Murray97f43c02012-06-24 05:03:27 -04001115 value_parts = []
R David Murray7d0325d2015-03-29 21:53:05 -04001116 i = 0
1117 for section_number, param in parts:
R David Murray97f43c02012-06-24 05:03:27 -04001118 if section_number != i:
R David Murray7d0325d2015-03-29 21:53:05 -04001119 # We could get fancier here and look for a complete
1120 # duplicate extended parameter and ignore the second one
1121 # seen. But we're not doing that. The old code didn't.
1122 if not param.extended:
1123 param.defects.append(errors.InvalidHeaderDefect(
1124 'duplicate parameter name; duplicate ignored'))
1125 continue
1126 else:
1127 param.defects.append(errors.InvalidHeaderDefect(
1128 "inconsistent RFC2231 parameter numbering"))
1129 i += 1
R David Murray97f43c02012-06-24 05:03:27 -04001130 value = param.param_value
1131 if param.extended:
1132 try:
1133 value = urllib.parse.unquote_to_bytes(value)
1134 except UnicodeEncodeError:
1135 # source had surrogate escaped bytes. What we do now
1136 # is a bit of an open question. I'm not sure this is
1137 # the best choice, but it is what the old algorithm did
1138 value = urllib.parse.unquote(value, encoding='latin-1')
1139 else:
1140 try:
1141 value = value.decode(charset, 'surrogateescape')
1142 except LookupError:
1143 # XXX: there should really be a custom defect for
1144 # unknown character set to make it easy to find,
1145 # because otherwise unknown charset is a silent
1146 # failure.
1147 value = value.decode('us-ascii', 'surrogateescape')
1148 if utils._has_surrogates(value):
1149 param.defects.append(errors.UndecodableBytesDefect())
1150 value_parts.append(value)
1151 value = ''.join(value_parts)
1152 yield name, value
1153
1154 def __str__(self):
1155 params = []
1156 for name, value in self.params:
1157 if value:
1158 params.append('{}={}'.format(name, quote_string(value)))
1159 else:
1160 params.append(name)
1161 params = '; '.join(params)
1162 return ' ' + params if params else ''
1163
1164
1165class ParameterizedHeaderValue(TokenList):
1166
1167 @property
1168 def params(self):
1169 for token in reversed(self):
1170 if token.token_type == 'mime-parameters':
1171 return token.params
1172 return {}
1173
1174 @property
1175 def parts(self):
1176 if self and self[-1].token_type == 'mime-parameters':
1177 # We don't want to start a new line if all of the params don't fit
1178 # after the value, so unwrap the parameter list.
1179 return TokenList(self[:-1] + self[-1])
1180 return TokenList(self).parts
1181
1182
1183class ContentType(ParameterizedHeaderValue):
1184
1185 token_type = 'content-type'
1186 maintype = 'text'
1187 subtype = 'plain'
1188
1189
1190class ContentDisposition(ParameterizedHeaderValue):
1191
1192 token_type = 'content-disposition'
1193 content_disposition = None
1194
1195
1196class ContentTransferEncoding(TokenList):
1197
1198 token_type = 'content-transfer-encoding'
1199 cte = '7bit'
1200
1201
R David Murray0b6f6c82012-05-25 18:42:14 -04001202class HeaderLabel(TokenList):
1203
1204 token_type = 'header-label'
1205
1206
1207class Header(TokenList):
1208
1209 token_type = 'header'
1210
1211 def _fold(self, folded):
1212 folded.append(str(self.pop(0)))
1213 folded.lastlen = len(folded.current[0])
1214 # The first line of the header is different from all others: we don't
1215 # want to start a new object on a new line if it has any fold points in
1216 # it that would allow part of it to be on the first header line.
1217 # Further, if the first fold point would fit on the new line, we want
1218 # to do that, but if it doesn't we want to put it on the first line.
1219 # Folded supports this via the stickyspace attribute. If this
1220 # attribute is not None, it does the special handling.
1221 folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
1222 rest = self.pop(0)
1223 if self:
1224 raise ValueError("Malformed Header token list")
1225 rest._fold(folded)
1226
1227
1228#
1229# Terminal classes and instances
1230#
1231
1232class Terminal(str):
1233
1234 def __new__(cls, value, token_type):
1235 self = super().__new__(cls, value)
1236 self.token_type = token_type
1237 self.defects = []
1238 return self
1239
1240 def __repr__(self):
1241 return "{}({})".format(self.__class__.__name__, super().__repr__())
1242
1243 @property
1244 def all_defects(self):
1245 return list(self.defects)
1246
1247 def _pp(self, indent=''):
1248 return ["{}{}/{}({}){}".format(
1249 indent,
1250 self.__class__.__name__,
1251 self.token_type,
1252 super().__repr__(),
1253 '' if not self.defects else ' {}'.format(self.defects),
1254 )]
1255
1256 def cte_encode(self, charset, policy):
1257 value = str(self)
1258 try:
1259 value.encode('us-ascii')
1260 return value
1261 except UnicodeEncodeError:
1262 return _ew.encode(value, charset)
1263
1264 def pop_trailing_ws(self):
1265 # This terminates the recursion.
1266 return None
1267
1268 def pop_leading_fws(self):
1269 # This terminates the recursion.
1270 return None
1271
1272 @property
1273 def comments(self):
1274 return []
1275
1276 def has_leading_comment(self):
1277 return False
1278
1279 def __getnewargs__(self):
1280 return(str(self), self.token_type)
1281
1282
1283class WhiteSpaceTerminal(Terminal):
1284
1285 @property
1286 def value(self):
1287 return ' '
1288
1289 def startswith_fws(self):
1290 return True
1291
1292 has_fws = True
1293
1294
1295class ValueTerminal(Terminal):
1296
1297 @property
1298 def value(self):
1299 return self
1300
1301 def startswith_fws(self):
1302 return False
1303
1304 has_fws = False
1305
1306 def as_encoded_word(self, charset):
1307 return _ew.encode(str(self), charset)
1308
1309
1310class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
1311
1312 @property
1313 def value(self):
1314 return ''
1315
1316 @property
1317 def encoded(self):
1318 return self[:]
1319
1320 def __str__(self):
1321 return ''
1322
1323 has_fws = True
1324
1325
1326# XXX these need to become classes and used as instances so
1327# that a program can't change them in a parse tree and screw
1328# up other parse trees. Maybe should have tests for that, too.
1329DOT = ValueTerminal('.', 'dot')
1330ListSeparator = ValueTerminal(',', 'list-separator')
1331RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
1332
1333#
1334# Parser
1335#
1336
# Parse strings according to RFC822/2047/2822/5322 rules.
#
# This is a stateless parser. Each get_XXX function accepts a string and
# returns either a Terminal or a TokenList representing the RFC object named
# by the method and a string containing the remaining unparsed characters
# from the input. Thus a parser method consumes the next syntactic construct
# of a given type and returns a token representing the construct plus the
# unparsed remainder of the input string.
#
# For example, if the first element of a structured header is a 'phrase',
# then:
#
#     phrase, value = get_phrase(value)
#
# returns the complete phrase from the start of the string value, plus any
# characters left in the string after the phrase is removed.
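#
# A typical caller therefore drives the parse with a loop of the general
# shape below (illustrative only; 'value' holds the header value string and
# the particular get_XXX calls depend on the grammar of the header):
#
#     tokens = []
#     while value:
#         token, value = get_phrase(value)
#         tokens.append(token)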
R David Murray0b6f6c82012-05-25 18:42:14 -04001353
1354_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
1355_non_atom_end_matcher = re.compile(r"[^{}]+".format(
1356 ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']','\]'))).match
1357_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
R David Murray97f43c02012-06-24 05:03:27 -04001358_non_token_end_matcher = re.compile(r"[^{}]+".format(
1359 ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']','\]'))).match
1360_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
1361 ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']','\]'))).match
1362_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
1363 ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
1364 '\\','\\\\').replace(']','\]'))).match
R David Murray0b6f6c82012-05-25 18:42:14 -04001365
1366def _validate_xtext(xtext):
1367 """If input token contains ASCII non-printables, register a defect."""
1368
1369 non_printables = _non_printable_finder(xtext)
1370 if non_printables:
1371 xtext.defects.append(errors.NonPrintableDefect(non_printables))
1372 if utils._has_surrogates(xtext):
1373 xtext.defects.append(errors.UndecodableBytesDefect(
1374 "Non-ASCII characters found in header token"))
1375
1376def _get_ptext_to_endchars(value, endchars):
1377 """Scan printables/quoted-pairs until endchars and return unquoted ptext.
1378
1379 This function turns a run of qcontent, ccontent-without-comments, or
1380 dtext-with-quoted-printables into a single string by unquoting any
1381 quoted printables. It returns the string, the remaining value, and
1382 a flag that is True iff there were any quoted printables decoded.
1383
1384 """
1385 fragment, *remainder = _wsp_splitter(value, 1)
1386 vchars = []
1387 escape = False
1388 had_qp = False
1389 for pos in range(len(fragment)):
1390 if fragment[pos] == '\\':
1391 if escape:
1392 escape = False
1393 had_qp = True
1394 else:
1395 escape = True
1396 continue
1397 if escape:
1398 escape = False
1399 elif fragment[pos] in endchars:
1400 break
1401 vchars.append(fragment[pos])
1402 else:
1403 pos = pos + 1
1404 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1405
R David Murray0b6f6c82012-05-25 18:42:14 -04001406def get_fws(value):
1407 """FWS = 1*WSP
1408
1409 This isn't the RFC definition. We're using fws to represent tokens where
1410 folding can be done, but when we are parsing the *un*folding has already
1411 been done so we don't need to watch out for CRLF.
1412
1413 """
1414 newvalue = value.lstrip()
1415 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
1416 return fws, newvalue
1417
1418def get_encoded_word(value):
1419 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
1420
1421 """
1422 ew = EncodedWord()
1423 if not value.startswith('=?'):
1424 raise errors.HeaderParseError(
1425 "expected encoded word but found {}".format(value))
1426 tok, *remainder = value[2:].split('?=', 1)
1427 if tok == value[2:]:
1428 raise errors.HeaderParseError(
1429 "expected encoded word but found {}".format(value))
1430 remstr = ''.join(remainder)
R David Murray65171b22013-07-11 15:52:57 -04001431 if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
1432 # The ? after the CTE was followed by an encoded word escape (=XX).
R David Murray0b6f6c82012-05-25 18:42:14 -04001433 rest, *remainder = remstr.split('?=', 1)
1434 tok = tok + '?=' + rest
1435 if len(tok.split()) > 1:
1436 ew.defects.append(errors.InvalidHeaderDefect(
1437 "whitespace inside encoded word"))
1438 ew.cte = value
1439 value = ''.join(remainder)
1440 try:
1441 text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
1442 except ValueError:
1443 raise errors.HeaderParseError(
1444 "encoded word format invalid: '{}'".format(ew.cte))
1445 ew.charset = charset
1446 ew.lang = lang
1447 ew.defects.extend(defects)
1448 while text:
1449 if text[0] in WSP:
1450 token, text = get_fws(text)
1451 ew.append(token)
1452 continue
1453 chars, *remainder = _wsp_splitter(text, 1)
1454 vtext = ValueTerminal(chars, 'vtext')
1455 _validate_xtext(vtext)
1456 ew.append(vtext)
1457 text = ''.join(remainder)
1458 return ew, value
1459
1460def get_unstructured(value):
1461 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
1462 obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
1463 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
1464
1465 obs-NO-WS-CTL is control characters except WSP/CR/LF.
1466
1467 So, basically, we have printable runs, plus control characters or nulls in
1468 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
1469 obsolete syntax in its specification, but requires whitespace on either
1470 side of the encoded words, I can see no reason to need to separate the
1471 non-printable-non-whitespace from the printable runs if they occur, so we
1472 parse this into xtext tokens separated by WSP tokens.
1473
1474 Because an 'unstructured' value must by definition constitute the entire
1475 value, this 'get' routine does not return a remaining value, only the
1476 parsed TokenList.
1477
1478 """
1479 # XXX: but what about bare CR and LF? They might signal the start or
R David Murray65171b22013-07-11 15:52:57 -04001480 # end of an encoded word. YAGNI for now, since our current parsers
1481 # will never send us strings with bare CR or LF.
R David Murray0b6f6c82012-05-25 18:42:14 -04001482
1483 unstructured = UnstructuredTokenList()
1484 while value:
1485 if value[0] in WSP:
1486 token, value = get_fws(value)
1487 unstructured.append(token)
1488 continue
1489 if value.startswith('=?'):
1490 try:
1491 token, value = get_encoded_word(value)
1492 except errors.HeaderParseError:
R David Murray65171b22013-07-11 15:52:57 -04001493 # XXX: Need to figure out how to register defects when
1494 # appropriate here.
R David Murray0b6f6c82012-05-25 18:42:14 -04001495 pass
1496 else:
1497 have_ws = True
1498 if len(unstructured) > 0:
1499 if unstructured[-1].token_type != 'fws':
1500 unstructured.defects.append(errors.InvalidHeaderDefect(
1501 "missing whitespace before encoded word"))
1502 have_ws = False
1503 if have_ws and len(unstructured) > 1:
1504 if unstructured[-2].token_type == 'encoded-word':
1505 unstructured[-1] = EWWhiteSpaceTerminal(
1506 unstructured[-1], 'fws')
1507 unstructured.append(token)
1508 continue
1509 tok, *remainder = _wsp_splitter(value, 1)
1510 vtext = ValueTerminal(tok, 'vtext')
1511 _validate_xtext(vtext)
1512 unstructured.append(vtext)
1513 value = ''.join(remainder)
1514 return unstructured
1515
1516def get_qp_ctext(value):
1517 """ctext = <printable ascii except \ ( )>
1518
1519 This is not the RFC ctext, since we are handling nested comments in comment
1520 and unquoting quoted-pairs here. We allow anything except the '()'
    characters, but if we find any ASCII other than the RFC defined printable
    ASCII, a NonPrintableDefect is added to the token's defects list. Since
    quoted pairs are converted to their unquoted values, what is returned is
    a 'ptext' token. In this case it is a WhiteSpaceTerminal, so its value
    is ' '.
1526
1527 """
1528 ptext, value, _ = _get_ptext_to_endchars(value, '()')
1529 ptext = WhiteSpaceTerminal(ptext, 'ptext')
1530 _validate_xtext(ptext)
1531 return ptext, value
1532
1533def get_qcontent(value):
1534 """qcontent = qtext / quoted-pair
1535
1536 We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
1538 added to the token's defects list. Any quoted pairs are converted to their
1539 unquoted values, so what is returned is a 'ptext' token. In this case it
1540 is a ValueTerminal.
1541
1542 """
1543 ptext, value, _ = _get_ptext_to_endchars(value, '"')
1544 ptext = ValueTerminal(ptext, 'ptext')
1545 _validate_xtext(ptext)
1546 return ptext, value
1547
1548def get_atext(value):
1549 """atext = <matches _atext_matcher>
1550
1551 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
1552 the token's defects list if we find non-atext characters.
1553 """
1554 m = _non_atom_end_matcher(value)
1555 if not m:
1556 raise errors.HeaderParseError(
1557 "expected atext but found '{}'".format(value))
1558 atext = m.group()
1559 value = value[len(atext):]
1560 atext = ValueTerminal(atext, 'atext')
1561 _validate_xtext(atext)
1562 return atext, value
1563
1564def get_bare_quoted_string(value):
1565 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
1566
1567 A quoted-string without the leading or trailing white space. Its
1568 value is the text between the quote marks, with whitespace
1569 preserved and quoted pairs decoded.
1570 """
1571 if value[0] != '"':
1572 raise errors.HeaderParseError(
1573 "expected '\"' but found '{}'".format(value))
1574 bare_quoted_string = BareQuotedString()
1575 value = value[1:]
1576 while value and value[0] != '"':
1577 if value[0] in WSP:
1578 token, value = get_fws(value)
R David Murray0400d332014-02-08 13:12:00 -05001579 elif value[:2] == '=?':
1580 try:
1581 token, value = get_encoded_word(value)
1582 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1583 "encoded word inside quoted string"))
1584 except errors.HeaderParseError:
1585 token, value = get_qcontent(value)
R David Murray0b6f6c82012-05-25 18:42:14 -04001586 else:
1587 token, value = get_qcontent(value)
1588 bare_quoted_string.append(token)
1589 if not value:
1590 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1591 "end of header inside quoted string"))
1592 return bare_quoted_string, value
1593 return bare_quoted_string, value[1:]
1594
1595def get_comment(value):
1596 """comment = "(" *([FWS] ccontent) [FWS] ")"
1597 ccontent = ctext / quoted-pair / comment
1598
1599 We handle nested comments here, and quoted-pair in our qp-ctext routine.
1600 """
1601 if value and value[0] != '(':
1602 raise errors.HeaderParseError(
1603 "expected '(' but found '{}'".format(value))
1604 comment = Comment()
1605 value = value[1:]
1606 while value and value[0] != ")":
1607 if value[0] in WSP:
1608 token, value = get_fws(value)
1609 elif value[0] == '(':
1610 token, value = get_comment(value)
1611 else:
1612 token, value = get_qp_ctext(value)
1613 comment.append(token)
1614 if not value:
1615 comment.defects.append(errors.InvalidHeaderDefect(
1616 "end of header inside comment"))
1617 return comment, value
1618 return comment, value[1:]
1619
1620def get_cfws(value):
1621 """CFWS = (1*([FWS] comment) [FWS]) / FWS
1622
1623 """
1624 cfws = CFWSList()
1625 while value and value[0] in CFWS_LEADER:
1626 if value[0] in WSP:
1627 token, value = get_fws(value)
1628 else:
1629 token, value = get_comment(value)
1630 cfws.append(token)
1631 return cfws, value
1632
1633def get_quoted_string(value):
1634 """quoted-string = [CFWS] <bare-quoted-string> [CFWS]
1635
1636 'bare-quoted-string' is an intermediate class defined by this
1637 parser and not by the RFC grammar. It is the quoted string
1638 without any attached CFWS.
1639 """
1640 quoted_string = QuotedString()
1641 if value and value[0] in CFWS_LEADER:
1642 token, value = get_cfws(value)
1643 quoted_string.append(token)
1644 token, value = get_bare_quoted_string(value)
1645 quoted_string.append(token)
1646 if value and value[0] in CFWS_LEADER:
1647 token, value = get_cfws(value)
1648 quoted_string.append(token)
1649 return quoted_string, value
1650
1651def get_atom(value):
1652 """atom = [CFWS] 1*atext [CFWS]
1653
R David Murray923512f2013-07-12 16:00:28 -04001654 An atom could be an rfc2047 encoded word.
R David Murray0b6f6c82012-05-25 18:42:14 -04001655 """
1656 atom = Atom()
1657 if value and value[0] in CFWS_LEADER:
1658 token, value = get_cfws(value)
1659 atom.append(token)
1660 if value and value[0] in ATOM_ENDS:
1661 raise errors.HeaderParseError(
1662 "expected atom but found '{}'".format(value))
R David Murray923512f2013-07-12 16:00:28 -04001663 if value.startswith('=?'):
1664 try:
1665 token, value = get_encoded_word(value)
1666 except errors.HeaderParseError:
1667 # XXX: need to figure out how to register defects when
1668 # appropriate here.
1669 token, value = get_atext(value)
1670 else:
1671 token, value = get_atext(value)
R David Murray0b6f6c82012-05-25 18:42:14 -04001672 atom.append(token)
1673 if value and value[0] in CFWS_LEADER:
1674 token, value = get_cfws(value)
1675 atom.append(token)
1676 return atom, value
1677
1678def get_dot_atom_text(value):
1679 """ dot-text = 1*atext *("." 1*atext)
1680
1681 """
1682 dot_atom_text = DotAtomText()
1683 if not value or value[0] in ATOM_ENDS:
1684 raise errors.HeaderParseError("expected atom at a start of "
1685 "dot-atom-text but found '{}'".format(value))
1686 while value and value[0] not in ATOM_ENDS:
1687 token, value = get_atext(value)
1688 dot_atom_text.append(token)
1689 if value and value[0] == '.':
1690 dot_atom_text.append(DOT)
1691 value = value[1:]
1692 if dot_atom_text[-1] is DOT:
1693 raise errors.HeaderParseError("expected atom at end of dot-atom-text "
1694 "but found '{}'".format('.'+value))
1695 return dot_atom_text, value
1696
1697def get_dot_atom(value):
1698 """ dot-atom = [CFWS] dot-atom-text [CFWS]
1699
R David Murray923512f2013-07-12 16:00:28 -04001700 Any place we can have a dot atom, we could instead have an rfc2047 encoded
1701 word.
R David Murray0b6f6c82012-05-25 18:42:14 -04001702 """
1703 dot_atom = DotAtom()
1704 if value[0] in CFWS_LEADER:
1705 token, value = get_cfws(value)
1706 dot_atom.append(token)
R David Murray923512f2013-07-12 16:00:28 -04001707 if value.startswith('=?'):
1708 try:
1709 token, value = get_encoded_word(value)
1710 except errors.HeaderParseError:
1711 # XXX: need to figure out how to register defects when
1712 # appropriate here.
1713 token, value = get_dot_atom_text(value)
1714 else:
1715 token, value = get_dot_atom_text(value)
R David Murray0b6f6c82012-05-25 18:42:14 -04001716 dot_atom.append(token)
1717 if value and value[0] in CFWS_LEADER:
1718 token, value = get_cfws(value)
1719 dot_atom.append(token)
1720 return dot_atom, value
1721
1722def get_word(value):
1723 """word = atom / quoted-string
1724
1725 Either atom or quoted-string may start with CFWS. We have to peel off this
1726 CFWS first to determine which type of word to parse. Afterward we splice
1727 the leading CFWS, if any, into the parsed sub-token.
1728
    If neither an atom nor a quoted-string is found before the next special, a
1730 HeaderParseError is raised.
1731
1732 The token returned is either an Atom or a QuotedString, as appropriate.
1733 This means the 'word' level of the formal grammar is not represented in the
1734 parse tree; this is because having that extra layer when manipulating the
1735 parse tree is more confusing than it is helpful.
1736
1737 """
1738 if value[0] in CFWS_LEADER:
1739 leader, value = get_cfws(value)
1740 else:
1741 leader = None
1742 if value[0]=='"':
1743 token, value = get_quoted_string(value)
1744 elif value[0] in SPECIALS:
1745 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
1746 "but found '{}'".format(value))
1747 else:
1748 token, value = get_atom(value)
1749 if leader is not None:
1750 token[:0] = [leader]
1751 return token, value
1752
1753def get_phrase(value):
1754 """ phrase = 1*word / obs-phrase
1755 obs-phrase = word *(word / "." / CFWS)
1756
1757 This means a phrase can be a sequence of words, periods, and CFWS in any
1758 order as long as it starts with at least one word. If anything other than
1759 words is detected, an ObsoleteHeaderDefect is added to the token's defect
1760 list. We also accept a phrase that starts with CFWS followed by a dot;
1761 this is registered as an InvalidHeaderDefect, since it is not supported by
1762 even the obsolete grammar.
1763
1764 """
1765 phrase = Phrase()
1766 try:
1767 token, value = get_word(value)
1768 phrase.append(token)
1769 except errors.HeaderParseError:
1770 phrase.defects.append(errors.InvalidHeaderDefect(
1771 "phrase does not start with word"))
1772 while value and value[0] not in PHRASE_ENDS:
1773 if value[0]=='.':
1774 phrase.append(DOT)
1775 phrase.defects.append(errors.ObsoleteHeaderDefect(
1776 "period in 'phrase'"))
1777 value = value[1:]
1778 else:
1779 try:
1780 token, value = get_word(value)
1781 except errors.HeaderParseError:
1782 if value[0] in CFWS_LEADER:
1783 token, value = get_cfws(value)
1784 phrase.defects.append(errors.ObsoleteHeaderDefect(
1785 "comment found without atom"))
1786 else:
1787 raise
1788 phrase.append(token)
1789 return phrase, value
1790
1791def get_local_part(value):
1792 """ local-part = dot-atom / quoted-string / obs-local-part
1793
1794 """
1795 local_part = LocalPart()
1796 leader = None
1797 if value[0] in CFWS_LEADER:
1798 leader, value = get_cfws(value)
1799 if not value:
1800 raise errors.HeaderParseError(
1801 "expected local-part but found '{}'".format(value))
1802 try:
1803 token, value = get_dot_atom(value)
1804 except errors.HeaderParseError:
1805 try:
1806 token, value = get_word(value)
1807 except errors.HeaderParseError:
1808 if value[0] != '\\' and value[0] in PHRASE_ENDS:
1809 raise
1810 token = TokenList()
1811 if leader is not None:
1812 token[:0] = [leader]
1813 local_part.append(token)
1814 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1815 obs_local_part, value = get_obs_local_part(str(local_part) + value)
1816 if obs_local_part.token_type == 'invalid-obs-local-part':
1817 local_part.defects.append(errors.InvalidHeaderDefect(
1818 "local-part is not dot-atom, quoted-string, or obs-local-part"))
1819 else:
1820 local_part.defects.append(errors.ObsoleteHeaderDefect(
1821 "local-part is not a dot-atom (contains CFWS)"))
1822 local_part[0] = obs_local_part
1823 try:
1824 local_part.value.encode('ascii')
1825 except UnicodeEncodeError:
1826 local_part.defects.append(errors.NonASCIILocalPartDefect(
1827 "local-part contains non-ASCII characters)"))
1828 return local_part, value
1829
1830def get_obs_local_part(value):
1831 """ obs-local-part = word *("." word)
1832 """
1833 obs_local_part = ObsLocalPart()
1834 last_non_ws_was_dot = False
1835 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1836 if value[0] == '.':
1837 if last_non_ws_was_dot:
1838 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1839 "invalid repeated '.'"))
1840 obs_local_part.append(DOT)
1841 last_non_ws_was_dot = True
1842 value = value[1:]
1843 continue
1844 elif value[0]=='\\':
1845 obs_local_part.append(ValueTerminal(value[0],
1846 'misplaced-special'))
1847 value = value[1:]
1848 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1849 "'\\' character outside of quoted-string/ccontent"))
1850 last_non_ws_was_dot = False
1851 continue
1852 if obs_local_part and obs_local_part[-1].token_type != 'dot':
1853 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1854 "missing '.' between words"))
1855 try:
1856 token, value = get_word(value)
1857 last_non_ws_was_dot = False
1858 except errors.HeaderParseError:
1859 if value[0] not in CFWS_LEADER:
1860 raise
1861 token, value = get_cfws(value)
1862 obs_local_part.append(token)
1863 if (obs_local_part[0].token_type == 'dot' or
1864 obs_local_part[0].token_type=='cfws' and
1865 obs_local_part[1].token_type=='dot'):
1866 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1867 "Invalid leading '.' in local part"))
1868 if (obs_local_part[-1].token_type == 'dot' or
1869 obs_local_part[-1].token_type=='cfws' and
1870 obs_local_part[-2].token_type=='dot'):
1871 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1872 "Invalid trailing '.' in local part"))
1873 if obs_local_part.defects:
1874 obs_local_part.token_type = 'invalid-obs-local-part'
1875 return obs_local_part, value
1876
1877def get_dtext(value):
1878 """ dtext = <printable ascii except \ [ ]> / obs-dtext
1879 obs-dtext = obs-NO-WS-CTL / quoted-pair
1880
1881 We allow anything except the excluded characters, but if we find any
1882 ASCII other than the RFC-defined printable ASCII, a NonPrintableDefect is
1883 added to the token's defects list. Quoted pairs are converted to their
1884 unquoted values, so what is returned is a ptext token, in this case a
1885 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is
1886 added to the returned token's defect list.
1887
1888 """
1889 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
1890 ptext = ValueTerminal(ptext, 'ptext')
1891 if had_qp:
1892 ptext.defects.append(errors.ObsoleteHeaderDefect(
1893 "quoted printable found in domain-literal"))
1894 _validate_xtext(ptext)
1895 return ptext, value
1896
1897def _check_for_early_dl_end(value, domain_literal):
1898 if value:
1899 return False
1900 domain_literal.append(errors.InvalidHeaderDefect(
1901 "end of input inside domain-literal"))
1902 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1903 return True
1904
1905def get_domain_literal(value):
1906 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
1907
1908 """
1909 domain_literal = DomainLiteral()
1910 if value[0] in CFWS_LEADER:
1911 token, value = get_cfws(value)
1912 domain_literal.append(token)
1913 if not value:
1914 raise errors.HeaderParseError("expected domain-literal")
1915 if value[0] != '[':
1916 raise errors.HeaderParseError("expected '[' at start of domain-literal "
1917 "but found '{}'".format(value))
1918 value = value[1:]
1919 if _check_for_early_dl_end(value, domain_literal):
1920 return domain_literal, value
1921 domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
1922 if value[0] in WSP:
1923 token, value = get_fws(value)
1924 domain_literal.append(token)
1925 token, value = get_dtext(value)
1926 domain_literal.append(token)
1927 if _check_for_early_dl_end(value, domain_literal):
1928 return domain_literal, value
1929 if value[0] in WSP:
1930 token, value = get_fws(value)
1931 domain_literal.append(token)
1932 if _check_for_early_dl_end(value, domain_literal):
1933 return domain_literal, value
1934 if value[0] != ']':
1935 raise errors.HeaderParseError("expected ']' at end of domain-literal "
1936 "but found '{}'".format(value))
1937 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1938 value = value[1:]
1939 if value and value[0] in CFWS_LEADER:
1940 token, value = get_cfws(value)
1941 domain_literal.append(token)
1942 return domain_literal, value
1943
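# Editorial sketch (illustration only): parsing a simple domain-literal.
# The address-literal text is an assumed example.
def _example_get_domain_literal():
    token, rest = get_domain_literal('[127.0.0.1]')
    # token is a DomainLiteral whose dtext run is '127.0.0.1'; rest is empty.
    return token, rest
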
1944def get_domain(value):
1945 """ domain = dot-atom / domain-literal / obs-domain
1946 obs-domain = atom *("." atom)
1947
1948 """
1949 domain = Domain()
1950 leader = None
1951 if value[0] in CFWS_LEADER:
1952 leader, value = get_cfws(value)
1953 if not value:
1954 raise errors.HeaderParseError(
1955 "expected domain but found '{}'".format(value))
1956 if value[0] == '[':
1957 token, value = get_domain_literal(value)
1958 if leader is not None:
1959 token[:0] = [leader]
1960 domain.append(token)
1961 return domain, value
1962 try:
1963 token, value = get_dot_atom(value)
1964 except errors.HeaderParseError:
1965 token, value = get_atom(value)
1966 if leader is not None:
1967 token[:0] = [leader]
1968 domain.append(token)
1969 if value and value[0] == '.':
1970 domain.defects.append(errors.ObsoleteHeaderDefect(
1971 "domain is not a dot-atom (contains CFWS)"))
1972 if domain[0].token_type == 'dot-atom':
1973 domain[:] = domain[0]
1974 while value and value[0] == '.':
1975 domain.append(DOT)
1976 token, value = get_atom(value[1:])
1977 domain.append(token)
1978 return domain, value
1979
1980def get_addr_spec(value):
1981 """ addr-spec = local-part "@" domain
1982
1983 """
1984 addr_spec = AddrSpec()
1985 token, value = get_local_part(value)
1986 addr_spec.append(token)
1987 if not value or value[0] != '@':
1988 addr_spec.defects.append(errors.InvalidHeaderDefect(
1989 "add-spec local part with no domain"))
1990 return addr_spec, value
1991 addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
1992 token, value = get_domain(value[1:])
1993 addr_spec.append(token)
1994 return addr_spec, value
1995
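# Editorial sketch (illustration only): a well-formed addr-spec parses into
# local-part, '@', and domain sub-tokens with no defects.  The address used
# here is an assumed example.
def _example_get_addr_spec():
    addr_spec, rest = get_addr_spec('dinsdale@example.com')
    return addr_spec, rest
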
1996def get_obs_route(value):
1997 """ obs-route = obs-domain-list ":"
1998 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
1999
2000 Returns an obs-route token with the appropriate sub-tokens (that is,
2001 there is no obs-domain-list in the parse tree).
2002 """
2003 obs_route = ObsRoute()
2004 while value and (value[0]==',' or value[0] in CFWS_LEADER):
2005 if value[0] in CFWS_LEADER:
2006 token, value = get_cfws(value)
2007 obs_route.append(token)
2008 elif value[0] == ',':
2009 obs_route.append(ListSeparator)
2010 value = value[1:]
2011 if not value or value[0] != '@':
2012 raise errors.HeaderParseError(
2013 "expected obs-route domain but found '{}'".format(value))
2014 obs_route.append(RouteComponentMarker)
2015 token, value = get_domain(value[1:])
2016 obs_route.append(token)
2017 while value and value[0]==',':
2018 obs_route.append(ListSeparator)
2019 value = value[1:]
2020 if not value:
2021 break
2022 if value[0] in CFWS_LEADER:
2023 token, value = get_cfws(value)
2024 obs_route.append(token)
2025 if value[0] == '@':
2026 obs_route.append(RouteComponentMarker)
2027 token, value = get_domain(value[1:])
2028 obs_route.append(token)
2029 if not value:
2030 raise errors.HeaderParseError("end of header while parsing obs-route")
2031 if value[0] != ':':
2032 raise errors.HeaderParseError("expected ':' marking end of "
2033 "obs-route but found '{}'".format(value))
2034 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
2035 return obs_route, value[1:]
2036
2037def get_angle_addr(value):
2038 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
2039 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
2040
2041 """
2042 angle_addr = AngleAddr()
2043 if value[0] in CFWS_LEADER:
2044 token, value = get_cfws(value)
2045 angle_addr.append(token)
2046 if not value or value[0] != '<':
2047 raise errors.HeaderParseError(
2048 "expected angle-addr but found '{}'".format(value))
2049 angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
2050 value = value[1:]
2051 # Although it is not legal per RFC5322, SMTP uses '<>' in certain
2052 # circumstances.
2053 if value[0] == '>':
2054 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2055 angle_addr.defects.append(errors.InvalidHeaderDefect(
2056 "null addr-spec in angle-addr"))
2057 value = value[1:]
2058 return angle_addr, value
2059 try:
2060 token, value = get_addr_spec(value)
2061 except errors.HeaderParseError:
2062 try:
2063 token, value = get_obs_route(value)
2064 angle_addr.defects.append(errors.ObsoleteHeaderDefect(
2065 "obsolete route specification in angle-addr"))
2066 except errors.HeaderParseError:
2067 raise errors.HeaderParseError(
2068 "expected addr-spec or obs-route but found '{}'".format(value))
2069 angle_addr.append(token)
2070 token, value = get_addr_spec(value)
2071 angle_addr.append(token)
2072 if value and value[0] == '>':
2073 value = value[1:]
2074 else:
2075 angle_addr.defects.append(errors.InvalidHeaderDefect(
2076 "missing trailing '>' on angle-addr"))
2077 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2078 if value and value[0] in CFWS_LEADER:
2079 token, value = get_cfws(value)
2080 angle_addr.append(token)
2081 return angle_addr, value
2082
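# Editorial sketch (illustration only): the SMTP-style null address '<>' is
# tolerated but flagged, per the comment inside get_angle_addr above.
def _example_get_angle_addr_null():
    token, rest = get_angle_addr('<>')
    # token.defects should contain the "null addr-spec in angle-addr"
    # InvalidHeaderDefect; rest is empty.
    return token, rest
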
2083def get_display_name(value):
2084 """ display-name = phrase
2085
2086 Because this is simply a name-rule, we don't return a display-name
2087 token containing a phrase, but rather a display-name token with
2088 the content of the phrase.
2089
2090 """
2091 display_name = DisplayName()
2092 token, value = get_phrase(value)
2093 display_name.extend(token[:])
2094 display_name.defects = token.defects[:]
2095 return display_name, value
2096
2097
2098def get_name_addr(value):
2099 """ name-addr = [display-name] angle-addr
2100
2101 """
2102 name_addr = NameAddr()
2103 # Both the optional display name and the angle-addr can start with cfws.
2104 leader = None
2105 if value[0] in CFWS_LEADER:
2106 leader, value = get_cfws(value)
2107 if not value:
2108 raise errors.HeaderParseError(
2109 "expected name-addr but found '{}'".format(leader))
2110 if value[0] != '<':
2111 if value[0] in PHRASE_ENDS:
2112 raise errors.HeaderParseError(
2113 "expected name-addr but found '{}'".format(value))
2114 token, value = get_display_name(value)
2115 if not value:
2116 raise errors.HeaderParseError(
2117 "expected name-addr but found '{}'".format(token))
2118 if leader is not None:
2119 token[0][:0] = [leader]
2120 leader = None
2121 name_addr.append(token)
2122 token, value = get_angle_addr(value)
2123 if leader is not None:
2124 token[:0] = [leader]
2125 name_addr.append(token)
2126 return name_addr, value
2127
2128def get_mailbox(value):
2129 """ mailbox = name-addr / addr-spec
2130
2131 """
2132 # The only way to figure out if we are dealing with a name-addr or an
2133 # addr-spec is to try parsing each one.
2134 mailbox = Mailbox()
2135 try:
2136 token, value = get_name_addr(value)
2137 except errors.HeaderParseError:
2138 try:
2139 token, value = get_addr_spec(value)
2140 except errors.HeaderParseError:
2141 raise errors.HeaderParseError(
2142 "expected mailbox but found '{}'".format(value))
2143 if any(isinstance(x, errors.InvalidHeaderDefect)
2144 for x in token.all_defects):
2145 mailbox.token_type = 'invalid-mailbox'
2146 mailbox.append(token)
2147 return mailbox, value
2148
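# Editorial sketch (illustration only): get_mailbox tries the name-addr form
# first and falls back to a bare addr-spec.  Both inputs are assumed examples.
def _example_get_mailbox():
    with_display_name, _ = get_mailbox('Fred Bloggs <fred@example.com>')
    bare_addr_spec, _ = get_mailbox('fred@example.com')
    return with_display_name, bare_addr_spec
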
2149def get_invalid_mailbox(value, endchars):
2150 """ Read everything up to one of the chars in endchars.
2151
2152 This is outside the formal grammar. The InvalidMailbox TokenList that is
2153 returned acts like a Mailbox, but the data attributes are None.
2154
2155 """
2156 invalid_mailbox = InvalidMailbox()
2157 while value and value[0] not in endchars:
2158 if value[0] in PHRASE_ENDS:
2159 invalid_mailbox.append(ValueTerminal(value[0],
2160 'misplaced-special'))
2161 value = value[1:]
2162 else:
2163 token, value = get_phrase(value)
2164 invalid_mailbox.append(token)
2165 return invalid_mailbox, value
2166
2167def get_mailbox_list(value):
2168 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
2169 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
2170
2171 For this routine we go outside the formal grammar in order to improve error
2172 handling. We recognize the end of the mailbox list only at the end of the
2173 value or at a ';' (the group terminator). This is so that we can turn
2174 invalid mailboxes into InvalidMailbox tokens and continue parsing any
2175 remaining valid mailboxes. We also allow all mailbox entries to be null,
2176 and this condition is handled appropriately at a higher level.
2177
2178 """
2179 mailbox_list = MailboxList()
2180 while value and value[0] != ';':
2181 try:
2182 token, value = get_mailbox(value)
2183 mailbox_list.append(token)
2184 except errors.HeaderParseError:
2185 leader = None
2186 if value[0] in CFWS_LEADER:
2187 leader, value = get_cfws(value)
2188 if not value or value[0] in ',;':
2189 mailbox_list.append(leader)
2190 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2191 "empty element in mailbox-list"))
2192 else:
2193 token, value = get_invalid_mailbox(value, ',;')
2194 if leader is not None:
2195 token[:0] = [leader]
2196 mailbox_list.append(token)
2197 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2198 "invalid mailbox in mailbox-list"))
2199 elif value[0] == ',':
2200 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2201 "empty element in mailbox-list"))
2202 else:
2203 token, value = get_invalid_mailbox(value, ',;')
2204 if leader is not None:
2205 token[:0] = [leader]
2206 mailbox_list.append(token)
2207 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2208 "invalid mailbox in mailbox-list"))
2209 if value and value[0] not in ',;':
2210 # Crap after mailbox; treat it as an invalid mailbox.
2211 # The mailbox info will still be available.
2212 mailbox = mailbox_list[-1]
2213 mailbox.token_type = 'invalid-mailbox'
2214 token, value = get_invalid_mailbox(value, ',;')
2215 mailbox.extend(token)
2216 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2217 "invalid mailbox in mailbox-list"))
2218 if value and value[0] == ',':
2219 mailbox_list.append(ListSeparator)
2220 value = value[1:]
2221 return mailbox_list, value
2222
2223
2224def get_group_list(value):
2225 """ group-list = mailbox-list / CFWS / obs-group-list
2226 obs-group-list = 1*([CFWS] ",") [CFWS]
2227
2228 """
2229 group_list = GroupList()
2230 if not value:
2231 group_list.defects.append(errors.InvalidHeaderDefect(
2232 "end of header before group-list"))
2233 return group_list, value
2234 leader = None
2235 if value and value[0] in CFWS_LEADER:
2236 leader, value = get_cfws(value)
2237 if not value:
2238 # This should never happen in email parsing, since CFWS-only is a
2239 # legal alternative to group-list in a group, which is the only
2240 # place group-list appears.
2241 group_list.defects.append(errors.InvalidHeaderDefect(
2242 "end of header in group-list"))
2243 group_list.append(leader)
2244 return group_list, value
2245 if value[0] == ';':
2246 group_list.append(leader)
2247 return group_list, value
2248 token, value = get_mailbox_list(value)
2249 if len(token.all_mailboxes)==0:
2250 if leader is not None:
2251 group_list.append(leader)
2252 group_list.extend(token)
2253 group_list.defects.append(errors.ObsoleteHeaderDefect(
2254 "group-list with empty entries"))
2255 return group_list, value
2256 if leader is not None:
2257 token[:0] = [leader]
2258 group_list.append(token)
2259 return group_list, value
2260
2261def get_group(value):
2262 """ group = display-name ":" [group-list] ";" [CFWS]
2263
2264 """
2265 group = Group()
2266 token, value = get_display_name(value)
2267 if not value or value[0] != ':':
2268 raise errors.HeaderParseError("expected ':' at end of group "
2269 "display name but found '{}'".format(value))
2270 group.append(token)
2271 group.append(ValueTerminal(':', 'group-display-name-terminator'))
2272 value = value[1:]
2273 if value and value[0] == ';':
2274 group.append(ValueTerminal(';', 'group-terminator'))
2275 return group, value[1:]
2276 token, value = get_group_list(value)
2277 group.append(token)
2278 if not value:
2279 group.defects.append(errors.InvalidHeaderDefect(
2280 "end of header in group"))
2281 if value[0] != ';':
2282 raise errors.HeaderParseError(
2283 "expected ';' at end of group but found {}".format(value))
2284 group.append(ValueTerminal(';', 'group-terminator'))
2285 value = value[1:]
2286 if value and value[0] in CFWS_LEADER:
2287 token, value = get_cfws(value)
2288 group.append(token)
2289 return group, value
2290
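# Editorial sketch (illustration only): a group is a display-name, a ':', an
# optional mailbox list, and a terminating ';'.  The header text is assumed.
def _example_get_group():
    group, rest = get_group('Monty Python: eric@example.com, john@example.com;')
    return group, rest
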
2291def get_address(value):
2292 """ address = mailbox / group
2293
2294 Note that counter-intuitively, an address can be either a single address or
2295 a list of addresses (a group). This is why the returned Address object has
2296 a 'mailboxes' attribute which treats a single address as a list of length
2297 one. When you need to differentiate between the two cases, extract the single
2298 element, which is either a mailbox or a group token.
2299
2300 """
2301 # The formal grammar isn't very helpful when parsing an address. mailbox
2302 # and group, especially when allowing for obsolete forms, start off very
2303 # similarly. It is only when you reach one of @, <, or : that you know
2304 # what you've got. So, we try each one in turn, starting with the more
2305 # likely of the two. We could perhaps make this more efficient by looking
2306 # for a phrase and then branching based on the next character, but that
2307 # would be a premature optimization.
2308 address = Address()
2309 try:
2310 token, value = get_group(value)
2311 except errors.HeaderParseError:
2312 try:
2313 token, value = get_mailbox(value)
2314 except errors.HeaderParseError:
2315 raise errors.HeaderParseError(
2316 "expected address but found '{}'".format(value))
2317 address.append(token)
2318 return address, value
2319
2320def get_address_list(value):
2321 """ address_list = (address *("," address)) / obs-addr-list
2322 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
2323
2324 We depart from the formal grammar here by continuing to parse until the end
2325 of the input, assuming the input to be entirely composed of an
2326 address-list. This is always true in email parsing, and allows us
2327 to skip invalid addresses to parse additional valid ones.
2328
2329 """
2330 address_list = AddressList()
2331 while value:
2332 try:
2333 token, value = get_address(value)
2334 address_list.append(token)
2335 except errors.HeaderParseError as err:
2336 leader = None
2337 if value[0] in CFWS_LEADER:
2338 leader, value = get_cfws(value)
2339 if not value or value[0] == ',':
2340 address_list.append(leader)
2341 address_list.defects.append(errors.ObsoleteHeaderDefect(
2342 "address-list entry with no content"))
2343 else:
2344 token, value = get_invalid_mailbox(value, ',')
2345 if leader is not None:
2346 token[:0] = [leader]
2347 address_list.append(Address([token]))
2348 address_list.defects.append(errors.InvalidHeaderDefect(
2349 "invalid address in address-list"))
2350 elif value[0] == ',':
2351 address_list.defects.append(errors.ObsoleteHeaderDefect(
2352 "empty element in address-list"))
2353 else:
2354 token, value = get_invalid_mailbox(value, ',')
2355 if leader is not None:
2356 token[:0] = [leader]
2357 address_list.append(Address([token]))
2358 address_list.defects.append(errors.InvalidHeaderDefect(
2359 "invalid address in address-list"))
2360 if value and value[0] != ',':
2361 # Crap after address; treat it as an invalid mailbox.
2362 # The mailbox info will still be available.
2363 mailbox = address_list[-1][0]
2364 mailbox.token_type = 'invalid-mailbox'
2365 token, value = get_invalid_mailbox(value, ',')
2366 mailbox.extend(token)
2367 address_list.defects.append(errors.InvalidHeaderDefect(
2368 "invalid address in address-list"))
2369 if value: # Must be a , at this point.
2370 address_list.append(ValueTerminal(',', 'list-separator'))
2371 value = value[1:]
2372 return address_list, value
2373
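# Editorial sketch (illustration only): an address-list mixing a name-addr
# and a bare addr-spec.  The addresses are assumed examples; real callers
# reach this through the structured header classes rather than directly.
def _example_get_address_list():
    address_list, rest = get_address_list(
        'Alice Example <alice@example.com>, bob@example.com')
    # The list should contain two Address tokens with a list-separator
    # terminal between them; rest is empty.
    return address_list, rest
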
2374#
2375# XXX: As I begin to add additional header parsers, I'm realizing we probably
2376# have two level of parser routines: the get_XXX methods that get a token in
2377# the grammar, and parse_XXX methods that parse an entire field value. So
2378# get_address_list above should really be a parse_ method, as probably should
2379# be get_unstructured.
2380#
2381
2382def parse_mime_version(value):
2383 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
2384
2385 """
2386 # The [CFWS] is implicit in the RFC 2045 BNF.
2387 # XXX: This routine is a bit verbose, should factor out a get_int method.
2388 mime_version = MIMEVersion()
2389 if not value:
2390 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2391 "Missing MIME version number (eg: 1.0)"))
2392 return mime_version
2393 if value[0] in CFWS_LEADER:
2394 token, value = get_cfws(value)
2395 mime_version.append(token)
2396 if not value:
2397 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2398 "Expected MIME version number but found only CFWS"))
2399 digits = ''
2400 while value and value[0] != '.' and value[0] not in CFWS_LEADER:
2401 digits += value[0]
2402 value = value[1:]
2403 if not digits.isdigit():
2404 mime_version.defects.append(errors.InvalidHeaderDefect(
2405 "Expected MIME major version number but found {!r}".format(digits)))
2406 mime_version.append(ValueTerminal(digits, 'xtext'))
2407 else:
2408 mime_version.major = int(digits)
2409 mime_version.append(ValueTerminal(digits, 'digits'))
2410 if value and value[0] in CFWS_LEADER:
2411 token, value = get_cfws(value)
2412 mime_version.append(token)
2413 if not value or value[0] != '.':
2414 if mime_version.major is not None:
2415 mime_version.defects.append(errors.InvalidHeaderDefect(
2416 "Incomplete MIME version; found only major number"))
2417 if value:
2418 mime_version.append(ValueTerminal(value, 'xtext'))
2419 return mime_version
2420 mime_version.append(ValueTerminal('.', 'version-separator'))
2421 value = value[1:]
2422 if value and value[0] in CFWS_LEADER:
2423 token, value = get_cfws(value)
2424 mime_version.append(token)
2425 if not value:
2426 if mime_version.major is not None:
2427 mime_version.defects.append(errors.InvalidHeaderDefect(
2428 "Incomplete MIME version; found only major number"))
2429 return mime_version
2430 digits = ''
2431 while value and value[0] not in CFWS_LEADER:
2432 digits += value[0]
2433 value = value[1:]
2434 if not digits.isdigit():
2435 mime_version.defects.append(errors.InvalidHeaderDefect(
2436 "Expected MIME minor version number but found {!r}".format(digits)))
2437 mime_version.append(ValueTerminal(digits, 'xtext'))
2438 else:
2439 mime_version.minor = int(digits)
2440 mime_version.append(ValueTerminal(digits, 'digits'))
2441 if value and value[0] in CFWS_LEADER:
2442 token, value = get_cfws(value)
2443 mime_version.append(token)
2444 if value:
2445 mime_version.defects.append(errors.InvalidHeaderDefect(
2446 "Excess non-CFWS text after MIME version"))
2447 mime_version.append(ValueTerminal(value, 'xtext'))
2448 return mime_version
2449
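# Editorial sketch (illustration only): parsing a normal MIME-Version value.
def _example_parse_mime_version():
    mime_version = parse_mime_version('1.0')
    # mime_version.major == 1 and mime_version.minor == 0, with no defects.
    return mime_version
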
2450def get_invalid_parameter(value):
2451 """ Read everything up to the next ';'.
2452
2453 This is outside the formal grammar. The InvalidParameter TokenList that is
2454 returned acts like a Parameter, but the data attributes are None.
2455
2456 """
2457 invalid_parameter = InvalidParameter()
2458 while value and value[0] != ';':
2459 if value[0] in PHRASE_ENDS:
2460 invalid_parameter.append(ValueTerminal(value[0],
2461 'misplaced-special'))
2462 value = value[1:]
2463 else:
2464 token, value = get_phrase(value)
2465 invalid_parameter.append(token)
2466 return invalid_parameter, value
2467
2468def get_ttext(value):
2469 """ttext = <matches _ttext_matcher>
2470
2471 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
2472 defects list if we find non-ttext characters. We also register defects for
2473 *any* non-printables even though the RFC doesn't exclude all of them,
2474 because we follow the spirit of RFC 5322.
2475
2476 """
2477 m = _non_token_end_matcher(value)
2478 if not m:
2479 raise errors.HeaderParseError(
2480 "expected ttext but found '{}'".format(value))
2481 ttext = m.group()
2482 value = value[len(ttext):]
2483 ttext = ValueTerminal(ttext, 'ttext')
2484 _validate_xtext(ttext)
2485 return ttext, value
2486
2487def get_token(value):
2488 """token = [CFWS] 1*ttext [CFWS]
2489
2490 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
2491 tspecials. We also exclude tabs even though the RFC doesn't.
2492
2493 The RFC implies the CFWS but is not explicit about it in the BNF.
2494
2495 """
2496 mtoken = Token()
2497 if value and value[0] in CFWS_LEADER:
2498 token, value = get_cfws(value)
2499 mtoken.append(token)
2500 if value and value[0] in TOKEN_ENDS:
2501 raise errors.HeaderParseError(
2502 "expected token but found '{}'".format(value))
2503 token, value = get_ttext(value)
2504 mtoken.append(token)
2505 if value and value[0] in CFWS_LEADER:
2506 token, value = get_cfws(value)
2507 mtoken.append(token)
2508 return mtoken, value
2509
2510def get_attrtext(value):
2511 """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
2512
2513 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
2514 token's defects list if we find non-attrtext characters. We also register
2515 defects for *any* non-printables even though the RFC doesn't exclude all of
2516 them, because we follow the spirit of RFC 5322.
2517
2518 """
2519 m = _non_attribute_end_matcher(value)
2520 if not m:
2521 raise errors.HeaderParseError(
2522 "expected attrtext but found {!r}".format(value))
2523 attrtext = m.group()
2524 value = value[len(attrtext):]
2525 attrtext = ValueTerminal(attrtext, 'attrtext')
2526 _validate_xtext(attrtext)
2527 return attrtext, value
2528
2529def get_attribute(value):
2530 """ [CFWS] 1*attrtext [CFWS]
2531
2532 This version of the BNF makes the CFWS explicit, and as usual we use a
2533 value terminal for the actual run of characters. The RFC equivalent of
2534 attrtext is the token characters, with the subtraction of '*', "'", and '%'.
2535 We include tab in the excluded set just as we do for token.
2536
2537 """
2538 attribute = Attribute()
2539 if value and value[0] in CFWS_LEADER:
2540 token, value = get_cfws(value)
2541 attribute.append(token)
2542 if value and value[0] in ATTRIBUTE_ENDS:
2543 raise errors.HeaderParseError(
2544 "expected token but found '{}'".format(value))
2545 token, value = get_attrtext(value)
2546 attribute.append(token)
2547 if value and value[0] in CFWS_LEADER:
2548 token, value = get_cfws(value)
2549 attribute.append(token)
2550 return attribute, value
2551
2552def get_extended_attrtext(value):
2553 """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
2554
2555 This is a special parsing routine so that we get a value that
2556 includes % escapes as a single string (which we decode as a single
2557 string later).
2558
2559 """
2560 m = _non_extended_attribute_end_matcher(value)
2561 if not m:
2562 raise errors.HeaderParseError(
2563 "expected extended attrtext but found {!r}".format(value))
2564 attrtext = m.group()
2565 value = value[len(attrtext):]
2566 attrtext = ValueTerminal(attrtext, 'extended-attrtext')
2567 _validate_xtext(attrtext)
2568 return attrtext, value
2569
2570def get_extended_attribute(value):
2571 """ [CFWS] 1*extended_attrtext [CFWS]
2572
2573 This is like the non-extended version except we allow % characters, so that
2574 we can pick up an encoded value as a single string.
2575
2576 """
2577 # XXX: should we have an ExtendedAttribute TokenList?
2578 attribute = Attribute()
2579 if value and value[0] in CFWS_LEADER:
2580 token, value = get_cfws(value)
2581 attribute.append(token)
2582 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
2583 raise errors.HeaderParseError(
2584 "expected token but found '{}'".format(value))
2585 token, value = get_extended_attrtext(value)
2586 attribute.append(token)
2587 if value and value[0] in CFWS_LEADER:
2588 token, value = get_cfws(value)
2589 attribute.append(token)
2590 return attribute, value
2591
2592def get_section(value):
2593 """ '*' digits
2594
2595 The formal BNF is more complicated because leading 0s are not allowed. We
2596 check for that and add a defect. We also assume no CFWS is allowed between
2597 the '*' and the digits, though the RFC is not crystal clear on that.
2598 The caller should already have dealt with leading CFWS.
2599
2600 """
2601 section = Section()
2602 if not value or value[0] != '*':
2603 raise errors.HeaderParseError("Expected section but found {}".format(
2604 value))
2605 section.append(ValueTerminal('*', 'section-marker'))
2606 value = value[1:]
2607 if not value or not value[0].isdigit():
2608 raise errors.HeaderParseError("Expected section number but "
2609 "found {}".format(value))
2610 digits = ''
2611 while value and value[0].isdigit():
2612 digits += value[0]
2613 value = value[1:]
2614 if digits[0] == '0' and digits != '0':
2615 section.defects.append(errors.InvalidHeaderDefect("section number "
2616 "has an invalid leading 0"))
2617 section.number = int(digits)
2618 section.append(ValueTerminal(digits, 'digits'))
2619 return section, value
2620
2621
2622def get_value(value):
2623 """ quoted-string / attribute
2624
2625 """
2626 v = Value()
2627 if not value:
2628 raise errors.HeaderParseError("Expected value but found end of string")
2629 leader = None
2630 if value[0] in CFWS_LEADER:
2631 leader, value = get_cfws(value)
2632 if not value:
2633 raise errors.HeaderParseError("Expected value but found "
2634 "only {}".format(leader))
2635 if value[0] == '"':
2636 token, value = get_quoted_string(value)
2637 else:
2638 token, value = get_extended_attribute(value)
2639 if leader is not None:
2640 token[:0] = [leader]
2641 v.append(token)
2642 return v, value
2643
2644def get_parameter(value):
2645 """ attribute [section] ["*"] [CFWS] "=" value
2646
2647 The CFWS is implied by the RFC but not made explicit in the BNF. This
2648 simplified form of the BNF from the RFC is made to conform with the RFC BNF
2649 through some extra checks. We do it this way because it makes both error
2650 recovery and working with the resulting parse tree easier.
2651 """
2652 # It is possible CFWS would also be implicitly allowed between the section
2653 # and the 'extended-attribute' marker (the '*') , but we've never seen that
2654 # in the wild and we will therefore ignore the possibility.
2655 param = Parameter()
2656 token, value = get_attribute(value)
2657 param.append(token)
2658 if not value or value[0] == ';':
2659 param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2660 "name ({}) but no value".format(token)))
2661 return param, value
2662 if value[0] == '*':
2663 try:
2664 token, value = get_section(value)
2665 param.sectioned = True
2666 param.append(token)
2667 except errors.HeaderParseError:
2668 pass
2669 if not value:
2670 raise errors.HeaderParseError("Incomplete parameter")
2671 if value[0] == '*':
2672 param.append(ValueTerminal('*', 'extended-parameter-marker'))
2673 value = value[1:]
2674 param.extended = True
2675 if value[0] != '=':
2676 raise errors.HeaderParseError("Parameter not followed by '='")
2677 param.append(ValueTerminal('=', 'parameter-separator'))
2678 value = value[1:]
2679 leader = None
2680 if value and value[0] in CFWS_LEADER:
2681 token, value = get_cfws(value)
2682 param.append(token)
2683 remainder = None
2684 appendto = param
2685 if param.extended and value and value[0] == '"':
2686 # Now for some serious hackery to handle the common invalid case of
2687 # double quotes around an extended value. We also accept (with defect)
2688 # a value marked as encoded that isn't really.
2689 qstring, remainder = get_quoted_string(value)
2690 inner_value = qstring.stripped_value
2691 semi_valid = False
2692 if param.section_number == 0:
2693 if inner_value and inner_value[0] == "'":
2694 semi_valid = True
2695 else:
2696 token, rest = get_attrtext(inner_value)
2697 if rest and rest[0] == "'":
2698 semi_valid = True
2699 else:
2700 try:
2701 token, rest = get_extended_attrtext(inner_value)
2702 except errors.HeaderParseError:
2703 pass
2704 else:
2705 if not rest:
2706 semi_valid = True
2707 if semi_valid:
2708 param.defects.append(errors.InvalidHeaderDefect(
2709 "Quoted string value for extended parameter is invalid"))
2710 param.append(qstring)
2711 for t in qstring:
2712 if t.token_type == 'bare-quoted-string':
2713 t[:] = []
2714 appendto = t
2715 break
2716 value = inner_value
2717 else:
2718 remainder = None
2719 param.defects.append(errors.InvalidHeaderDefect(
2720 "Parameter marked as extended but appears to have a "
2721 "quoted string value that is non-encoded"))
2722 if value and value[0] == "'":
2723 token = None
2724 else:
2725 token, value = get_value(value)
2726 if not param.extended or param.section_number > 0:
2727 if not value or value[0] != "'":
2728 appendto.append(token)
2729 if remainder is not None:
2730 assert not value, value
2731 value = remainder
2732 return param, value
2733 param.defects.append(errors.InvalidHeaderDefect(
2734 "Apparent initial-extended-value but attribute "
2735 "was not marked as extended or was not initial section"))
2736 if not value:
2737 # Assume the charset/lang is missing and the token is the value.
2738 param.defects.append(errors.InvalidHeaderDefect(
2739 "Missing required charset/lang delimiters"))
2740 appendto.append(token)
2741 if remainder is None:
2742 return param, value
2743 else:
2744 if token is not None:
2745 for t in token:
2746 if t.token_type == 'extended-attrtext':
2747 break
2748 t.token_type = 'attrtext'
2749 appendto.append(t)
2750 param.charset = t.value
2751 if value[0] != "'":
2752 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2753 "delimiter, but found {!r}".format(value))
2754 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2755 value = value[1:]
2756 if value and value[0] != "'":
2757 token, value = get_attrtext(value)
2758 appendto.append(token)
2759 param.lang = token.value
2760 if not value or value[0] != "'":
2761 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2762 "delimiter, but found {}".format(value))
2763 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2764 value = value[1:]
2765 if remainder is not None:
2766 # Treat the rest of value as bare quoted string content.
2767 v = Value()
2768 while value:
2769 if value[0] in WSP:
2770 token, value = get_fws(value)
2771 else:
2772 token, value = get_qcontent(value)
2773 v.append(token)
2774 token = v
2775 else:
2776 token, value = get_value(value)
2777 appendto.append(token)
2778 if remainder is not None:
2779 assert not value, value
2780 value = remainder
2781 return param, value
2782
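# Editorial sketch (illustration only): a simple, non-extended parameter.
# The parameter text is an assumed example.
def _example_get_parameter():
    param, rest = get_parameter('charset=utf-8; format=flowed')
    # param covers 'charset=utf-8'; rest begins with the ';' so that
    # parse_mime_parameters can continue with the next parameter.
    return param, rest
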
2783def parse_mime_parameters(value):
2784 """ parameter *( ";" parameter )
2785
2786 That BNF is meant to indicate this routine should only be called after
2787 finding and handling the leading ';'. There is no corresponding rule in
2788 the formal RFC grammar, but it is more convenient for us for the set of
2789 parameters to be treated as its own TokenList.
2790
2791 This is a 'parse' routine because it consumes the remaining value, but it
2792 would never be called to parse a full header. Instead it is called to
2793 parse everything after the non-parameter value of a specific MIME header.
2794
2795 """
2796 mime_parameters = MimeParameters()
2797 while value:
2798 try:
2799 token, value = get_parameter(value)
2800 mime_parameters.append(token)
2801 except errors.HeaderParseError as err:
2802 leader = None
2803 if value[0] in CFWS_LEADER:
2804 leader, value = get_cfws(value)
2805 if not value:
2806 mime_parameters.append(leader)
2807 return mime_parameters
2808 if value[0] == ';':
2809 if leader is not None:
2810 mime_parameters.append(leader)
2811 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2812 "parameter entry with no content"))
2813 else:
2814 token, value = get_invalid_parameter(value)
2815 if leader:
2816 token[:0] = [leader]
2817 mime_parameters.append(token)
2818 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2819 "invalid parameter {!r}".format(token)))
2820 if value and value[0] != ';':
2821 # Junk after the otherwise valid parameter. Mark it as
2822 # invalid, but it will have a value.
2823 param = mime_parameters[-1]
2824 param.token_type = 'invalid-parameter'
2825 token, value = get_invalid_parameter(value)
2826 param.extend(token)
2827 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2828 "parameter with invalid trailing text {!r}".format(token)))
2829 if value:
2830 # Must be a ';' at this point.
2831 mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2832 value = value[1:]
2833 return mime_parameters
2834
2835def _find_mime_parameters(tokenlist, value):
2836 """Do our best to find the parameters in an invalid MIME header
2837
2838 """
2839 while value and value[0] != ';':
2840 if value[0] in PHRASE_ENDS:
2841 tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2842 value = value[1:]
2843 else:
2844 token, value = get_phrase(value)
2845 tokenlist.append(token)
2846 if not value:
2847 return
2848 tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2849 tokenlist.append(parse_mime_parameters(value[1:]))
2850
2851def parse_content_type_header(value):
2852 """ maintype "/" subtype *( ";" parameter )
2853
2854 The maintype and subtype are tokens. Theoretically they could
2855 be checked against the official IANA list + x-token, but we
2856 don't do that.
2857 """
2858 ctype = ContentType()
2859 recover = False
2860 if not value:
2861 ctype.defects.append(errors.HeaderMissingRequiredValue(
2862 "Missing content type specification"))
2863 return ctype
2864 try:
2865 token, value = get_token(value)
2866 except errors.HeaderParseError:
2867 ctype.defects.append(errors.InvalidHeaderDefect(
2868 "Expected content maintype but found {!r}".format(value)))
2869 _find_mime_parameters(ctype, value)
2870 return ctype
2871 ctype.append(token)
2872 # XXX: If we really want to follow the formal grammar we should make
2873 # maintype and subtype specialized TokenLists here. Probably not worth it.
2874 if not value or value[0] != '/':
2875 ctype.defects.append(errors.InvalidHeaderDefect(
2876 "Invalid content type"))
2877 if value:
2878 _find_mime_parameters(ctype, value)
2879 return ctype
2880 ctype.maintype = token.value.strip().lower()
2881 ctype.append(ValueTerminal('/', 'content-type-separator'))
2882 value = value[1:]
2883 try:
2884 token, value = get_token(value)
2885 except errors.HeaderParseError:
2886 ctype.defects.append(errors.InvalidHeaderDefect(
2887 "Expected content subtype but found {!r}".format(value)))
2888 _find_mime_parameters(ctype, value)
2889 return ctype
2890 ctype.append(token)
2891 ctype.subtype = token.value.strip().lower()
2892 if not value:
2893 return ctype
2894 if value[0] != ';':
2895 ctype.defects.append(errors.InvalidHeaderDefect(
2896 "Only parameters are valid after content type, but "
2897 "found {!r}".format(value)))
2898 # The RFC requires that a syntactically invalid content-type be treated
2899 # as text/plain. Perhaps we should postel this, but we should probably
2900 # only do that if we were checking the subtype value against IANA.
2901 del ctype.maintype, ctype.subtype
2902 _find_mime_parameters(ctype, value)
2903 return ctype
2904 ctype.append(ValueTerminal(';', 'parameter-separator'))
2905 ctype.append(parse_mime_parameters(value[1:]))
2906 return ctype
2907
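# Editorial sketch (illustration only): a routine Content-Type value with one
# parameter.  The header text is an assumed example.
def _example_parse_content_type_header():
    ctype = parse_content_type_header('text/plain; charset="utf-8"')
    # ctype.maintype == 'text', ctype.subtype == 'plain', and the parameters
    # are available through the appended MimeParameters sub-token.
    return ctype
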
2908def parse_content_disposition_header(value):
2909 """ disposition-type *( ";" parameter )
2910
2911 """
2912 disp_header = ContentDisposition()
2913 if not value:
2914 disp_header.defects.append(errors.HeaderMissingRequiredValue(
2915 "Missing content disposition"))
2916 return disp_header
2917 try:
2918 token, value = get_token(value)
2919 except errors.HeaderParseError:
2920 disp_header.defects.append(errors.InvalidHeaderDefect(
2921 "Expected content disposition but found {!r}".format(value)))
2922 _find_mime_parameters(disp_header, value)
2923 return disp_header
2924 disp_header.append(token)
2925 disp_header.content_disposition = token.value.strip().lower()
2926 if not value:
2927 return disp_header
2928 if value[0] != ';':
2929 disp_header.defects.append(errors.InvalidHeaderDefect(
2930 "Only parameters are valid after content disposition, but "
2931 "found {!r}".format(value)))
2932 _find_mime_parameters(disp_header, value)
2933 return disp_header
2934 disp_header.append(ValueTerminal(';', 'parameter-separator'))
2935 disp_header.append(parse_mime_parameters(value[1:]))
2936 return disp_header
2937
2938def parse_content_transfer_encoding_header(value):
2939 """ mechanism
2940
2941 """
2942 # We should probably validate the values, since the list is fixed.
2943 cte_header = ContentTransferEncoding()
2944 if not value:
2945 cte_header.defects.append(errors.HeaderMissingRequiredValue(
2946 "Missing content transfer encoding"))
2947 return cte_header
2948 try:
2949 token, value = get_token(value)
2950 except errors.HeaderParseError:
2951 cte_header.defects.append(errors.InvalidHeaderDefect(
2952 "Expected content transfer encoding but found {!r}".format(value)))
2953 else:
2954 cte_header.append(token)
2955 cte_header.cte = token.value.strip().lower()
2956 if not value:
2957 return cte_header
2958 while value:
2959 cte_header.defects.append(errors.InvalidHeaderDefect(
2960 "Extra text after content transfer encoding"))
2961 if value[0] in PHRASE_ENDS:
2962 cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2963 value = value[1:]
2964 else:
2965 token, value = get_phrase(value)
2966 cte_header.append(token)
2967 return cte_header
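
# Editorial sketch (illustration only): a routine Content-Transfer-Encoding
# value.  The mechanism name is an assumed example.
def _example_parse_content_transfer_encoding_header():
    cte_header = parse_content_transfer_encoding_header('base64')
    # cte_header.cte == 'base64' with no defects.
    return cte_header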