blob: 26cfa52723fe2db10419f618a259800ffa68316f [file] [log] [blame]
R David Murray0b6f6c82012-05-25 18:42:14 -04001"""Header value parser implementing various email-related RFC parsing rules.
2
3The parsing methods defined in this module implement various email related
parsing rules. Principal among them is RFC 5322, which is the follow-on
5to RFC 2822 and primarily a clarification of the former. It also implements
6RFC 2047 encoded word decoding.
7
8RFC 5322 goes to considerable trouble to maintain backward compatibility with
9RFC 822 in the parse phase, while cleaning up the structure on the generation
10phase. This parser supports correct RFC 5322 generation by tagging white space
11as folding white space only when folding is allowed in the non-obsolete rule
12sets. Actually, the parser is even more generous when accepting input than RFC
135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
14Where possible deviations from the standard are annotated on the 'defects'
15attribute of tokens that deviate.
16
17The general structure of the parser follows RFC 5322, and uses its terminology
18where there is a direct correspondence. Where the implementation requires a
19somewhat different structure than that used by the formal grammar, new terms
20that mimic the closest existing terms are used. Thus, it really helps to have
21a copy of RFC 5322 handy when studying this code.
22
23Input to the parser is a string that has already been unfolded according to
24RFC 5322 rules. According to the RFC this unfolding is the very first step, and
25this parser leaves the unfolding step to a higher level message parser, which
26will have already detected the line breaks that need unfolding while
27determining the beginning and end of each header.
28
29The output of the parser is a TokenList object, which is a list subclass. A
30TokenList is a recursive data structure. The terminal nodes of the structure
31are Terminal objects, which are subclasses of str. These do not correspond
32directly to terminal objects in the formal grammar, but are instead more
33practical higher level combinations of true terminals.
34
35All TokenList and Terminal objects have a 'value' attribute, which produces the
36semantically meaningful value of that part of the parse subtree. The value of
37all whitespace tokens (no matter how many sub-tokens they may contain) is a
38single space, as per the RFC rules. This includes 'CFWS', which is herein
39included in the general class of whitespace tokens. There is one exception to
40the rule that whitespace tokens are collapsed into single spaces in values: in
41the value of a 'bare-quoted-string' (a quoted-string with no leading or
42trailing whitespace), any whitespace that appeared between the quotation marks
43is preserved in the returned value. Note that in all Terminal strings quoted
44pairs are turned into their unquoted values.
45
46All TokenList and Terminal objects also have a string value, which attempts to
47be a "canonical" representation of the RFC-compliant form of the substring that
48produced the parsed subtree, including minimal use of quoted pair quoting.
49Whitespace runs are not collapsed.
50
51Comment tokens also have a 'content' attribute providing the string found
52between the parens (including any nested comments) with whitespace preserved.
53
54All TokenList and Terminal objects have a 'defects' attribute which is a
possibly empty list of all of the defects found while creating the token. Defects
56may appear on any token in the tree, and a composite list of all defects in the
57subtree is available through the 'all_defects' attribute of any node. (For
Terminal nodes x.defects == x.all_defects.)
59
60Each object in a parse tree is called a 'token', and each has a 'token_type'
61attribute that gives the name from the RFC 5322 grammar that it represents.
62Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
63may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
64It is returned in place of lists of (ctext/quoted-pair) and
65(qtext/quoted-pair).
66
67XXX: provide complete list of token types.
68"""
69
import re
import urllib
import urllib.parse  # For urllib.parse.unquote and unquote_to_bytes.

from collections import namedtuple, OrderedDict

from email import _encoded_words as _ew
from email import errors
from email import utils
76
#
# Useful constants and functions
#

# RFC 5322 white space characters (space and horizontal tab).
WSP = set(' \t')
# Characters that can begin CFWS (comment or folding white space).
CFWS_LEADER = WSP | set('(')
# RFC 5322 'specials': delimiters of structured header tokens.
SPECIALS = set(r'()<>@,:;.\"[]')
# Characters that terminate an atom.
ATOM_ENDS = SPECIALS | WSP
# '.' is valid inside dot-atom-text, so it does not end a dot-atom.
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
# MIME (RFC 2045-style) token delimiters as used by this parser.
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
# Characters that terminate a MIME token.
TOKEN_ENDS = TSPECIALS | WSP
# RFC 2231 adds '*', "'", and '%' to the delimiters for attribute names.
ASPECIALS = TSPECIALS | set("*'%")
# Characters that terminate an RFC 2231 attribute name.
ATTRIBUTE_ENDS = ASPECIALS | WSP
# '%' is allowed inside extended (percent-encoded) attribute values.
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
R David Murray0b6f6c82012-05-25 18:42:14 -040093
def quote_string(value):
    """Return *value* as an RFC 5322 quoted-string.

    Backslashes and double quotes in the string form of *value* are
    escaped with quoted-pairs, and the result is wrapped in DQUOTEs.
    """
    escaped = str(value).replace('\\', '\\\\').replace('"', r'\"')
    return '"{}"'.format(escaped)
96
97#
98# Accumulator for header folding
99#
100
class _Folded:
    """Accumulator for the output produced while folding a header.

    maxlen is the maximum allowed line length (may be +inf for "no
    limit"); policy supplies the line separator.  'current' holds the
    string pieces of the line being built, 'done' the completed output,
    and 'lastlen' the length of the current line.  'stickyspace' holds
    pending leading whitespace that should be emitted immediately before
    the next token that is placed.
    """

    def __init__(self, maxlen, policy):
        self.maxlen = maxlen
        self.policy = policy
        self.lastlen = 0
        # Whitespace waiting to be glued in front of the next token.
        self.stickyspace = None
        # True until the first physical line has been completed.
        self.firstline = True
        self.done = []
        self.current = []

    def newline(self):
        # Flush the current line (plus a line separator) into 'done'.
        self.done.extend(self.current)
        self.done.append(self.policy.linesep)
        self.current.clear()
        self.lastlen = 0

    def finalize(self):
        # Flush any partial line left in 'current'.
        if self.current:
            self.newline()

    def __str__(self):
        return ''.join(self.done)

    def append(self, stoken):
        # Unconditionally add the string to the current line.
        self.current.append(stoken)

    def append_if_fits(self, token, stoken=None):
        """Place *token* (string form *stoken*) only if a line can hold it.

        Returns True when the token was placed (possibly after starting a
        new line, or by folding the token itself), False when the caller
        must fold the token.
        """
        if stoken is None:
            stoken = str(token)
        l = len(stoken)
        if self.stickyspace is not None:
            stickyspace_len = len(self.stickyspace)
            # Sticky space plus token fits on the current line.
            if self.lastlen + stickyspace_len + l <= self.maxlen:
                self.current.append(self.stickyspace)
                self.lastlen += stickyspace_len
                self.current.append(stoken)
                self.lastlen += l
                self.stickyspace = None
                self.firstline = False
                return True
            if token.has_fws:
                # The token has internal fold points: absorb its leading
                # whitespace into the sticky space and let it fold itself.
                ws = token.pop_leading_fws()
                if ws is not None:
                    self.stickyspace += str(ws)
                    stickyspace_len += len(ws)
                token._fold(self)
                return True
            if stickyspace_len and l + 1 <= self.maxlen:
                # The token fits on a fresh line; trim the sticky space so
                # the whitespace carried to the new line still fits.
                margin = self.maxlen - l
                if 0 < margin < stickyspace_len:
                    trim = stickyspace_len - margin
                    self.current.append(self.stickyspace[:trim])
                    self.stickyspace = self.stickyspace[trim:]
                    stickyspace_len = trim
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.lastlen = l + stickyspace_len
                self.stickyspace = None
                self.firstline = False
                return True
            if not self.firstline:
                # Doesn't fit anywhere nicely; start a new line and emit the
                # sticky space plus token there anyway.
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.stickyspace = None
                self.firstline = False
                return True
            # NOTE(review): on the first line we fall through with
            # stickyspace still pending, so the checks below may place the
            # token without its leading whitespace — confirm intended.
        if self.lastlen + l <= self.maxlen:
            self.current.append(stoken)
            self.lastlen += l
            return True
        if l < self.maxlen:
            self.newline()
            self.current.append(stoken)
            self.lastlen = l
            return True
        return False
180
181#
182# TokenList and its subclasses
183#
184
class TokenList(list):
    """Base class for all parse-tree nodes: a list of tokens/terminals.

    Subclasses set token_type to the RFC 5322 grammar name they
    represent.  str() of a node is the canonical source form of the
    parsed text; .value is the semantic value (whitespace collapsed as
    described in the module docstring).
    """

    token_type = None

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        # Defects attached directly to this node; see all_defects for the
        # full subtree view.
        self.defects = []

    def __str__(self):
        return ''.join(str(x) for x in self)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                               super().__repr__())

    @property
    def value(self):
        # Semantic value: concatenation of the children's non-empty values.
        return ''.join(x.value for x in self if x.value)

    @property
    def all_defects(self):
        # This node's defects plus those of every descendant.
        return sum((x.all_defects for x in self), self.defects)

    #
    # Folding API
    #
    # parts():
    #
    # return a list of objects that constitute the "higher level syntactic
    # objects" specified by the RFC as the best places to fold a header line.
    # The returned objects must include leading folding white space, even if
    # this means mutating the underlying parse tree of the object.  Each object
    # is only responsible for returning *its* parts, and should not drill down
    # to any lower level except as required to meet the leading folding white
    # space constraint.
    #
    # _fold(folded):
    #
    #   folded: the result accumulator.  This is an instance of _Folded.
    #       (XXX: I haven't finished factoring this out yet, the folding code
    #       pretty much uses this as a state object.)  When the folded.current
    #       contains as much text as will fit, the _fold method should call
    #       folded.newline.
    #   folded.lastlen: the current length of the test stored in folded.current.
    #   folded.maxlen: The maximum number of characters that may appear on a
    #       folded line.  Differs from the policy setting in that "no limit" is
    #       represented by +inf, which means it can be used in the trivially
    #       logical fashion in comparisons.
    #
    # Currently no subclasses implement parts, and I think this will remain
    # true.  A subclass only needs to implement _fold when the generic version
    # isn't sufficient.  _fold will need to be implemented primarily when it is
    # possible for encoded words to appear in the specialized token-list, since
    # there is no generic algorithm that can know where exactly the encoded
    # words are allowed.  A _fold implementation is responsible for filling
    # lines in the same general way that the top level _fold does.  It may, and
    # should, call the _fold method of sub-objects in a similar fashion to that
    # of the top level _fold.
    #
    # XXX: I'm hoping it will be possible to factor the existing code further
    # to reduce redundancy and make the logic clearer.

    @property
    def parts(self):
        """Yield the folding units of this token list.

        Each yielded object begins with any leading folding white space,
        so a line break may be placed before it.
        """
        klass = self.__class__
        this = []
        for token in self:
            if token.startswith_fws():
                if this:
                    yield this[0] if len(this)==1 else klass(this)
                    this.clear()
            end_ws = token.pop_trailing_ws()
            this.append(token)
            if end_ws:
                yield klass(this)
                this = [end_ws]
        if this:
            yield this[0] if len(this)==1 else klass(this)

    def startswith_fws(self):
        # True if the first terminal of the subtree is folding white space.
        return self[0].startswith_fws()

    def pop_leading_fws(self):
        # Remove and return the leading fws token, if any, recursing into
        # the first child when it is itself a token list.
        if self[0].token_type == 'fws':
            return self.pop(0)
        return self[0].pop_leading_fws()

    def pop_trailing_ws(self):
        # Remove and return the trailing cfws token, if any.
        if self[-1].token_type == 'cfws':
            return self.pop(-1)
        return self[-1].pop_trailing_ws()

    @property
    def has_fws(self):
        # True if any descendant contains folding white space.
        for part in self:
            if part.has_fws:
                return True
        return False

    def has_leading_comment(self):
        return self[0].has_leading_comment()

    @property
    def comments(self):
        # Flattened list of the contents of all comments in the subtree.
        comments = []
        for token in self:
            comments.extend(token.comments)
        return comments

    def fold(self, *, policy):
        """Return this value folded into lines according to *policy*."""
        # max_line_length 0/None means no limit, ie: infinitely long.
        maxlen = policy.max_line_length or float("+inf")
        folded = _Folded(maxlen, policy)
        self._fold(folded)
        folded.finalize()
        return str(folded)

    def as_encoded_word(self, charset):
        # This works only for things returned by 'parts', which include
        # the leading fws, if any, that should be used.
        res = []
        ws = self.pop_leading_fws()
        if ws:
            res.append(ws)
        trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
        res.append(_ew.encode(str(self), charset))
        res.append(trailer)
        return ''.join(res)

    def cte_encode(self, charset, policy):
        # Generic CTE encoding: encode each child independently.
        res = []
        for part in self:
            res.append(part.cte_encode(charset, policy))
        return ''.join(res)

    def _fold(self, folded):
        # Generic folding: place each part, CTE-encoding non-ASCII parts,
        # and recurse when a part is too long but has fold points.
        for part in self.parts:
            tstr = str(part)
            tlen = len(tstr)
            try:
                str(part).encode('us-ascii')
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                       for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    # XXX: this should be a policy setting
                    charset = 'utf-8'
                tstr = part.cte_encode(charset, folded.policy)
                tlen = len(tstr)
            if folded.append_if_fits(part, tstr):
                continue
            # Peel off the leading whitespace if any and make it sticky, to
            # avoid infinite recursion.
            ws = part.pop_leading_fws()
            if ws is not None:
                # Peel off the leading whitespace and make it sticky, to
                # avoid infinite recursion.
                # NOTE(review): ws (popped above) is discarded here and a
                # *second* token is popped to become the sticky space; this
                # looks suspicious — confirm whether str(ws) was intended.
                folded.stickyspace = str(part.pop(0))
            if folded.append_if_fits(part):
                continue
            if part.has_fws:
                part._fold(folded)
                continue
            # There are no fold points in this one; it is too long for a single
            # line and can't be split...we just have to put it on its own line.
            folded.append(tstr)
            folded.newline()

    def pprint(self, indent=''):
        # NOTE(review): the indent argument is accepted but not forwarded
        # to _pp — confirm whether that is intentional.
        print('\n'.join(self._pp(indent='')))

    def ppstr(self, indent=''):
        # String form of pprint output (indent likewise unused).
        return '\n'.join(self._pp(indent=''))

    def _pp(self, indent=''):
        # Recursive pretty-print generator used by pprint/ppstr.
        yield '{}{}/{}('.format(
            indent,
            self.__class__.__name__,
            self.token_type)
        for token in self:
            if not hasattr(token, '_pp'):
                yield (indent + '    !! invalid element in token '
                                'list: {!r}'.format(token))
            else:
                for line in token._pp(indent+'    '):
                    yield line
        if self.defects:
            extra = ' Defects: {}'.format(self.defects)
        else:
            extra = ''
        yield '{}){}'.format(indent, extra)
377
378
class WhiteSpaceTokenList(TokenList):
    """A token list that is semantically white space.

    Per the RFC rules, any run of white space (including CFWS) has a
    semantic value of a single space, regardless of its source form.
    """

    @property
    def value(self):
        # All whitespace runs collapse to one space in the semantic value.
        return ' '

    @property
    def comments(self):
        found = []
        for token in self:
            if token.token_type == 'comment':
                found.append(token.content)
        return found
388
389
class UnstructuredTokenList(TokenList):
    """Token list for an unstructured header value (e.g. Subject).

    Responsible for generating RFC 2047 encoded words for non-ASCII
    content when folding or CTE-encoding the value.
    """

    token_type = 'unstructured'

    def _fold(self, folded):
        # If the parsed value already contains encoded words, use the
        # specialized folder so we do not nest or garble them.
        if any(x.token_type=='encoded-word' for x in self):
            return self._fold_encoded(folded)
        # Here we can have either a pure ASCII string that may or may not
        # have surrogateescape encoded bytes, or a unicode string.
        last_ew = None  # index into folded.current of the last encoded word
        for part in self.parts:
            tstr = str(part)
            is_ew = False
            try:
                str(part).encode('us-ascii')
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                       for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    charset = 'utf-8'
                if last_ew is not None:
                    # We've already done an EW, combine this one with it
                    # if there's room.
                    chunk = get_unstructured(
                        ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
                    oldlastlen = sum(len(x) for x in folded.current[:last_ew])
                    schunk = str(chunk)
                    lchunk = len(schunk)
                    if oldlastlen + lchunk <= folded.maxlen:
                        del folded.current[last_ew:]
                        folded.append(schunk)
                        folded.lastlen = oldlastlen + lchunk
                        continue
                tstr = part.as_encoded_word(charset)
                is_ew = True
            if folded.append_if_fits(part, tstr):
                if is_ew:
                    last_ew = len(folded.current) - 1
                continue
            # Fix: compare against None explicitly; last_ew may be the
            # (falsy) index 0 and still mean "we have an encoded word".
            if is_ew or last_ew is not None:
                # It's too big to fit on the line, but since we've
                # got encoded words we can use encoded word folding.
                part._fold_as_ew(folded)
                continue
            # Peel off the leading whitespace if any and make it sticky, to
            # avoid infinite recursion.
            ws = part.pop_leading_fws()
            if ws is not None:
                folded.stickyspace = str(ws)
            if folded.append_if_fits(part):
                continue
            if part.has_fws:
                # Fix: recurse via the internal _fold; fold() is the public
                # entry point taking a keyword-only policy, so calling it
                # with the _Folded accumulator raised TypeError.
                part._fold(folded)
                continue
            # It can't be split...we just have to put it on its own line.
            folded.append(tstr)
            folded.newline()
            last_ew = None

    def cte_encode(self, charset, policy):
        """Return the value as a string with non-ASCII runs CTE-encoded."""
        res = []
        last_ew = None  # index in res just past the last encoded word
        for part in self:
            spart = str(part)
            try:
                spart.encode('us-ascii')
                res.append(spart)
            except UnicodeEncodeError:
                if last_ew is None:
                    res.append(part.cte_encode(charset, policy))
                    last_ew = len(res)
                else:
                    # Combine everything since the last encoded word with
                    # this part into a single encoded word.
                    # Fix: as_encoded_word requires the charset argument
                    # (the original called it with none, a TypeError), and
                    # the combined word must replace the text it absorbs
                    # rather than be appended after it.
                    tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    res[last_ew:] = [tl.as_encoded_word(charset)]
        return ''.join(res)
466
467
class Phrase(TokenList):
    """An RFC 5322 'phrase' (e.g. the display name of an address)."""

    token_type = 'phrase'

    def _fold(self, folded):
        # As with Unstructured, we can have pure ASCII with or without
        # surrogateescape encoded bytes, or we could have unicode.  But this
        # case is more complicated, since we have to deal with the various
        # sub-token types and how they can be composed in the face of
        # unicode-that-needs-CTE-encoding, and the fact that if a token a
        # comment that becomes a barrier across which we can't compose encoded
        # words.
        last_ew = None
        for part in self.parts:
            tstr = str(part)
            tlen = len(tstr)
            has_ew = False
            try:
                str(part).encode('us-ascii')
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                       for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    charset = 'utf-8'
                if last_ew is not None and not part.has_leading_comment():
                    # We've already done an EW, let's see if we can combine
                    # this one with it.  The last_ew logic ensures that all we
                    # have at this point is atoms, no comments or quoted
                    # strings.  So we can treat the text between the last
                    # encoded word and the content of this token as
                    # unstructured text, and things will work correctly.  But
                    # we have to strip off any trailing comment on this token
                    # first, and if it is a quoted string we have to pull out
                    # the content (we're encoding it, so it no longer needs to
                    # be quoted).
                    # NOTE(review): 'remainder' is popped but never used
                    # afterwards, and tstr was computed before the pop, so
                    # the popped text still ends up in the chunk — confirm.
                    if part[-1].token_type == 'cfws' and part.comments:
                        remainder = part.pop(-1)
                    else:
                        remainder = ''
                    for i, token in enumerate(part):
                        if token.token_type == 'bare-quoted-string':
                            part[i] = UnstructuredTokenList(token[:])
                    chunk = get_unstructured(
                        ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
                    schunk = str(chunk)
                    lchunk = len(schunk)
                    # NOTE(review): last_ew is an index into folded.current,
                    # not a length; comparing index+length against maxlen
                    # looks suspicious — confirm this bound is intended.
                    if last_ew + lchunk <= folded.maxlen:
                        del folded.current[last_ew:]
                        folded.append(schunk)
                        folded.lastlen = sum(len(x) for x in folded.current)
                        continue
                tstr = part.as_encoded_word(charset)
                tlen = len(tstr)
                has_ew = True
            if folded.append_if_fits(part, tstr):
                if has_ew and not part.comments:
                    last_ew = len(folded.current) - 1
                elif part.comments or part.token_type == 'quoted-string':
                    # If a comment is involved we can't combine EWs.  And if a
                    # quoted string is involved, it's not worth the effort to
                    # try to combine them.
                    last_ew = None
                continue
            part._fold(folded)

    def cte_encode(self, charset, policy):
        """Return the phrase as a string with non-ASCII runs CTE-encoded."""
        res = []
        last_ew = None
        is_ew = False
        for part in self:
            spart = str(part)
            try:
                spart.encode('us-ascii')
                res.append(spart)
            except UnicodeEncodeError:
                is_ew = True
                if last_ew is None:
                    if not part.comments:
                        last_ew = len(res)
                    res.append(part.cte_encode(charset, policy))
                elif not part.has_leading_comment():
                    # Strip a trailing comment and unwrap quoted strings so
                    # the run can be combined into one encoded word.
                    # NOTE(review): 'remainder' is popped but never re-used,
                    # as in _fold above — confirm.
                    if part[-1].token_type == 'cfws' and part.comments:
                        remainder = part.pop(-1)
                    else:
                        remainder = ''
                    for i, token in enumerate(part):
                        if token.token_type == 'bare-quoted-string':
                            part[i] = UnstructuredTokenList(token[:])
                    tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    res[last_ew:] = [tl.as_encoded_word(charset)]
            if part.comments or (not is_ew and part.token_type == 'quoted-string'):
                last_ew = None
        return ''.join(res)
562
class Word(TokenList):
    """An RFC 5322 'word': an atom or a quoted-string."""

    token_type = 'word'
566
567
class CFWSList(WhiteSpaceTokenList):
    """A run of comments and/or folding white space (RFC 5322 CFWS)."""

    token_type = 'cfws'

    def has_leading_comment(self):
        # True when this CFWS run contains at least one comment.
        return len(self.comments) > 0
574
575
class Atom(TokenList):
    """An RFC 5322 'atom', possibly with surrounding CFWS."""

    token_type = 'atom'
579
580
class Token(TokenList):
    """A MIME (RFC 2045-style) 'token', as used in parameter values."""

    token_type = 'token'
584
585
class EncodedWord(TokenList):
    """An RFC 2047 encoded word.

    cte, charset and lang describe the encoded word's encoding, character
    set and language tag; they default to None and are presumably filled
    in by the parser (confirm in the parsing routines).
    """

    token_type = 'encoded-word'
    cte = None
    charset = None
    lang = None

    @property
    def encoded(self):
        """The RFC 2047 encoded form of this word."""
        if self.cte is not None:
            return self.cte
        # Bug fix: the original computed the encoding but fell through
        # without returning it, so the property yielded None.
        return _ew.encode(str(self), self.charset)
598
599
600
class QuotedString(TokenList):
    """A quoted-string together with any surrounding CFWS."""

    token_type = 'quoted-string'

    @property
    def content(self):
        # The text between the quotes, whitespace preserved; None if no
        # bare-quoted-string child is present.
        return next((token.value for token in self
                     if token.token_type == 'bare-quoted-string'), None)

    @property
    def quoted_value(self):
        # Like .value, but the quoted portion is kept in its quoted form.
        pieces = []
        for token in self:
            if token.token_type == 'bare-quoted-string':
                pieces.append(str(token))
            else:
                pieces.append(token.value)
        return ''.join(pieces)

    @property
    def stripped_value(self):
        # The semantic value without any surrounding CFWS.
        return next((token.value for token in self
                     if token.token_type == 'bare-quoted-string'), None)
626
R David Murray0b6f6c82012-05-25 18:42:14 -0400627
class BareQuotedString(QuotedString):
    """A quoted-string with no leading or trailing CFWS."""

    token_type = 'bare-quoted-string'

    def __str__(self):
        # Re-add the delimiting quotes (and any needed quoted-pairs).
        return quote_string(''.join(str(token) for token in self))

    @property
    def value(self):
        # Exception to the whitespace-collapsing rule: whitespace between
        # the quotes is preserved in the value.
        return ''.join(str(token) for token in self)
638
639
class Comment(WhiteSpaceTokenList):
    """An RFC 5322 comment; semantically white space."""

    token_type = 'comment'

    def __str__(self):
        # Reconstruct the comment with its delimiting parens, re-escaping
        # the characters that are special inside a comment.
        return '(' + ''.join(self.quote(x) for x in self) + ')'

    def quote(self, value):
        """Return the string form of *value* with ctext specials escaped."""
        if value.token_type == 'comment':
            # A nested comment quotes itself via its own __str__.
            return str(value)
        # Bug fix: use raw strings for the paren escapes; '\(' and '\)' are
        # invalid escape sequences (DeprecationWarning, later a SyntaxError).
        return str(value).replace('\\', '\\\\').replace(
                                  '(', r'\(').replace(
                                  ')', r'\)')

    @property
    def content(self):
        # The comment's content without the parens, whitespace preserved.
        return ''.join(str(x) for x in self)

    @property
    def comments(self):
        return [self.content]
665
class AddressList(TokenList):
    """The top level token for an address header: a list of addresses."""

    token_type = 'address-list'

    @property
    def addresses(self):
        return [token for token in self if token.token_type == 'address']

    @property
    def mailboxes(self):
        # All valid mailboxes from every address (groups flattened).
        result = []
        for address in self.addresses:
            result.extend(address.mailboxes)
        return result

    @property
    def all_mailboxes(self):
        # Valid and invalid mailboxes alike.
        result = []
        for address in self.addresses:
            result.extend(address.all_mailboxes)
        return result
683
684
class Address(TokenList):
    """A single address: either a mailbox or a group."""

    token_type = 'address'

    @property
    def display_name(self):
        # Only groups carry a display name at this level.
        head = self[0]
        if head.token_type == 'group':
            return head.display_name

    @property
    def mailboxes(self):
        head = self[0]
        if head.token_type == 'mailbox':
            return [head]
        if head.token_type == 'invalid-mailbox':
            return []
        # Otherwise a group: delegate.
        return head.mailboxes

    @property
    def all_mailboxes(self):
        head = self[0]
        # Both valid and invalid mailboxes are included here.
        if head.token_type in ('mailbox', 'invalid-mailbox'):
            return [head]
        return head.all_mailboxes
709
class MailboxList(TokenList):
    """A comma separated list of mailboxes (possibly with invalid ones)."""

    token_type = 'mailbox-list'

    @property
    def mailboxes(self):
        # Only the well-formed mailboxes.
        return [token for token in self if token.token_type == 'mailbox']

    @property
    def all_mailboxes(self):
        # Well-formed and invalid mailboxes alike.
        wanted = ('mailbox', 'invalid-mailbox')
        return [token for token in self if token.token_type in wanted]
722
723
class GroupList(TokenList):
    """The mailbox list inside a group (between ':' and ';')."""

    token_type = 'group-list'

    @property
    def mailboxes(self):
        if self and self[0].token_type == 'mailbox-list':
            return self[0].mailboxes
        # Empty group, or the content is not a mailbox-list.
        return []

    @property
    def all_mailboxes(self):
        if self and self[0].token_type == 'mailbox-list':
            return self[0].all_mailboxes
        return []
739
740
class Group(TokenList):
    """An RFC 5322 group: display-name ':' [group-list] ';'."""

    token_type = "group"

    @property
    def mailboxes(self):
        # self[2] is the group-list when the group is non-empty.
        inner = self[2]
        return inner.mailboxes if inner.token_type == 'group-list' else []

    @property
    def all_mailboxes(self):
        inner = self[2]
        return inner.all_mailboxes if inner.token_type == 'group-list' else []

    @property
    def display_name(self):
        return self[0].display_name
760
761
class NameAddr(TokenList):
    """An RFC 5322 name-addr: [display-name] angle-addr."""

    token_type = 'name-addr'

    @property
    def display_name(self):
        # A bare angle-addr has no display name.
        return self[0].display_name if len(self) > 1 else None

    @property
    def local_part(self):
        # The angle-addr is always the last child; delegate to it.
        return self[-1].local_part

    @property
    def domain(self):
        return self[-1].domain

    @property
    def route(self):
        return self[-1].route

    @property
    def addr_spec(self):
        return self[-1].addr_spec
787
788
class AngleAddr(TokenList):
    """An addr-spec wrapped in angle brackets, optionally with a route."""

    token_type = 'angle-addr'

    @property
    def local_part(self):
        return next((token.local_part for token in self
                     if token.token_type == 'addr-spec'), None)

    @property
    def domain(self):
        return next((token.domain for token in self
                     if token.token_type == 'addr-spec'), None)

    @property
    def route(self):
        # Only present for the obsolete route-addr form.
        return next((token.domains for token in self
                     if token.token_type == 'obs-route'), None)

    @property
    def addr_spec(self):
        for token in self:
            if token.token_type == 'addr-spec':
                return token.addr_spec
        # An empty angle-addr (the null address) renders as '<>'.
        return '<>'
R David Murray0b6f6c82012-05-25 18:42:14 -0400818
819
class ObsRoute(TokenList):
    """An obsolete route (list of domains) from an obs-angle-addr."""

    token_type = 'obs-route'

    @property
    def domains(self):
        result = []
        for token in self:
            if token.token_type == 'domain':
                result.append(token.domain)
        return result
827
828
class Mailbox(TokenList):
    """A mailbox: either a name-addr or a bare addr-spec."""

    token_type = 'mailbox'

    @property
    def display_name(self):
        # Only the name-addr form can carry a display name.
        head = self[0]
        if head.token_type == 'name-addr':
            return head.display_name

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        return self[0].domain

    @property
    def route(self):
        # Routes exist only in the (obsolete) name-addr form.
        head = self[0]
        if head.token_type == 'name-addr':
            return head.route

    @property
    def addr_spec(self):
        return self[0].addr_spec
854
855
class InvalidMailbox(TokenList):
    """A mailbox that could not be parsed.

    All address accessors uniformly return None.
    """

    token_type = 'invalid-mailbox'

    @property
    def display_name(self):
        return None

    # Alias the remaining address accessors to the same None-returning
    # property.
    local_part = domain = route = addr_spec = display_name
865
866
class Domain(TokenList):
    """The domain portion of an addr-spec."""

    token_type = 'domain'

    @property
    def domain(self):
        # Collapse away any whitespace that appeared inside the domain.
        pieces = super().value.split()
        return ''.join(pieces)
874
875
class DotAtom(TokenList):
    """An RFC 5322 'dot-atom': dot-atom-text with optional CFWS."""

    token_type = 'dot-atom'
879
880
class DotAtomText(TokenList):
    """The bare dotted text of a dot-atom, with no CFWS."""

    token_type = 'dot-atom-text'
884
885
class AddrSpec(TokenList):
    """An RFC 5322 addr-spec: local-part [ '@' domain ]."""

    token_type = 'addr-spec'

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        # A domain is present only when we have local-part, '@' and domain.
        if len(self) < 3:
            return None
        return self[-1].domain

    @property
    def value(self):
        if len(self) < 3:
            return self[0].value
        # Strip whitespace around the '@' to produce a clean addr-spec.
        return self[0].value.rstrip() + self[1].value + self[2].value.lstrip()

    @property
    def addr_spec(self):
        # Quote the local part if it contains any character that would end
        # a dot-atom.
        if set(self.local_part) & DOT_ATOM_ENDS:
            lp = quote_string(self.local_part)
        else:
            lp = self.local_part
        domain = self.domain
        return lp if domain is None else lp + '@' + domain
916
917
class ObsLocalPart(TokenList):
    """An obsolete-syntax (RFC 5322 obs-local-part) local part."""

    token_type = 'obs-local-part'
921
922
class DisplayName(Phrase):
    """The display name of an address (a specialized phrase)."""

    token_type = 'display-name'

    @property
    def display_name(self):
        """The display name with leading/trailing CFWS stripped."""
        # Work on a shallow copy so the parse tree is not mutated.
        res = TokenList(self)
        if res[0].token_type == 'cfws':
            res.pop(0)
        else:
            # Leading CFWS may be nested inside the first child.
            if res[0][0].token_type == 'cfws':
                res[0] = TokenList(res[0][1:])
        if res[-1].token_type == 'cfws':
            res.pop()
        else:
            if res[-1][-1].token_type == 'cfws':
                res[-1] = TokenList(res[-1][:-1])
        return res.value

    @property
    def value(self):
        # If the phrase had defects or contained a quoted string, render
        # the whole display name as a single quoted string.
        quote = False
        if self.defects:
            quote = True
        else:
            for x in self:
                if x.token_type == 'quoted-string':
                    quote = True
        if quote:
            # Preserve a single space where surrounding CFWS existed.
            pre = post = ''
            if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
                pre = ' '
            if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
                post = ' '
            return pre+quote_string(self.display_name)+post
        else:
            return super().value
960
961
class LocalPart(TokenList):
    """The local part of an addr-spec."""

    token_type = 'local-part'

    @property
    def value(self):
        # Quoted strings keep their quotes in the local-part value.
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        # Strip whitespace from front, back, and around dots.
        # DOT is a module-level Terminal defined elsewhere in this module;
        # it is used as a sentinel bracketing the token sequence.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            if tok.token_type == 'cfws':
                continue
            # Drop trailing CFWS of the previous token list when it is
            # immediately followed by a dot.
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            # Drop leading CFWS of a token list that follows a dot.
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        # Remove the bracketing sentinels before computing the value.
        res = TokenList(res[1:-1])
        return res.value
995
996
class DomainLiteral(TokenList):
    """An RFC 5322 domain-literal (e.g. '[192.168.0.1]')."""

    token_type = 'domain-literal'

    @property
    def domain(self):
        # Collapse away any whitespace inside the literal.
        pieces = super().value.split()
        return ''.join(pieces)

    @property
    def ip(self):
        # The text between the brackets (the ptext token), if any.
        return next((token.value for token in self
                     if token.token_type == 'ptext'), None)
1010
1011
class MIMEVersion(TokenList):
    """The parsed value of a MIME-Version header (e.g. '1.0')."""

    token_type = 'mime-version'
    # Integer version components; None until set — presumably filled in
    # by the MIME-Version parsing routine (confirm there).
    major = None
    minor = None
1017
1018
class Parameter(TokenList):
    """A single MIME header parameter (attribute [*section] = value)."""

    token_type = 'parameter'
    # True when the parameter carries an RFC 2231 section number (name*N).
    sectioned = False
    # True when the parameter uses the RFC 2231 extended (percent-encoded)
    # value syntax (name*= or name*N*=).
    extended = False
    charset = 'us-ascii'

    @property
    def section_number(self):
        # Because the first token, the attribute (name) eats CFWS, the second
        # token is always the section if there is one.
        return self[1].number if self.sectioned else 0

    @property
    def param_value(self):
        # This is part of the "handle quoted extended parameters" hack.
        # The value token may appear directly, or nested inside a
        # quoted-string / bare-quoted-string.
        for token in self:
            if token.token_type == 'value':
                return token.stripped_value
            if token.token_type == 'quoted-string':
                for token in token:
                    if token.token_type == 'bare-quoted-string':
                        for token in token:
                            if token.token_type == 'value':
                                return token.stripped_value
        return ''
1045
1046
class InvalidParameter(Parameter):
    """A MIME parameter whose content could not be parsed as valid."""

    token_type = 'invalid-parameter'
1050
1051
class Attribute(TokenList):
    """The attribute (name) part of a MIME parameter."""

    token_type = 'attribute'

    @property
    def stripped_value(self):
        # The name proper lives in the (possibly extended-) attrtext token.
        return next((token.value for token in self
                     if token.token_type.endswith('attrtext')), None)
1061
class Section(TokenList):
    """An RFC 2231 parameter section marker ('*N')."""

    token_type = 'section'
    # The integer section number; None until set — presumably filled in by
    # the section parsing routine (confirm there).
    number = None
1066
1067
class Value(TokenList):
    """The value part of a MIME parameter."""

    token_type = 'value'

    @property
    def stripped_value(self):
        # Skip an initial CFWS token, then delegate to the first real token
        # when it knows how to strip itself.
        token = self[1] if self[0].token_type == 'cfws' else self[0]
        strippable = ('quoted-string', 'attribute', 'extended-attribute')
        if token.token_type.endswith(strippable):
            return token.stripped_value
        return self.value
1081
1082
class MimeParameters(TokenList):
    """The parameter list of a parameterized MIME header.

    Handles RFC 2231 sectioned and extended parameters (name*0*=, ...),
    reassembling the sections and decoding percent-encoded values.
    """

    token_type = 'mime-parameters'

    @property
    def params(self):
        """Yield (name, decoded_value) pairs for each parameter."""
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = OrderedDict()
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            # Sort the sections of a multi-section value into order.
            parts = sorted(parts)
            # XXX: there might be more recovery we could do here if, for
            # example, this is really a case of a duplicate attribute name.
            value_parts = []
            # The charset declared on the first section governs all sections.
            charset = parts[0][1].charset
            for i, (section_number, param) in enumerate(parts):
                if section_number != i:
                    param.defects.append(errors.InvalidHeaderDefect(
                        "inconsistent multipart parameter numbering"))
                value = param.param_value
                if param.extended:
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        # Render with a leading space before the first parameter, or as the
        # empty string when there are no parameters.
        return ' ' + params if params else ''
1147
1148
class ParameterizedHeaderValue(TokenList):
    """Common behavior for header values that carry mime-parameters."""

    @property
    def params(self):
        """Return the params of the last mime-parameters child, or {}."""
        return next(
            (tok.params for tok in reversed(self)
                if tok.token_type == 'mime-parameters'),
            {})

    @property
    def parts(self):
        if not self or self[-1].token_type != 'mime-parameters':
            return TokenList(self).parts
        # We don't want to start a new line if all of the params don't fit
        # after the value, so unwrap the parameter list.
        return TokenList(self[:-1] + self[-1])
1165
1166
class ContentType(ParameterizedHeaderValue):
    """Parsed value of a Content-Type header."""

    token_type = 'content-type'
    # Class-level defaults (text/plain).  NOTE(review): presumably
    # overwritten on instances by the parser outside this view.
    maintype = 'text'
    subtype = 'plain'
1172
1173
class ContentDisposition(ParameterizedHeaderValue):
    """Parsed value of a Content-Disposition header."""

    token_type = 'content-disposition'
    # Disposition keyword; None until set by the parser (outside this view).
    content_disposition = None
1178
1179
class ContentTransferEncoding(TokenList):
    """Parsed value of a Content-Transfer-Encoding header."""

    token_type = 'content-transfer-encoding'
    # Default encoding; presumably replaced when a value parses -- confirm.
    cte = '7bit'
1184
1185
class HeaderLabel(TokenList):
    """The header field name portion of a header line."""

    token_type = 'header-label'
1190
class Header(TokenList):
    """A complete header: a header-label child followed by the value.

    The token list is consumed destructively by _fold (each child is
    popped), so a Header can only be folded once.
    """

    token_type = 'header'

    def _fold(self, folded):
        # First child is the header-label; emit it verbatim.
        folded.append(str(self.pop(0)))
        folded.lastlen = len(folded.current[0])
        # The first line of the header is different from all others: we don't
        # want to start a new object on a new line if it has any fold points in
        # it that would allow part of it to be on the first header line.
        # Further, if the first fold point would fit on the new line, we want
        # to do that, but if it doesn't we want to put it on the first line.
        # Folded supports this via the stickyspace attribute.  If this
        # attribute is not None, it does the special handling.
        folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
        rest = self.pop(0)
        if self:
            # Exactly label [+ cfws] + value is expected; anything more is
            # a programming error upstream.
            raise ValueError("Malformed Header token list")
        rest._fold(folded)
1210
1211
1212#
1213# Terminal classes and instances
1214#
1215
class Terminal(str):
    """Base class for the leaf nodes of the parse tree.

    A Terminal is a str subclass that additionally carries a token_type
    tag and its own list of defects.
    """

    def __new__(cls, value, token_type):
        inst = super().__new__(cls, value)
        inst.token_type = token_type
        inst.defects = []
        return inst

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, super().__repr__())

    @property
    def all_defects(self):
        # Terminals have no children, so only their own defects apply.
        return list(self.defects)

    def _pp(self, indent=''):
        # One-line pretty-print fragment, as a single-element list.
        defect_note = ' {}'.format(self.defects) if self.defects else ''
        return ["{}{}/{}({}){}".format(indent,
                                       self.__class__.__name__,
                                       self.token_type,
                                       super().__repr__(),
                                       defect_note)]

    def cte_encode(self, charset, policy):
        # Pure-ASCII text passes through unchanged; anything else becomes
        # an RFC 2047 encoded word.
        value = str(self)
        try:
            value.encode('us-ascii')
        except UnicodeEncodeError:
            return _ew.encode(value, charset)
        return value

    def pop_trailing_ws(self):
        # Terminates the recursion from the TokenList equivalent.
        return None

    def pop_leading_fws(self):
        # Terminates the recursion from the TokenList equivalent.
        return None

    @property
    def comments(self):
        return []

    def has_leading_comment(self):
        return False

    def __getnewargs__(self):
        # Ensures pickling/copying reconstructs the token_type tag.
        return (str(self), self.token_type)
1265
1266
class WhiteSpaceTerminal(Terminal):
    """A terminal consisting of whitespace.

    Its semantic value is a single space regardless of the source text.
    """

    @property
    def value(self):
        return ' '

    def startswith_fws(self):
        return True

    # Folding whitespace is present by definition.
    has_fws = True
1277
1278
class ValueTerminal(Terminal):
    """A terminal carrying ordinary (non-whitespace) token text."""

    @property
    def value(self):
        # The token is its own value (it is a str subclass).
        return self

    def startswith_fws(self):
        return False

    has_fws = False

    def as_encoded_word(self, charset):
        # Render this token as an RFC 2047 encoded word in charset.
        return _ew.encode(str(self), charset)
1292
1293
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """Whitespace between two encoded words.

    Per RFC 2047, whitespace separating adjacent encoded words is
    discarded on decode, so both value and str() are the empty string;
    the original source text is preserved as 'encoded'.
    """

    @property
    def value(self):
        return ''

    @property
    def encoded(self):
        # The original source text (a copy of the underlying str).
        return self[:]

    def __str__(self):
        return ''

    has_fws = True
1308
1309
# Singleton terminals shared by the parser.  XXX these need to become
# classes and used as instances so that a program can't change them in a
# parse tree and screw up other parse trees.  Maybe should have tests for
# that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
1316
1317#
1318# Parser
1319#
1320
1321"""Parse strings according to RFC822/2047/2822/5322 rules.
1322
1323This is a stateless parser. Each get_XXX function accepts a string and
1324returns either a Terminal or a TokenList representing the RFC object named
1325by the method and a string containing the remaining unparsed characters
1326from the input. Thus a parser method consumes the next syntactic construct
1327of a given type and returns a token representing the construct plus the
1328unparsed remainder of the input string.
1329
1330For example, if the first element of a structured header is a 'phrase',
1331then:
1332
1333 phrase, value = get_phrase(value)
1334
1335returns the complete phrase from the start of the string value, plus any
1336characters left in the string after the phrase is removed.
1337
1338"""
1339
# Pre-compiled scanners used by the get_* parsers below.  The ']' that may
# appear inside a character class must be backslash-escaped; use raw
# strings (r'\]') for the replacement text -- the previous '\]' in a
# non-raw string relied on an invalid escape sequence, which Python
# flags with a DeprecationWarning (and later a SyntaxWarning).
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
                                    '\\','\\\\').replace(']',r'\]'))).match
R David Murray0b6f6c82012-05-25 18:42:14 -04001351
def _validate_xtext(xtext):
    """Register defects on *xtext* for non-printables or surrogates.

    If the token contains ASCII non-printables a NonPrintableDefect is
    recorded; if it contains surrogate-escaped bytes an
    UndecodableBytesDefect is recorded.  The token is modified in place.
    """
    bad_chars = _non_printable_finder(xtext)
    if bad_chars:
        xtext.defects.append(errors.NonPrintableDefect(bad_chars))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))
1361
def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext-with-quoted-printables into a single string by unquoting any
    quoted printables.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted printables decoded.

    """
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False
    had_qp = False
    for pos in range(len(fragment)):
        if fragment[pos] == '\\':
            if escape:
                escape = False
                had_qp = True
            else:
                escape = True
                continue
        if escape:
            escape = False
        elif fragment[pos] in endchars:
            break
        vchars.append(fragment[pos])
    else:
        # The loop ran to completion (or fragment was empty), so the whole
        # fragment was consumed.  Use len(fragment) rather than pos + 1:
        # when fragment is '', pos is never bound and 'pos + 1' raised
        # UnboundLocalError.
        pos = len(fragment)
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1391
def _decode_ew_run(value):
    """ Decode a run of RFC2047 encoded words.

    _decode_ew_run(value) -> (text, value, defects)

    Scans the supplied value for a run of tokens that look like they are RFC
    2047 encoded words, decodes those words into text according to RFC 2047
    rules (whitespace between encoded words is discarded), and returns the text
    and the remaining value (including any leading whitespace on the remaining
    value), as well as a list of any defects encountered while decoding.  The
    input value may not have any leading whitespace.

    """
    decoded = []
    defects = []
    trailing_ws = ''
    while value:
        # maxsplit=1 yields [tok] when there is no whitespace left, or
        # [tok, ws, rest] when there is.
        pieces = _wsp_splitter(value, 1)
        if len(pieces) == 3:
            word, ws, value = pieces
        else:
            word, ws, value = pieces[0], '', ''
        if not (word.startswith('=?') and word.endswith('?=')):
            # Not an encoded word: the run ends here; hand back everything
            # from the preceding whitespace onward.
            return ''.join(decoded), trailing_ws + word + ws + value, defects
        text, charset, lang, new_defects = _ew.decode(word)
        decoded.append(text)
        defects.extend(new_defects)
        trailing_ws = ws
    return ''.join(decoded), trailing_ws, defects
1420
def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens where
    folding can be done, but when we are parsing the *un*folding has already
    been done so we don't need to watch out for CRLF.

    """
    rest = value.lstrip()
    ws_len = len(value) - len(rest)
    return WhiteSpaceTerminal(value[:ws_len], 'fws'), rest
1432
def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    Returns an EncodedWord token plus the unparsed remainder.  Raises
    HeaderParseError if value does not start with a decodable encoded word.
    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        # No '?=' terminator was found at all.
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if remstr[:2].isdigit():
        # NOTE(review): presumably the '?=' we split on was actually part
        # of the encoded text (the remainder starting with digits being
        # the tell); re-attach it and split on the next '?=' -- confirm.
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except ValueError:
        raise errors.HeaderParseError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    while text:
        # Re-tokenize the decoded text into fws/vtext children.
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    return ew, value
1473
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

    obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except errors.HeaderParseError:
                # Not a valid encoded word; fall through and treat the
                # run as ordinary vtext.
                pass
            else:
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        # Whitespace between two encoded words is elided
                        # on decode (RFC 2047); retag it accordingly.
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        tok, *remainder = _wsp_splitter(value, 1)
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured
1527
def get_qp_ctext(value):
    """ctext = <printable ascii except backslash ( )>

    This is not the RFC ctext, since we are handling nested comments in
    comment and unquoting quoted-pairs here.  We allow anything except the
    '()' characters, but if we find any ASCII other than the RFC defined
    printable ASCII a NonPrintableDefect is added to the token's defects
    list.  Since quoted pairs are converted to their unquoted values, what
    is returned is a 'ptext' token.  In this case it is a
    WhiteSpaceTerminal, so its value is ' '.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '()')
    token = WhiteSpaceTerminal(ptext, 'ptext')
    _validate_xtext(token)
    return token, value
1544
def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to
    their unquoted values, so what is returned is a 'ptext' token.  In this
    case it is a ValueTerminal.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '"')
    token = ValueTerminal(ptext, 'ptext')
    _validate_xtext(token)
    return token, value
1559
def get_atext(value):
    """atext = <matches _non_atom_end_matcher>

    We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
    the token's defects list if we find non-atext characters.
    """
    match = _non_atom_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    text = match.group()
    atext = ValueTerminal(text, 'atext')
    _validate_xtext(atext)
    return atext, value[len(text):]
1575
def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.
    """
    # NOTE(review): assumes value is non-empty; '' would raise IndexError
    # here rather than HeaderParseError -- confirm callers guarantee this.
    if value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    if not value:
        # Ran off the end of the header without a closing quote.
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]
1599
def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    We handle nested comments here, and quoted-pair in our qp-ctext routine.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    comment = Comment()
    value = value[1:]
    while value:
        if value[0] == ')':
            # Closing paren: comment complete, consume it.
            return comment, value[1:]
        if value[0] in WSP:
            tok, value = get_fws(value)
        elif value[0] == '(':
            # Nested comment.
            tok, value = get_comment(value)
        else:
            tok, value = get_qp_ctext(value)
        comment.append(tok)
    # Input exhausted before the closing paren.
    comment.defects.append(errors.InvalidHeaderDefect(
        "end of header inside comment"))
    return comment, value
1624
def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    """
    cfws = CFWSList()
    while value and value[0] in CFWS_LEADER:
        # Whitespace and comments may alternate freely.
        getter = get_fws if value[0] in WSP else get_comment
        piece, value = getter(value)
        cfws.append(piece)
    return cfws, value
1637
def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is an intermediate class defined by this
    parser and not by the RFC grammar.  It is the quoted string
    without any attached CFWS.
    """
    quoted_string = QuotedString()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        quoted_string.append(cfws)
    bare, value = get_bare_quoted_string(value)
    quoted_string.append(bare)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        quoted_string.append(cfws)
    return quoted_string, value
1655
def get_atom(value):
    """ atom = [CFWS] 1*atext [CFWS]

    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        atom.append(cfws)
    if value and value[0] in ATOM_ENDS:
        # CFWS ran straight into a special: there is no atext here.
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    text, value = get_atext(value)
    atom.append(text)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        atom.append(cfws)
    return atom, value
1673
def get_dot_atom_text(value):
    """ dot-text = 1*atext *("." 1*atext)

    """
    dot_atom_text = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at a start of "
            "dot-atom-text but found '{}'".format(value))
    while value and value[0] not in ATOM_ENDS:
        token, value = get_atext(value)
        dot_atom_text.append(token)
        if value and value[0] == '.':
            dot_atom_text.append(DOT)
            value = value[1:]
    # A trailing DOT means the text ended with '.' and no atext after it,
    # which the grammar forbids.
    if dot_atom_text[-1] is DOT:
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return dot_atom_text, value
1692
def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    """
    dot_atom = DotAtom()
    if value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        dot_atom.append(cfws)
    text, value = get_dot_atom_text(value)
    dot_atom.append(text)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        dot_atom.append(cfws)
    return dot_atom, value
1707
def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off
    this CFWS first to determine which type of word to parse.  Afterward we
    splice the leading CFWS, if any, into the parsed sub-token.

    If neither an atom or a quoted-string is found before the next special,
    a HeaderParseError is raised.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in
    the parse tree; this is because having that extra layer when
    manipulating the parse tree is more confusing than it is helpful.

    """
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if value[0] == '"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    if leader is not None:
        # Splice the CFWS we peeled off back onto the front of the token.
        token[:0] = [leader]
    return token, value
1738
def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    This means a phrase can be a sequence of words, periods, and CFWS in any
    order as long as it starts with at least one word.  If anything other
    than words is detected, an ObsoleteHeaderDefect is added to the token's
    defect list.  We also accept a phrase that starts with CFWS followed by
    a dot; this is registered as an InvalidHeaderDefect, since it is not
    supported by even the obsolete grammar.

    """
    phrase = Phrase()
    try:
        token, value = get_word(value)
        phrase.append(token)
    except errors.HeaderParseError:
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    while value and value[0] not in PHRASE_ENDS:
        if value[0]=='.':
            # Bare periods are only valid in the obsolete grammar.
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
        else:
            try:
                token, value = get_word(value)
            except errors.HeaderParseError:
                if value[0] in CFWS_LEADER:
                    # CFWS with no following atom: obsolete syntax.
                    token, value = get_cfws(value)
                    phrase.defects.append(errors.ObsoleteHeaderDefect(
                        "comment found without atom"))
                else:
                    raise
            phrase.append(token)
    return phrase, value
1776
def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    """
    local_part = LocalPart()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            # Start of an obs-local-part; parsed below.
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        # More text follows the word: re-parse the whole thing as an
        # obs-local-part and replace what we have so far.
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters)"))
    return local_part, value
1815
def get_obs_local_part(value):
    """ obs-local-part = word *("." word)
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            # A backslash here can't start a quoted pair; record it as a
            # misplaced special and keep going.
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    # NOTE(review): if the loop consumed nothing, the indexing below raises
    # IndexError -- confirm callers only invoke this with parsable input.
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type=='cfws' and
            obs_local_part[1].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type=='cfws' and
            obs_local_part[-2].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value
1862
def get_dtext(value):
    """ dtext = <printable ascii except backslash [ ]> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    We allow anything except the excluded characters, but if we find any
    ASCII other than the RFC defined printable ASCII a NonPrintableDefect
    is added to the token's defects list.  Quoted pairs are converted to
    their unquoted values, so what is returned is a ptext token, in this
    case a ValueTerminal.  If there were quoted-printables, an
    ObsoleteHeaderDefect is added to the returned token's defect list.

    """
    ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
    token = ValueTerminal(ptext, 'ptext')
    if had_qp:
        # Quoted pairs in dtext only exist in the obsolete grammar.
        token.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(token)
    return token, value
1882
def _check_for_early_dl_end(value, domain_literal):
    """Handle input that ends in the middle of a domain-literal.

    If value is exhausted, record an InvalidHeaderDefect on
    domain_literal, append a synthetic closing ']' token so the parse
    tree is well formed, and return True; otherwise return False.
    """
    if value:
        return False
    # Record the defect on the defects list.  The previous code appended
    # the defect object to the token list itself, which corrupts the parse
    # tree: str() and all_defects on the list would then encounter a
    # non-token entry.
    domain_literal.defects.append(errors.InvalidHeaderDefect(
        "end of input inside domain-literal"))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    return True
1890
def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    """
    domain_literal = DomainLiteral()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
                "but found '{}'".format(value))
    value = value[1:]
    # After each consuming step, check whether the input ran out early;
    # the helper records the defect and closes the literal if so.
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    token, value = get_dtext(value)
    domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
                "but found '{}'".format(value))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    return domain_literal, value
1929
def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom))

    """
    domain = Domain()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    if value[0] == '[':
        token, value = get_domain_literal(value)
        if leader is not None:
            token[:0] = [leader]
        domain.append(token)
        return domain, value
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        token, value = get_atom(value)
    if leader is not None:
        token[:0] = [leader]
    domain.append(token)
    if value and value[0] == '.':
        # A '.' remains after dot-atom parsing: obsolete atom *("." atom)
        # syntax; flatten the dot-atom and continue consuming atoms.
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            token, value = get_atom(value[1:])
            domain.append(token)
    return domain, value
1965
def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    Returns an AddrSpec token; if no '@' follows the local-part, an
    InvalidHeaderDefect is recorded and only the local-part is present.
    """
    addr_spec = AddrSpec()
    token, value = get_local_part(value)
    addr_spec.append(token)
    if not value or value[0] != '@':
        # Defect message previously read "add-spec"; fixed to the RFC
        # term "addr-spec".
        addr_spec.defects.append(errors.InvalidHeaderDefect(
            "addr-spec local part with no domain"))
        return addr_spec, value
    addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    token, value = get_domain(value[1:])
    addr_spec.append(token)
    return addr_spec, value
1981
def get_obs_route(value):
    """ obs-route = obs-domain-list ":"
        obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])

    Returns an obs-route token with the appropriate sub-tokens (that is,
    there is no obs-domain-list in the parse tree).
    """
    obs_route = ObsRoute()
    # Leading CFWS and bare commas are permitted before the first '@'.
    while value and (value[0]==',' or value[0] in CFWS_LEADER):
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        elif value[0] == ',':
            obs_route.append(ListSeparator)
            value = value[1:]
    if not value or value[0] != '@':
        raise errors.HeaderParseError(
            "expected obs-route domain but found '{}'".format(value))
    obs_route.append(RouteComponentMarker)
    token, value = get_domain(value[1:])
    obs_route.append(token)
    # Subsequent components are comma separated and individually optional.
    while value and value[0]==',':
        obs_route.append(ListSeparator)
        value = value[1:]
        if not value:
            break
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        if value[0] == '@':
            obs_route.append(RouteComponentMarker)
            token, value = get_domain(value[1:])
            obs_route.append(token)
    if not value:
        raise errors.HeaderParseError("end of header while parsing obs-route")
    if value[0] != ':':
        raise errors.HeaderParseError( "expected ':' marking end of "
            "obs-route but found '{}'".format(value))
    obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
    return obs_route, value[1:]
2022
def get_angle_addr(value):
    """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
        obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]

    """
    angle_addr = AngleAddr()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    if not value or value[0] != '<':
        raise errors.HeaderParseError(
            "expected angle-addr but found '{}'".format(value))
    angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
    value = value[1:]
    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
    # circumstances.
    if value[0] == '>':
        angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "null addr-spec in angle-addr"))
        value = value[1:]
        return angle_addr, value
    try:
        token, value = get_addr_spec(value)
    except errors.HeaderParseError:
        # Not a plain addr-spec: try the obsolete route form
        # ("<@a,@b:user@example.com>").
        try:
            token, value = get_obs_route(value)
            angle_addr.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete route specification in angle-addr"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected addr-spec or obs-route but found '{}'".format(value))
        angle_addr.append(token)
        token, value = get_addr_spec(value)
    angle_addr.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        # Tolerate a missing '>', but record the defect; the synthetic
        # token keeps the parse tree well formed.
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on angle-addr"))
    angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    return angle_addr, value
2068
def get_display_name(value):
    """ display-name = phrase

    A display-name is simply a name for the phrase rule, so instead of
    wrapping the Phrase token we copy its children (and its defects)
    directly into a DisplayName token.

    """
    phrase, rest = get_phrase(value)
    result = DisplayName()
    result.extend(list(phrase))
    result.defects = list(phrase.defects)
    return result, rest
2082
2083
def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    Parse a name-addr from the front of *value*, returning a
    (NameAddr, remainder) tuple.

    Raises: errors.HeaderParseError if no display-name/angle-addr can be
    parsed, or if the header ends prematurely.
    """
    name_addr = NameAddr()
    # Both the optional display name and the angle-addr can start with cfws.
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        # Not straight into the angle-addr, so a display-name must come first.
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        token, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(token))
        if leader is not None:
            # Splice the leading CFWS into the front of the display name's
            # first sub-token so it is preserved in the parse tree.
            token[0][:0] = [leader]
            leader = None
        name_addr.append(token)
    token, value = get_angle_addr(value)
    if leader is not None:
        # No display name: the CFWS belongs to the angle-addr instead.
        token[:0] = [leader]
    name_addr.append(token)
    return name_addr, value
2113
def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    The two alternatives can only be distinguished by attempting each
    parse in turn; name-addr is tried first, then addr-spec.

    """
    mailbox = Mailbox()
    for parser in (get_name_addr, get_addr_spec):
        try:
            token, value = parser(value)
        except errors.HeaderParseError:
            continue
        break
    else:
        raise errors.HeaderParseError(
            "expected mailbox but found '{}'".format(value))
    # If the sub-parse recorded any invalid-value defects, downgrade the
    # whole mailbox so higher layers treat it accordingly.
    if any(isinstance(defect, errors.InvalidHeaderDefect)
           for defect in token.all_defects):
        mailbox.token_type = 'invalid-mailbox'
    mailbox.append(token)
    return mailbox, value
2134
def get_invalid_mailbox(value, endchars):
    """ Read everything up to one of the chars in endchars.

    This is outside the formal grammar and exists for error recovery.
    The returned InvalidMailbox TokenList acts like a Mailbox, but its
    data attributes are None.

    """
    invalid_mailbox = InvalidMailbox()
    while value:
        if value[0] in endchars:
            break
        if value[0] in PHRASE_ENDS:
            # A special that get_phrase cannot consume; keep it verbatim.
            invalid_mailbox.append(ValueTerminal(value[0],
                                                 'misplaced-special'))
            value = value[1:]
        else:
            phrase, value = get_phrase(value)
            invalid_mailbox.append(phrase)
    return invalid_mailbox, value
2152
def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])

    For this routine we go outside the formal grammar in order to improve error
    handling.  We recognize the end of the mailbox list only at the end of the
    value or at a ';' (the group terminator).  This is so that we can turn
    invalid mailboxes into InvalidMailbox tokens and continue parsing any
    remaining valid mailboxes.  We also allow all mailbox entries to be null,
    and this condition is handled appropriately at a higher level.

    """
    mailbox_list = MailboxList()
    while value and value[0] != ';':
        try:
            token, value = get_mailbox(value)
            mailbox_list.append(token)
        except errors.HeaderParseError:
            # Not a parseable mailbox: decide between an empty (obsolete)
            # element and an invalid mailbox, preserving any leading CFWS.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] in ',;':
                    # CFWS-only element: obsolete but recoverable.
                    mailbox_list.append(leader)
                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                        "empty element in mailbox-list"))
                else:
                    token, value = get_invalid_mailbox(value, ',;')
                    if leader is not None:
                        token[:0] = [leader]
                    mailbox_list.append(token)
                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid mailbox in mailbox-list"))
            elif value[0] == ',':
                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                token, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    token[:0] = [leader]
                mailbox_list.append(token)
                mailbox_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = mailbox_list[-1]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',;')
            mailbox.extend(token)
            mailbox_list.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            mailbox_list.append(ListSeparator)
            value = value[1:]
    return mailbox_list, value
2208
2209
def get_group_list(value):
    """ group-list = mailbox-list / CFWS / obs-group-list
        obs-group-list = 1*([CFWS] ",") [CFWS]

    Parse a group-list from the front of *value*, returning a
    (GroupList, remainder) tuple.  Missing content is recorded as a
    defect rather than raised, since this is only called from inside a
    group where error recovery is preferred.
    """
    group_list = GroupList()
    if not value:
        group_list.defects.append(errors.InvalidHeaderDefect(
            "end of header before group-list"))
        return group_list, value
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            # This should never happen in email parsing, since CFWS-only is a
            # legal alternative to group-list in a group, which is the only
            # place group-list appears.
            group_list.defects.append(errors.InvalidHeaderDefect(
                "end of header in group-list"))
            group_list.append(leader)
            return group_list, value
        if value[0] == ';':
            # CFWS-only group-list (the CFWS alternative of the grammar).
            group_list.append(leader)
            return group_list, value
    token, value = get_mailbox_list(value)
    if len(token.all_mailboxes)==0:
        # Only separators/CFWS were found: the obs-group-list case.
        if leader is not None:
            group_list.append(leader)
        group_list.extend(token)
        group_list.defects.append(errors.ObsoleteHeaderDefect(
            "group-list with empty entries"))
        return group_list, value
    if leader is not None:
        # Splice the leading CFWS onto the front of the mailbox-list.
        token[:0] = [leader]
    group_list.append(token)
    return group_list, value
2246
def get_group(value):
    """ group = display-name ":" [group-list] ";" [CFWS]

    Parse a group from the front of *value*, returning a (Group,
    remainder) tuple.

    Raises: errors.HeaderParseError if the display name is not followed
    by ':', or if text other than the ';' terminator follows the
    group-list.
    """
    group = Group()
    token, value = get_display_name(value)
    if not value or value[0] != ':':
        raise errors.HeaderParseError("expected ':' at end of group "
            "display name but found '{}'".format(value))
    group.append(token)
    group.append(ValueTerminal(':', 'group-display-name-terminator'))
    value = value[1:]
    if value and value[0] == ';':
        # Empty group: no group-list at all.
        group.append(ValueTerminal(';', 'group-terminator'))
        return group, value[1:]
    token, value = get_group_list(value)
    group.append(token)
    if not value:
        # Bug fix: this used to fall through to the value[0] test below,
        # raising IndexError when the header ended before the ';'
        # terminator; now we just record the defect and continue.
        group.defects.append(errors.InvalidHeaderDefect(
            "end of header in group"))
    elif value[0] != ';':
        raise errors.HeaderParseError(
            "expected ';' at end of group but found {}".format(value))
    group.append(ValueTerminal(';', 'group-terminator'))
    value = value[1:]          # slicing '' yields '', so safe when empty
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        group.append(token)
    return group, value
2276
def get_address(value):
    """ address = mailbox / group

    Note that counter-intuitively, an address can be either a single address or
    a list of addresses (a group).  This is why the returned Address object has
    a 'mailboxes' attribute which treats a single address as a list of length
    one.  When you need to differentiate between to two cases, extract the single
    element, which is either a mailbox or a group token.

    """
    # The grammar gives no lookahead-friendly way to tell mailbox from
    # group (you only know at one of '@', '<', or ':'), so we simply try
    # each alternative in turn, group first.
    address = Address()
    for getter in (get_group, get_mailbox):
        try:
            token, value = getter(value)
        except errors.HeaderParseError:
            continue
        break
    else:
        raise errors.HeaderParseError(
            "expected address but found '{}'".format(value))
    address.append(token)
    return address, value
2305
def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.

    """
    address_list = AddressList()
    while value:
        try:
            token, value = get_address(value)
            address_list.append(token)
        except errors.HeaderParseError as err:
            # NOTE(review): 'err' is bound but never used.
            # Error recovery: classify the bad element as empty (obsolete)
            # or invalid, preserving any leading CFWS.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] == ',':
                    address_list.append(leader)
                    address_list.defects.append(errors.ObsoleteHeaderDefect(
                        "address-list entry with no content"))
                else:
                    token, value = get_invalid_mailbox(value, ',')
                    if leader is not None:
                        token[:0] = [leader]
                    address_list.append(Address([token]))
                    address_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid address in address-list"))
            elif value[0] == ',':
                address_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in address-list"))
            else:
                token, value = get_invalid_mailbox(value, ',')
                if leader is not None:
                    token[:0] = [leader]
                address_list.append(Address([token]))
                address_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid address in address-list"))
        if value and value[0] != ',':
            # Crap after address; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = address_list[-1][0]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',')
            mailbox.extend(token)
            address_list.defects.append(errors.InvalidHeaderDefect(
                "invalid address in address-list"))
        if value:  # Must be a , at this point.
            address_list.append(ValueTerminal(',', 'list-separator'))
            value = value[1:]
    return address_list, value
R David Murray97f43c02012-06-24 05:03:27 -04002359
#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two levels of parser routines: the get_XXX methods that get a token in
# the grammar, and parse_XXX methods that parse an entire field value.  So
# get_address_list above should really be a parse_ method, as probably should
# be get_unstructured.
#
2367
def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    Parse an entire MIME-Version field value, returning a MIMEVersion
    token.  All problems are recorded as defects; nothing is raised.
    Unparseable runs are kept as 'xtext' terminals so no input is lost.
    """
    # The [CFWS] is implicit in the RFC 2045 BNF.
    # XXX: This routine is a bit verbose, should factor out a get_int method.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
        if not value:
            mime_version.defects.append(errors.HeaderMissingRequiredValue(
                "Expected MIME version number but found only CFWS"))
    # Collect the major version digits (anything up to '.' or CFWS).
    digits = ''
    while value and value[0] != '.' and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME major version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.major = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value or value[0] != '.':
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        if value:
            # Preserve whatever unparseable text remains.
            mime_version.append(ValueTerminal(value, 'xtext'))
        return mime_version
    mime_version.append(ValueTerminal('.', 'version-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    # Collect the minor version digits (anything up to CFWS).
    digits = ''
    while value and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME minor version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.minor = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version
2435
def get_invalid_parameter(value):
    """ Read everything up to the next ';'.

    This is outside the formal grammar and exists for error recovery.
    The returned InvalidParameter TokenList acts like a Parameter, but
    its data attributes are None.

    """
    invalid_parameter = InvalidParameter()
    while value:
        if value[0] == ';':
            break
        if value[0] in PHRASE_ENDS:
            # A special that get_phrase cannot consume; keep it verbatim.
            invalid_parameter.append(ValueTerminal(value[0],
                                                   'misplaced-special'))
            value = value[1:]
        else:
            phrase, value = get_phrase(value)
            invalid_parameter.append(phrase)
    return invalid_parameter, value
2453
def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
    defects list if we find non-ttext characters.  We also register defects for
    *any* non-printables even though the RFC doesn't exclude all of them,
    because we follow the spirit of RFC 5322.

    """
    match = _non_token_end_matcher(value)
    if match is None:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    text = match.group()
    token = ValueTerminal(text, 'ttext')
    # Records defects on the token for any non-printable characters.
    _validate_xtext(token)
    return token, value[len(text):]
2472
def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    tspecials.  We also exclude tabs even though the RFC doesn't.

    The RFC implies the CFWS but is not explicit about it in the BNF.

    """
    mtoken = Token()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    ttext, value = get_ttext(value)
    mtoken.append(ttext)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    return mtoken, value
2495
def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
    token's defects list if we find non-attrtext characters.  We also register
    defects for *any* non-printables even though the RFC doesn't exclude all of
    them, because we follow the spirit of RFC 5322.

    """
    match = _non_attribute_end_matcher(value)
    if match is None:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'attrtext')
    # Records defects on the token for any non-printable characters.
    _validate_xtext(token)
    return token, value[len(text):]
2514
def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and as usual we use a
    value terminal for the actual run of characters.  The RFC equivalent of
    attrtext is the token characters, with the subtraction of '*', "'", and '%'.
    We include tab in the excluded set just as we do for token.

    """
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_attrtext(value)
    attribute.append(text)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
2537
def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    This is a special parsing routine so that we get a value that
    includes % escapes as a single string (which we decode as a single
    string later).

    """
    match = _non_extended_attribute_end_matcher(value)
    if match is None:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'extended-attrtext')
    # Records defects on the token for any non-printable characters.
    _validate_xtext(token)
    return token, value[len(text):]
2555
def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    This is like the non-extended version except we allow % characters, so that
    we can pick up an encoded value as a single string.

    """
    # XXX: should we have an ExtendedAttribute TokenList?
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_extended_attrtext(value)
    attribute.append(text)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
2577
def get_section(value):
    """ '*' digits

    The formal BNF is more complicated because leading 0s are not allowed.  We
    check for that and add a defect.  We also assume no CFWS is allowed between
    the '*' and the digits, though the RFC is not crystal clear on that.
    The caller should already have dealt with leading CFWS.

    Raises: errors.HeaderParseError if *value* does not start with '*'
    followed by at least one digit.
    """
    section = Section()
    if not value or value[0] != '*':
        raise errors.HeaderParseError("Expected section but found {}".format(
            value))
    section.append(ValueTerminal('*', 'section-marker'))
    value = value[1:]
    if not value or not value[0].isdigit():
        raise errors.HeaderParseError("Expected section number but "
                                      "found {}".format(value))
    digits = ''
    while value and value[0].isdigit():
        digits += value[0]
        value = value[1:]
    if digits[0] == '0' and digits != '0':
        # Bug fix: this previously referenced the nonexistent
        # errors.InvalidHeaderError, raising AttributeError whenever a
        # leading zero actually occurred, and the message was missing a
        # space ("numberhas").
        section.defects.append(errors.InvalidHeaderDefect(
            "section number has an invalid leading 0"))
    section.number = int(digits)
    section.append(ValueTerminal(digits, 'digits'))
    return section, value
2606
2607
def get_value(value):
    """ quoted-string / attribute

    """
    v = Value()
    if not value:
        raise errors.HeaderParseError("Expected value but found end of string")
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError("Expected value but found "
                                          "only {}".format(leader))
    # A leading '"' selects the quoted-string alternative; anything else
    # is treated as an (extended) attribute.
    getter = get_quoted_string if value[0] == '"' else get_extended_attribute
    token, value = getter(value)
    if leader is not None:
        token[:0] = [leader]
    v.append(token)
    return v, value
2629
def get_parameter(value):
    """ attribute [section] ["*"] [CFWS] "=" value

    The CFWS is implied by the RFC but not made explicit in the BNF.  This
    simplified form of the BNF from the RFC is made to conform with the RFC BNF
    through some extra checks.  We do it this way because it makes both error
    recovery and working with the resulting parse tree easier.
    """
    # It is possible CFWS would also be implicitly allowed between the section
    # and the 'extended-attribute' marker (the '*') , but we've never seen that
    # in the wild and we will therefore ignore the possibility.
    param = Parameter()
    token, value = get_attribute(value)
    param.append(token)
    if not value or value[0] == ';':
        param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
            "name ({}) but no value".format(token)))
        return param, value
    if value[0] == '*':
        # RFC 2231 section marker and/or extended-value marker.
        try:
            token, value = get_section(value)
            param.sectioned = True
            param.append(token)
        except errors.HeaderParseError:
            pass
        if not value:
            raise errors.HeaderParseError("Incomplete parameter")
        if value[0] == '*':
            param.append(ValueTerminal('*', 'extended-parameter-marker'))
            value = value[1:]
            param.extended = True
    if value[0] != '=':
        raise errors.HeaderParseError("Parameter not followed by '='")
    param.append(ValueTerminal('=', 'parameter-separator'))
    value = value[1:]
    leader = None
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        param.append(token)
    remainder = None
    appendto = param
    if param.extended and value and value[0] == '"':
        # Now for some serious hackery to handle the common invalid case of
        # double quotes around an extended value.  We also accept (with defect)
        # a value marked as encoded that isn't really.
        qstring, remainder = get_quoted_string(value)
        inner_value = qstring.stripped_value
        semi_valid = False
        if param.section_number == 0:
            if inner_value and inner_value[0] == "'":
                semi_valid = True
            else:
                token, rest = get_attrtext(inner_value)
                if rest and rest[0] == "'":
                    semi_valid = True
        else:
            try:
                token, rest = get_extended_attrtext(inner_value)
            except errors.HeaderParseError:
                # Bug fix: this was a bare 'except:', which would also
                # swallow KeyboardInterrupt/SystemExit; only a parse
                # failure from get_extended_attrtext is expected here.
                pass
            else:
                if not rest:
                    semi_valid = True
        if semi_valid:
            param.defects.append(errors.InvalidHeaderDefect(
                "Quoted string value for extended parameter is invalid"))
            param.append(qstring)
            # Reparse the quoted string's content as the extended value;
            # redirect further appends into the emptied inner token.
            for t in qstring:
                if t.token_type == 'bare-quoted-string':
                    t[:] = []
                    appendto = t
                    break
            value = inner_value
        else:
            remainder = None
            param.defects.append(errors.InvalidHeaderDefect(
                "Parameter marked as extended but appears to have a "
                "quoted string value that is non-encoded"))
    if value and value[0] == "'":
        # Value starts directly at the charset/lang delimiter.
        token = None
    else:
        token, value = get_value(value)
    if not param.extended or param.section_number > 0:
        if not value or value[0] != "'":
            # Ordinary (non-initial-extended) value; we are done.
            appendto.append(token)
            if remainder is not None:
                assert not value, value
                value = remainder
            return param, value
        param.defects.append(errors.InvalidHeaderDefect(
            "Apparent initial-extended-value but attribute "
            "was not marked as extended or was not initial section"))
    if not value:
        # Assume the charset/lang is missing and the token is the value.
        param.defects.append(errors.InvalidHeaderDefect(
            "Missing required charset/lang delimiters"))
        appendto.append(token)
        if remainder is None:
            return param, value
    else:
        if token is not None:
            for t in token:
                if t.token_type == 'extended-attrtext':
                    break
            # Bug fix: this line previously read
            # "t.token_type == 'attrtext'", a no-op comparison where an
            # assignment (relabelling the charset run as plain attrtext)
            # was clearly intended.
            t.token_type = 'attrtext'
            appendto.append(t)
            param.charset = t.value
        if value[0] != "'":
            raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                          "delimiter, but found {!r}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
        value = value[1:]
        if value and value[0] != "'":
            token, value = get_attrtext(value)
            appendto.append(token)
            param.lang = token.value
            if not value or value[0] != "'":
                raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                  "delimiter, but found {}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
        value = value[1:]
    if remainder is not None:
        # Treat the rest of value as bare quoted string content.
        v = Value()
        while value:
            if value[0] in WSP:
                token, value = get_fws(value)
            else:
                token, value = get_qcontent(value)
            v.append(token)
        token = v
    else:
        token, value = get_value(value)
    appendto.append(token)
    if remainder is not None:
        assert not value, value
        value = remainder
    return param, value
2768
def parse_mime_parameters(value):
    """ parameter *( ";" parameter )

    That BNF is meant to indicate this routine should only be called after
    finding and handling the leading ';'.  There is no corresponding rule in
    the formal RFC grammar, but it is more convenient for us for the set of
    parameters to be treated as its own TokenList.

    This is a 'parse' routine because it consumes the remaining value, but it
    would never be called to parse a full header.  Instead it is called to
    parse everything after the non-parameter value of a specific MIME header.

    """
    mime_parameters = MimeParameters()
    while value:
        try:
            token, value = get_parameter(value)
            mime_parameters.append(token)
        except errors.HeaderParseError as err:
            # NOTE(review): 'err' is bound but never used.
            # Error recovery: classify the bad element as empty or
            # invalid, preserving any leading CFWS.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
            if not value:
                mime_parameters.append(leader)
                return mime_parameters
            if value[0] == ';':
                if leader is not None:
                    mime_parameters.append(leader)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "parameter entry with no content"))
            else:
                token, value = get_invalid_parameter(value)
                if leader:
                    token[:0] = [leader]
                mime_parameters.append(token)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "invalid parameter {!r}".format(token)))
        if value and value[0] != ';':
            # Junk after the otherwise valid parameter.  Mark it as
            # invalid, but it will have a value.
            param = mime_parameters[-1]
            param.token_type = 'invalid-parameter'
            token, value = get_invalid_parameter(value)
            param.extend(token)
            mime_parameters.defects.append(errors.InvalidHeaderDefect(
                "parameter with invalid trailing text {!r}".format(token)))
        if value:
            # Must be a ';' at this point.
            mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
            value = value[1:]
    return mime_parameters
2820
def _find_mime_parameters(tokenlist, value):
    """Do our best to find the parameters in an invalid MIME header

    """
    # Scan forward to the first ';', collecting everything we pass over
    # onto *tokenlist*; if the loop exhausts the input without finding a
    # ';' there are no parameters to parse.
    while value:
        if value[0] == ';':
            break
        if value[0] in PHRASE_ENDS:
            tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            phrase, value = get_phrase(value)
            tokenlist.append(phrase)
    else:
        return
    tokenlist.append(ValueTerminal(';', 'parameter-separator'))
    tokenlist.append(parse_mime_parameters(value[1:]))
2836
def parse_content_type_header(value):
    """ maintype "/" subtype *( ";" parameter )

    The maintype and subtype are tokens.  Theoretically they could
    be checked against the official IANA list + x-token, but we
    don't do that.

    Returns a ContentType token list; syntactic problems are recorded on
    its defects list rather than raised.
    """
    # Fix: removed the unused local 'recover = False' that was assigned
    # and never read.
    ctype = ContentType()
    if not value:
        ctype.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content type specification"))
        return ctype
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content maintype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    # XXX: If we really want to follow the formal grammar we should make
    # maintype and subtype specialized TokenLists here.  Probably not worth it.
    if not value or value[0] != '/':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Invalid content type"))
        if value:
            _find_mime_parameters(ctype, value)
        return ctype
    ctype.maintype = token.value.strip().lower()
    ctype.append(ValueTerminal('/', 'content-type-separator'))
    value = value[1:]
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content subtype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    ctype.subtype = token.value.strip().lower()
    if not value:
        return ctype
    if value[0] != ';':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content type, but "
            "found {!r}".format(value)))
        # The RFC requires that a syntactically invalid content-type be treated
        # as text/plain.  Perhaps we should postel this, but we should probably
        # only do that if we were checking the subtype value against IANA.
        del ctype.maintype, ctype.subtype
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(ValueTerminal(';', 'parameter-separator'))
    ctype.append(parse_mime_parameters(value[1:]))
    return ctype
2893
def parse_content_disposition_header(value):
    """ disposition-type *( ";" parameter )

    Returns a ContentDisposition token list; syntactic problems are
    recorded on its defects list rather than raised.
    """
    disp_header = ContentDisposition()
    if not value:
        disp_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content disposition"))
        return disp_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        # Bug fix: this previously appended to 'ctype', an unbound name
        # copied from parse_content_type_header, raising NameError
        # whenever the disposition type failed to parse.
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content disposition but found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(token)
    disp_header.content_disposition = token.value.strip().lower()
    if not value:
        return disp_header
    if value[0] != ';':
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content disposition, but "
            "found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(ValueTerminal(';', 'parameter-separator'))
    disp_header.append(parse_mime_parameters(value[1:]))
    return disp_header
2923
def parse_content_transfer_encoding_header(value):
    """ mechanism

    Returns a ContentTransferEncoding token list; problems are recorded
    on its defects list rather than raised.
    """
    # We should probably validate the values, since the list is fixed.
    cte_header = ContentTransferEncoding()
    if not value:
        cte_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content transfer encoding"))
        return cte_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        # Bug fix: this previously appended to 'ctype', an unbound name
        # (NameError) copied from parse_content_type_header; the defect
        # message also had a 'trnasfer' typo.
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content transfer encoding but found {!r}".format(value)))
    else:
        cte_header.append(token)
        cte_header.cte = token.value.strip().lower()
    if not value:
        return cte_header
    while value:
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Extra text after content transfer encoding"))
        if value[0] in PHRASE_ENDS:
            cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            cte_header.append(token)
    return cte_header