"""Header value parser implementing various email-related RFC parsing rules.

The parsing methods defined in this module implement various email-related
parsing rules.  Principal among them is RFC 5322, which is the follow-on
to RFC 2822 and primarily a clarification of the former.  It also implements
RFC 2047 encoded word decoding.

RFC 5322 goes to considerable trouble to maintain backward compatibility with
RFC 822 in the parse phase, while cleaning up the structure on the generation
phase.  This parser supports correct RFC 5322 generation by tagging white space
as folding white space only when folding is allowed in the non-obsolete rule
sets.  Actually, the parser is even more generous when accepting input than RFC
5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
Where possible, deviations from the standard are annotated on the 'defects'
attribute of tokens that deviate.

The general structure of the parser follows RFC 5322, and uses its terminology
where there is a direct correspondence.  Where the implementation requires a
somewhat different structure than that used by the formal grammar, new terms
that mimic the closest existing terms are used.  Thus, it really helps to have
a copy of RFC 5322 handy when studying this code.

Input to the parser is a string that has already been unfolded according to
RFC 5322 rules.  According to the RFC this unfolding is the very first step, and
this parser leaves the unfolding step to a higher level message parser, which
will have already detected the line breaks that need unfolding while
determining the beginning and end of each header.

The output of the parser is a TokenList object, which is a list subclass.  A
TokenList is a recursive data structure.  The terminal nodes of the structure
are Terminal objects, which are subclasses of str.  These do not correspond
directly to terminal objects in the formal grammar, but are instead more
practical higher level combinations of true terminals.

All TokenList and Terminal objects have a 'value' attribute, which produces the
semantically meaningful value of that part of the parse subtree.  The value of
all whitespace tokens (no matter how many sub-tokens they may contain) is a
single space, as per the RFC rules.  This includes 'CFWS', which is herein
included in the general class of whitespace tokens.  There is one exception to
the rule that whitespace tokens are collapsed into single spaces in values: in
the value of a 'bare-quoted-string' (a quoted-string with no leading or
trailing whitespace), any whitespace that appeared between the quotation marks
is preserved in the returned value.  Note that in all Terminal strings quoted
pairs are turned into their unquoted values.

All TokenList and Terminal objects also have a string value, which attempts to
be a "canonical" representation of the RFC-compliant form of the substring that
produced the parsed subtree, including minimal use of quoted pair quoting.
Whitespace runs are not collapsed.

Comment tokens also have a 'content' attribute providing the string found
between the parens (including any nested comments) with whitespace preserved.

All TokenList and Terminal objects have a 'defects' attribute which is a
possibly empty list of all the defects found while creating the token.  Defects
may appear on any token in the tree, and a composite list of all defects in the
subtree is available through the 'all_defects' attribute of any node.  (For
Terminal nodes, x.defects == x.all_defects.)

Each object in a parse tree is called a 'token', and each has a 'token_type'
attribute that gives the name from the RFC 5322 grammar that it represents.
Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
may be produced: 'ptext'.  A 'ptext' is a string of printable ASCII characters.
It is returned in place of lists of (ctext/quoted-pair) and
(qtext/quoted-pair).

XXX: provide complete list of token types.
"""

import re
import urllib   # For urllib.parse.unquote
from string import hexdigits
from collections import OrderedDict
from operator import itemgetter
from email import _encoded_words as _ew
from email import errors
from email import utils

#
# Useful constants and functions
#

WSP = set(' \t')
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')

def quote_string(value):
    return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'

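# Added usage note (not in the original source); quote_string wraps its
# argument in DQUOTEs and backslash-escapes any backslash or DQUOTE.  Reprs
# shown are approximate:
#
#   >>> quote_string('Bob Smith')
#   '"Bob Smith"'
#   >>> quote_string('say "hi"')
#   '"say \\"hi\\""'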
#
# Accumulator for header folding
#

class _Folded:

    def __init__(self, maxlen, policy):
        self.maxlen = maxlen
        self.policy = policy
        self.lastlen = 0
        self.stickyspace = None
        self.firstline = True
        self.done = []
        self.current = []

    def newline(self):
        self.done.extend(self.current)
        self.done.append(self.policy.linesep)
        self.current.clear()
        self.lastlen = 0

    def finalize(self):
        if self.current:
            self.newline()

    def __str__(self):
        return ''.join(self.done)

    def append(self, stoken):
        self.current.append(stoken)

    def append_if_fits(self, token, stoken=None):
        if stoken is None:
            stoken = str(token)
        l = len(stoken)
        if self.stickyspace is not None:
            stickyspace_len = len(self.stickyspace)
            if self.lastlen + stickyspace_len + l <= self.maxlen:
                self.current.append(self.stickyspace)
                self.lastlen += stickyspace_len
                self.current.append(stoken)
                self.lastlen += l
                self.stickyspace = None
                self.firstline = False
                return True
            if token.has_fws:
                ws = token.pop_leading_fws()
                if ws is not None:
                    self.stickyspace += str(ws)
                    stickyspace_len += len(ws)
                token._fold(self)
                return True
            if stickyspace_len and l + 1 <= self.maxlen:
                margin = self.maxlen - l
                if 0 < margin < stickyspace_len:
                    trim = stickyspace_len - margin
                    self.current.append(self.stickyspace[:trim])
                    self.stickyspace = self.stickyspace[trim:]
                    stickyspace_len = trim
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.lastlen = l + stickyspace_len
                self.stickyspace = None
                self.firstline = False
                return True
            if not self.firstline:
                self.newline()
            self.current.append(self.stickyspace)
            self.current.append(stoken)
            self.stickyspace = None
            self.firstline = False
            return True
        if self.lastlen + l <= self.maxlen:
            self.current.append(stoken)
            self.lastlen += l
            return True
        if l < self.maxlen:
            self.newline()
            self.current.append(stoken)
            self.lastlen = l
            return True
        return False

#
# TokenList and its subclasses
#
186
187class TokenList(list):
188
189 token_type = None
190
191 def __init__(self, *args, **kw):
192 super().__init__(*args, **kw)
193 self.defects = []
194
195 def __str__(self):
196 return ''.join(str(x) for x in self)
197
198 def __repr__(self):
199 return '{}({})'.format(self.__class__.__name__,
200 super().__repr__())
201
202 @property
203 def value(self):
204 return ''.join(x.value for x in self if x.value)
205
206 @property
207 def all_defects(self):
208 return sum((x.all_defects for x in self), self.defects)
209
    #
    # Folding API
    #
    # parts():
    #
    # return a list of objects that constitute the "higher level syntactic
    # objects" specified by the RFC as the best places to fold a header line.
    # The returned objects must include leading folding white space, even if
    # this means mutating the underlying parse tree of the object.  Each object
    # is only responsible for returning *its* parts, and should not drill down
    # to any lower level except as required to meet the leading folding white
    # space constraint.
    #
    # _fold(folded):
    #
    #   folded: the result accumulator.  This is an instance of _Folded.
    #       (XXX: I haven't finished factoring this out yet, the folding code
    #       pretty much uses this as a state object.)  When the folded.current
    #       contains as much text as will fit, the _fold method should call
    #       folded.newline.
    #   folded.lastlen: the current length of the text stored in folded.current.
    #   folded.maxlen: The maximum number of characters that may appear on a
    #       folded line.  Differs from the policy setting in that "no limit" is
    #       represented by +inf, which means it can be used in the trivially
    #       logical fashion in comparisons.
    #
    # Currently no subclasses implement parts, and I think this will remain
    # true.  A subclass only needs to implement _fold when the generic version
    # isn't sufficient.  _fold will need to be implemented primarily when it is
    # possible for encoded words to appear in the specialized token-list, since
    # there is no generic algorithm that can know where exactly the encoded
    # words are allowed.  A _fold implementation is responsible for filling
    # lines in the same general way that the top level _fold does.  It may, and
    # should, call the _fold method of sub-objects in a similar fashion to that
    # of the top level _fold.
    #
    # XXX: I'm hoping it will be possible to factor the existing code further
    # to reduce redundancy and make the logic clearer.
248
249 @property
250 def parts(self):
251 klass = self.__class__
252 this = []
253 for token in self:
254 if token.startswith_fws():
255 if this:
256 yield this[0] if len(this)==1 else klass(this)
257 this.clear()
258 end_ws = token.pop_trailing_ws()
259 this.append(token)
260 if end_ws:
261 yield klass(this)
262 this = [end_ws]
263 if this:
264 yield this[0] if len(this)==1 else klass(this)
265
266 def startswith_fws(self):
267 return self[0].startswith_fws()
268
269 def pop_leading_fws(self):
270 if self[0].token_type == 'fws':
271 return self.pop(0)
272 return self[0].pop_leading_fws()
273
274 def pop_trailing_ws(self):
275 if self[-1].token_type == 'cfws':
276 return self.pop(-1)
277 return self[-1].pop_trailing_ws()
278
279 @property
280 def has_fws(self):
281 for part in self:
282 if part.has_fws:
283 return True
284 return False
285
286 def has_leading_comment(self):
287 return self[0].has_leading_comment()
288
289 @property
290 def comments(self):
291 comments = []
292 for token in self:
293 comments.extend(token.comments)
294 return comments
295
296 def fold(self, *, policy):
297 # max_line_length 0/None means no limit, ie: infinitely long.
298 maxlen = policy.max_line_length or float("+inf")
299 folded = _Folded(maxlen, policy)
300 self._fold(folded)
301 folded.finalize()
302 return str(folded)
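    # Added usage sketch (not part of the original source): this fold() is
    # normally invoked for us by the email.headerregistry/policy machinery,
    # but assuming a policy object such as email.policy.default, something
    # like get_unstructured('a few words').fold(policy=email.policy.default)
    # would return the folded text terminated by policy.linesep.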
303
304 def as_encoded_word(self, charset):
305 # This works only for things returned by 'parts', which include
306 # the leading fws, if any, that should be used.
307 res = []
308 ws = self.pop_leading_fws()
309 if ws:
310 res.append(ws)
311 trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
312 res.append(_ew.encode(str(self), charset))
313 res.append(trailer)
314 return ''.join(res)
315
316 def cte_encode(self, charset, policy):
317 res = []
318 for part in self:
319 res.append(part.cte_encode(charset, policy))
320 return ''.join(res)
321
    def _fold(self, folded):
        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
        for part in self.parts:
            tstr = str(part)
            tlen = len(tstr)
            try:
                str(part).encode(encoding)
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                       for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    # XXX: this should be a policy setting when utf8 is False.
                    charset = 'utf-8'
                tstr = part.cte_encode(charset, folded.policy)
                tlen = len(tstr)
            if folded.append_if_fits(part, tstr):
                continue
            # Peel off the leading whitespace if any and make it sticky, to
            # avoid infinite recursion.
            ws = part.pop_leading_fws()
            if ws is not None:
                folded.stickyspace = str(ws)
                if folded.append_if_fits(part):
                    continue
            if part.has_fws:
                part._fold(folded)
                continue
            # There are no fold points in this one; it is too long for a single
            # line and can't be split...we just have to put it on its own line.
            folded.append(tstr)
            folded.newline()
354
355 def pprint(self, indent=''):
356 print('\n'.join(self._pp(indent='')))
357
358 def ppstr(self, indent=''):
359 return '\n'.join(self._pp(indent=''))
360
361 def _pp(self, indent=''):
362 yield '{}{}/{}('.format(
363 indent,
364 self.__class__.__name__,
365 self.token_type)
366 for token in self:
            if not hasattr(token, '_pp'):
                yield (indent + '    !! invalid element in token '
                                        'list: {!r}'.format(token))
            else:
                yield from token._pp(indent+'    ')
        if self.defects:
373 extra = ' Defects: {}'.format(self.defects)
374 else:
375 extra = ''
376 yield '{}){}'.format(indent, extra)
377
378
379class WhiteSpaceTokenList(TokenList):
380
381 @property
382 def value(self):
383 return ' '
384
385 @property
386 def comments(self):
387 return [x.content for x in self if x.token_type=='comment']
388
389
390class UnstructuredTokenList(TokenList):
391
392 token_type = 'unstructured'
393
    def _fold(self, folded):
        last_ew = None
        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
        for part in self.parts:
            tstr = str(part)
            is_ew = False
            try:
                str(part).encode(encoding)
            except UnicodeEncodeError:
403 if any(isinstance(x, errors.UndecodableBytesDefect)
404 for x in part.all_defects):
405 charset = 'unknown-8bit'
406 else:
407 charset = 'utf-8'
408 if last_ew is not None:
409 # We've already done an EW, combine this one with it
410 # if there's room.
411 chunk = get_unstructured(
412 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
413 oldlastlen = sum(len(x) for x in folded.current[:last_ew])
414 schunk = str(chunk)
415 lchunk = len(schunk)
416 if oldlastlen + lchunk <= folded.maxlen:
417 del folded.current[last_ew:]
418 folded.append(schunk)
419 folded.lastlen = oldlastlen + lchunk
420 continue
421 tstr = part.as_encoded_word(charset)
422 is_ew = True
423 if folded.append_if_fits(part, tstr):
424 if is_ew:
425 last_ew = len(folded.current) - 1
426 continue
427 if is_ew or last_ew:
428 # It's too big to fit on the line, but since we've
429 # got encoded words we can use encoded word folding.
430 part._fold_as_ew(folded)
431 continue
432 # Peel off the leading whitespace if any and make it sticky, to
433 # avoid infinite recursion.
434 ws = part.pop_leading_fws()
435 if ws is not None:
436 folded.stickyspace = str(ws)
437 if folded.append_if_fits(part):
438 continue
439 if part.has_fws:
                part._fold(folded)
                continue
442 # It can't be split...we just have to put it on its own line.
443 folded.append(tstr)
444 folded.newline()
445 last_ew = None
446
447 def cte_encode(self, charset, policy):
448 res = []
449 last_ew = None
450 for part in self:
451 spart = str(part)
452 try:
453 spart.encode('us-ascii')
454 res.append(spart)
455 except UnicodeEncodeError:
456 if last_ew is None:
457 res.append(part.cte_encode(charset, policy))
458 last_ew = len(res)
459 else:
460 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    res.append(tl.as_encoded_word(charset))
        return ''.join(res)
463
464
465class Phrase(TokenList):
466
467 token_type = 'phrase'
468
    def _fold(self, folded):
        # As with Unstructured, we can have pure ASCII with or without
        # surrogateescape encoded bytes, or we could have unicode.  But this
        # case is more complicated, since we have to deal with the various
        # sub-token types and how they can be composed in the face of
        # unicode-that-needs-CTE-encoding, and the fact that if a token has a
        # comment, that comment becomes a barrier across which we can't
        # compose encoded words.
        last_ew = None
        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
        for part in self.parts:
480 tstr = str(part)
481 tlen = len(tstr)
482 has_ew = False
483 try:
                str(part).encode(encoding)
            except UnicodeEncodeError:
486 if any(isinstance(x, errors.UndecodableBytesDefect)
487 for x in part.all_defects):
488 charset = 'unknown-8bit'
489 else:
490 charset = 'utf-8'
491 if last_ew is not None and not part.has_leading_comment():
492 # We've already done an EW, let's see if we can combine
493 # this one with it. The last_ew logic ensures that all we
494 # have at this point is atoms, no comments or quoted
495 # strings. So we can treat the text between the last
496 # encoded word and the content of this token as
497 # unstructured text, and things will work correctly. But
498 # we have to strip off any trailing comment on this token
499 # first, and if it is a quoted string we have to pull out
500 # the content (we're encoding it, so it no longer needs to
501 # be quoted).
502 if part[-1].token_type == 'cfws' and part.comments:
503 remainder = part.pop(-1)
504 else:
505 remainder = ''
506 for i, token in enumerate(part):
507 if token.token_type == 'bare-quoted-string':
508 part[i] = UnstructuredTokenList(token[:])
509 chunk = get_unstructured(
510 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
511 schunk = str(chunk)
512 lchunk = len(schunk)
513 if last_ew + lchunk <= folded.maxlen:
514 del folded.current[last_ew:]
515 folded.append(schunk)
516 folded.lastlen = sum(len(x) for x in folded.current)
517 continue
518 tstr = part.as_encoded_word(charset)
519 tlen = len(tstr)
520 has_ew = True
521 if folded.append_if_fits(part, tstr):
522 if has_ew and not part.comments:
523 last_ew = len(folded.current) - 1
524 elif part.comments or part.token_type == 'quoted-string':
525 # If a comment is involved we can't combine EWs. And if a
526 # quoted string is involved, it's not worth the effort to
527 # try to combine them.
528 last_ew = None
529 continue
530 part._fold(folded)
531
532 def cte_encode(self, charset, policy):
533 res = []
534 last_ew = None
535 is_ew = False
536 for part in self:
537 spart = str(part)
538 try:
539 spart.encode('us-ascii')
540 res.append(spart)
541 except UnicodeEncodeError:
542 is_ew = True
543 if last_ew is None:
544 if not part.comments:
545 last_ew = len(res)
546 res.append(part.cte_encode(charset, policy))
547 elif not part.has_leading_comment():
548 if part[-1].token_type == 'cfws' and part.comments:
549 remainder = part.pop(-1)
550 else:
551 remainder = ''
552 for i, token in enumerate(part):
553 if token.token_type == 'bare-quoted-string':
554 part[i] = UnstructuredTokenList(token[:])
555 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
556 res[last_ew:] = [tl.as_encoded_word(charset)]
557 if part.comments or (not is_ew and part.token_type == 'quoted-string'):
558 last_ew = None
559 return ''.join(res)
560
561class Word(TokenList):
562
563 token_type = 'word'
564
565
566class CFWSList(WhiteSpaceTokenList):
567
568 token_type = 'cfws'
569
570 def has_leading_comment(self):
571 return bool(self.comments)
572
573
574class Atom(TokenList):
575
576 token_type = 'atom'
577
578
class Token(TokenList):
580
581 token_type = 'token'
582
583
class EncodedWord(TokenList):
585
586 token_type = 'encoded-word'
587 cte = None
588 charset = None
589 lang = None
590
591 @property
592 def encoded(self):
593 if self.cte is not None:
594 return self.cte
595 _ew.encode(str(self), self.charset)
596
597
598
599class QuotedString(TokenList):
600
601 token_type = 'quoted-string'
602
603 @property
604 def content(self):
605 for x in self:
606 if x.token_type == 'bare-quoted-string':
607 return x.value
608
609 @property
610 def quoted_value(self):
611 res = []
612 for x in self:
613 if x.token_type == 'bare-quoted-string':
614 res.append(str(x))
615 else:
616 res.append(x.value)
617 return ''.join(res)
618
    @property
    def stripped_value(self):
        for token in self:
            if token.token_type == 'bare-quoted-string':
                return token.value


class BareQuotedString(QuotedString):
627
628 token_type = 'bare-quoted-string'
629
630 def __str__(self):
        return quote_string(''.join(str(x) for x in self))

    @property
634 def value(self):
635 return ''.join(str(x) for x in self)
636
637
638class Comment(WhiteSpaceTokenList):
639
640 token_type = 'comment'
641
642 def __str__(self):
643 return ''.join(sum([
644 ["("],
645 [self.quote(x) for x in self],
646 [")"],
647 ], []))
648
649 def quote(self, value):
650 if value.token_type == 'comment':
651 return str(value)
652 return str(value).replace('\\', '\\\\').replace(
            '(', r'\(').replace(
            ')', r'\)')

    @property
657 def content(self):
658 return ''.join(str(x) for x in self)
659
660 @property
661 def comments(self):
662 return [self.content]
663
664class AddressList(TokenList):
665
666 token_type = 'address-list'
667
668 @property
669 def addresses(self):
670 return [x for x in self if x.token_type=='address']
671
672 @property
673 def mailboxes(self):
674 return sum((x.mailboxes
675 for x in self if x.token_type=='address'), [])
676
677 @property
678 def all_mailboxes(self):
679 return sum((x.all_mailboxes
680 for x in self if x.token_type=='address'), [])
681
682
683class Address(TokenList):
684
685 token_type = 'address'
686
687 @property
688 def display_name(self):
689 if self[0].token_type == 'group':
690 return self[0].display_name
691
692 @property
693 def mailboxes(self):
694 if self[0].token_type == 'mailbox':
695 return [self[0]]
696 elif self[0].token_type == 'invalid-mailbox':
697 return []
698 return self[0].mailboxes
699
700 @property
701 def all_mailboxes(self):
702 if self[0].token_type == 'mailbox':
703 return [self[0]]
704 elif self[0].token_type == 'invalid-mailbox':
705 return [self[0]]
706 return self[0].all_mailboxes
707
708class MailboxList(TokenList):
709
710 token_type = 'mailbox-list'
711
712 @property
713 def mailboxes(self):
714 return [x for x in self if x.token_type=='mailbox']
715
716 @property
717 def all_mailboxes(self):
718 return [x for x in self
719 if x.token_type in ('mailbox', 'invalid-mailbox')]
720
721
722class GroupList(TokenList):
723
724 token_type = 'group-list'
725
726 @property
727 def mailboxes(self):
728 if not self or self[0].token_type != 'mailbox-list':
729 return []
730 return self[0].mailboxes
731
732 @property
733 def all_mailboxes(self):
734 if not self or self[0].token_type != 'mailbox-list':
735 return []
736 return self[0].all_mailboxes
737
738
739class Group(TokenList):
740
741 token_type = "group"
742
743 @property
744 def mailboxes(self):
745 if self[2].token_type != 'group-list':
746 return []
747 return self[2].mailboxes
748
749 @property
750 def all_mailboxes(self):
751 if self[2].token_type != 'group-list':
752 return []
753 return self[2].all_mailboxes
754
755 @property
756 def display_name(self):
757 return self[0].display_name
758
759
760class NameAddr(TokenList):
761
762 token_type = 'name-addr'
763
764 @property
765 def display_name(self):
766 if len(self) == 1:
767 return None
768 return self[0].display_name
769
770 @property
771 def local_part(self):
772 return self[-1].local_part
773
774 @property
775 def domain(self):
776 return self[-1].domain
777
778 @property
779 def route(self):
780 return self[-1].route
781
782 @property
783 def addr_spec(self):
784 return self[-1].addr_spec
785
786
787class AngleAddr(TokenList):
788
789 token_type = 'angle-addr'
790
791 @property
792 def local_part(self):
793 for x in self:
794 if x.token_type == 'addr-spec':
795 return x.local_part
796
797 @property
798 def domain(self):
799 for x in self:
800 if x.token_type == 'addr-spec':
801 return x.domain
802
803 @property
804 def route(self):
805 for x in self:
806 if x.token_type == 'obs-route':
807 return x.domains
808
809 @property
810 def addr_spec(self):
811 for x in self:
812 if x.token_type == 'addr-spec':
813 return x.addr_spec
        else:
            return '<>'


class ObsRoute(TokenList):
819
820 token_type = 'obs-route'
821
822 @property
823 def domains(self):
824 return [x.domain for x in self if x.token_type == 'domain']
825
826
827class Mailbox(TokenList):
828
829 token_type = 'mailbox'
830
831 @property
832 def display_name(self):
833 if self[0].token_type == 'name-addr':
834 return self[0].display_name
835
836 @property
837 def local_part(self):
838 return self[0].local_part
839
840 @property
841 def domain(self):
842 return self[0].domain
843
844 @property
845 def route(self):
846 if self[0].token_type == 'name-addr':
847 return self[0].route
848
849 @property
850 def addr_spec(self):
851 return self[0].addr_spec
852
853
854class InvalidMailbox(TokenList):
855
856 token_type = 'invalid-mailbox'
857
858 @property
859 def display_name(self):
860 return None
861
862 local_part = domain = route = addr_spec = display_name
863
864
865class Domain(TokenList):
866
867 token_type = 'domain'
868
869 @property
870 def domain(self):
871 return ''.join(super().value.split())
872
873
874class DotAtom(TokenList):
875
876 token_type = 'dot-atom'
877
878
879class DotAtomText(TokenList):
880
881 token_type = 'dot-atom-text'
882
883
884class AddrSpec(TokenList):
885
886 token_type = 'addr-spec'
887
888 @property
889 def local_part(self):
890 return self[0].local_part
891
892 @property
893 def domain(self):
894 if len(self) < 3:
895 return None
896 return self[-1].domain
897
898 @property
899 def value(self):
900 if len(self) < 3:
901 return self[0].value
902 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
903
904 @property
905 def addr_spec(self):
906 nameset = set(self.local_part)
907 if len(nameset) > len(nameset-DOT_ATOM_ENDS):
908 lp = quote_string(self.local_part)
909 else:
910 lp = self.local_part
911 if self.domain is not None:
912 return lp + '@' + self.domain
913 return lp
914
915
916class ObsLocalPart(TokenList):
917
918 token_type = 'obs-local-part'
919
920
921class DisplayName(Phrase):
922
923 token_type = 'display-name'
924
925 @property
926 def display_name(self):
927 res = TokenList(self)
928 if res[0].token_type == 'cfws':
929 res.pop(0)
930 else:
931 if res[0][0].token_type == 'cfws':
932 res[0] = TokenList(res[0][1:])
933 if res[-1].token_type == 'cfws':
934 res.pop()
935 else:
936 if res[-1][-1].token_type == 'cfws':
937 res[-1] = TokenList(res[-1][:-1])
938 return res.value
939
940 @property
941 def value(self):
942 quote = False
943 if self.defects:
944 quote = True
945 else:
946 for x in self:
947 if x.token_type == 'quoted-string':
948 quote = True
949 if quote:
950 pre = post = ''
951 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
952 pre = ' '
953 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
954 post = ' '
955 return pre+quote_string(self.display_name)+post
956 else:
957 return super().value
958
959
960class LocalPart(TokenList):
961
962 token_type = 'local-part'
963
964 @property
965 def value(self):
966 if self[0].token_type == "quoted-string":
967 return self[0].quoted_value
968 else:
969 return self[0].value
970
971 @property
972 def local_part(self):
973 # Strip whitespace from front, back, and around dots.
974 res = [DOT]
975 last = DOT
976 last_is_tl = False
977 for tok in self[0] + [DOT]:
978 if tok.token_type == 'cfws':
979 continue
980 if (last_is_tl and tok.token_type == 'dot' and
981 last[-1].token_type == 'cfws'):
982 res[-1] = TokenList(last[:-1])
983 is_tl = isinstance(tok, TokenList)
984 if (is_tl and last.token_type == 'dot' and
985 tok[0].token_type == 'cfws'):
986 res.append(TokenList(tok[1:]))
987 else:
988 res.append(tok)
989 last = res[-1]
990 last_is_tl = is_tl
991 res = TokenList(res[1:-1])
992 return res.value
993
994
995class DomainLiteral(TokenList):
996
997 token_type = 'domain-literal'
998
999 @property
1000 def domain(self):
1001 return ''.join(super().value.split())
1002
1003 @property
1004 def ip(self):
1005 for x in self:
1006 if x.token_type == 'ptext':
1007 return x.value
1008
1009
class MIMEVersion(TokenList):
1011
1012 token_type = 'mime-version'
1013 major = None
1014 minor = None
1015
1016
1017class Parameter(TokenList):
1018
1019 token_type = 'parameter'
1020 sectioned = False
1021 extended = False
1022 charset = 'us-ascii'
1023
1024 @property
1025 def section_number(self):
1026 # Because the first token, the attribute (name) eats CFWS, the second
1027 # token is always the section if there is one.
1028 return self[1].number if self.sectioned else 0
1029
1030 @property
1031 def param_value(self):
1032 # This is part of the "handle quoted extended parameters" hack.
1033 for token in self:
1034 if token.token_type == 'value':
1035 return token.stripped_value
1036 if token.token_type == 'quoted-string':
1037 for token in token:
1038 if token.token_type == 'bare-quoted-string':
1039 for token in token:
1040 if token.token_type == 'value':
1041 return token.stripped_value
1042 return ''
1043
1044
1045class InvalidParameter(Parameter):
1046
1047 token_type = 'invalid-parameter'
1048
1049
1050class Attribute(TokenList):
1051
1052 token_type = 'attribute'
1053
1054 @property
1055 def stripped_value(self):
1056 for token in self:
1057 if token.token_type.endswith('attrtext'):
1058 return token.value
1059
1060class Section(TokenList):
1061
1062 token_type = 'section'
1063 number = None
1064
1065
1066class Value(TokenList):
1067
1068 token_type = 'value'
1069
1070 @property
1071 def stripped_value(self):
1072 token = self[0]
1073 if token.token_type == 'cfws':
1074 token = self[1]
1075 if token.token_type.endswith(
1076 ('quoted-string', 'attribute', 'extended-attribute')):
1077 return token.stripped_value
1078 return self.value
1079
1080
1081class MimeParameters(TokenList):
1082
1083 token_type = 'mime-parameters'
1084
1085 @property
1086 def params(self):
1087 # The RFC specifically states that the ordering of parameters is not
1088 # guaranteed and may be reordered by the transport layer. So we have
1089 # to assume the RFC 2231 pieces can come in any order. However, we
1090 # output them in the order that we first see a given name, which gives
1091 # us a stable __str__.
1092 params = OrderedDict()
1093 for token in self:
1094 if not token.token_type.endswith('parameter'):
1095 continue
1096 if token[0].token_type != 'attribute':
1097 continue
1098 name = token[0].value.strip()
1099 if name not in params:
1100 params[name] = []
1101 params[name].append((token.section_number, token))
1102 for name, parts in params.items():
            parts = sorted(parts, key=itemgetter(0))
1104 first_param = parts[0][1]
1105 charset = first_param.charset
1106 # Our arbitrary error recovery is to ignore duplicate parameters,
1107 # to use appearance order if there are duplicate rfc 2231 parts,
1108 # and to ignore gaps. This mimics the error recovery of get_param.
1109 if not first_param.extended and len(parts) > 1:
1110 if parts[1][0] == 0:
1111 parts[1][1].defects.append(errors.InvalidHeaderDefect(
1112 'duplicate parameter name; duplicate(s) ignored'))
1113 parts = parts[:1]
1114 # Else assume the *0* was missing...note that this is different
1115 # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
1121 # duplicate extended parameter and ignore the second one
1122 # seen. But we're not doing that. The old code didn't.
1123 if not param.extended:
1124 param.defects.append(errors.InvalidHeaderDefect(
1125 'duplicate parameter name; duplicate ignored'))
1126 continue
1127 else:
1128 param.defects.append(errors.InvalidHeaderDefect(
1129 "inconsistent RFC2231 parameter numbering"))
1130 i += 1
                value = param.param_value
1132 if param.extended:
1133 try:
1134 value = urllib.parse.unquote_to_bytes(value)
1135 except UnicodeEncodeError:
1136 # source had surrogate escaped bytes. What we do now
1137 # is a bit of an open question. I'm not sure this is
1138 # the best choice, but it is what the old algorithm did
1139 value = urllib.parse.unquote(value, encoding='latin-1')
1140 else:
1141 try:
1142 value = value.decode(charset, 'surrogateescape')
1143 except LookupError:
1144 # XXX: there should really be a custom defect for
1145 # unknown character set to make it easy to find,
1146 # because otherwise unknown charset is a silent
1147 # failure.
1148 value = value.decode('us-ascii', 'surrogateescape')
1149 if utils._has_surrogates(value):
1150 param.defects.append(errors.UndecodableBytesDefect())
1151 value_parts.append(value)
1152 value = ''.join(value_parts)
1153 yield name, value
1154
1155 def __str__(self):
1156 params = []
1157 for name, value in self.params:
1158 if value:
1159 params.append('{}={}'.format(name, quote_string(value)))
1160 else:
1161 params.append(name)
1162 params = '; '.join(params)
1163 return ' ' + params if params else ''
1164
1165
1166class ParameterizedHeaderValue(TokenList):
1167
1168 @property
1169 def params(self):
1170 for token in reversed(self):
1171 if token.token_type == 'mime-parameters':
1172 return token.params
1173 return {}
1174
1175 @property
1176 def parts(self):
1177 if self and self[-1].token_type == 'mime-parameters':
1178 # We don't want to start a new line if all of the params don't fit
1179 # after the value, so unwrap the parameter list.
1180 return TokenList(self[:-1] + self[-1])
1181 return TokenList(self).parts
1182
1183
1184class ContentType(ParameterizedHeaderValue):
1185
1186 token_type = 'content-type'
1187 maintype = 'text'
1188 subtype = 'plain'
1189
1190
1191class ContentDisposition(ParameterizedHeaderValue):
1192
1193 token_type = 'content-disposition'
1194 content_disposition = None
1195
1196
1197class ContentTransferEncoding(TokenList):
1198
1199 token_type = 'content-transfer-encoding'
1200 cte = '7bit'
1201
1202
class HeaderLabel(TokenList):
1204
1205 token_type = 'header-label'
1206
1207
1208class Header(TokenList):
1209
1210 token_type = 'header'
1211
1212 def _fold(self, folded):
1213 folded.append(str(self.pop(0)))
1214 folded.lastlen = len(folded.current[0])
1215 # The first line of the header is different from all others: we don't
1216 # want to start a new object on a new line if it has any fold points in
1217 # it that would allow part of it to be on the first header line.
1218 # Further, if the first fold point would fit on the new line, we want
1219 # to do that, but if it doesn't we want to put it on the first line.
1220 # Folded supports this via the stickyspace attribute. If this
1221 # attribute is not None, it does the special handling.
1222 folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
1223 rest = self.pop(0)
1224 if self:
1225 raise ValueError("Malformed Header token list")
1226 rest._fold(folded)
1227
1228
1229#
1230# Terminal classes and instances
1231#
1232
1233class Terminal(str):
1234
1235 def __new__(cls, value, token_type):
1236 self = super().__new__(cls, value)
1237 self.token_type = token_type
1238 self.defects = []
1239 return self
1240
1241 def __repr__(self):
1242 return "{}({})".format(self.__class__.__name__, super().__repr__())
1243
1244 @property
1245 def all_defects(self):
1246 return list(self.defects)
1247
1248 def _pp(self, indent=''):
1249 return ["{}{}/{}({}){}".format(
1250 indent,
1251 self.__class__.__name__,
1252 self.token_type,
1253 super().__repr__(),
1254 '' if not self.defects else ' {}'.format(self.defects),
1255 )]
1256
1257 def cte_encode(self, charset, policy):
1258 value = str(self)
1259 try:
1260 value.encode('us-ascii')
1261 return value
1262 except UnicodeEncodeError:
1263 return _ew.encode(value, charset)
1264
1265 def pop_trailing_ws(self):
1266 # This terminates the recursion.
1267 return None
1268
1269 def pop_leading_fws(self):
1270 # This terminates the recursion.
1271 return None
1272
1273 @property
1274 def comments(self):
1275 return []
1276
1277 def has_leading_comment(self):
1278 return False
1279
1280 def __getnewargs__(self):
1281 return(str(self), self.token_type)
1282
1283
1284class WhiteSpaceTerminal(Terminal):
1285
1286 @property
1287 def value(self):
1288 return ' '
1289
1290 def startswith_fws(self):
1291 return True
1292
1293 has_fws = True
1294
1295
1296class ValueTerminal(Terminal):
1297
1298 @property
1299 def value(self):
1300 return self
1301
1302 def startswith_fws(self):
1303 return False
1304
1305 has_fws = False
1306
1307 def as_encoded_word(self, charset):
1308 return _ew.encode(str(self), charset)
1309
1310
1311class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
1312
1313 @property
1314 def value(self):
1315 return ''
1316
1317 @property
1318 def encoded(self):
1319 return self[:]
1320
1321 def __str__(self):
1322 return ''
1323
1324 has_fws = True
1325
1326
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')

#
# Parser
#

# Parse strings according to RFC822/2047/2822/5322 rules.
#
# This is a stateless parser.  Each get_XXX function accepts a string and
# returns either a Terminal or a TokenList representing the RFC object named
# by the method and a string containing the remaining unparsed characters
# from the input.  Thus a parser method consumes the next syntactic construct
# of a given type and returns a token representing the construct plus the
# unparsed remainder of the input string.
#
# For example, if the first element of a structured header is a 'phrase',
# then:
#
#     phrase, value = get_phrase(value)
#
# returns the complete phrase from the start of the string value, plus any
# characters left in the string after the phrase is removed.

_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
        '\\','\\\\').replace(']',r'\]'))).match
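# Added illustrative sketch of the get_XXX contract described above (not part
# of the original source); exact token nesting and reprs are approximate:
#
#   >>> atom, rest = get_atom(' (a comment) foobar  baz')
#   >>> str(atom)                  # the consumed substring, canonical form
#   ' (a comment) foobar  '
#   >>> atom.value, atom.comments  # semantic value, extracted comment
#   (' foobar ', ['a comment'])
#   >>> rest                       # unparsed remainder returned to the caller
#   'baz'
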
1367def _validate_xtext(xtext):
1368 """If input token contains ASCII non-printables, register a defect."""
1369
1370 non_printables = _non_printable_finder(xtext)
1371 if non_printables:
1372 xtext.defects.append(errors.NonPrintableDefect(non_printables))
1373 if utils._has_surrogates(xtext):
1374 xtext.defects.append(errors.UndecodableBytesDefect(
1375 "Non-ASCII characters found in header token"))
1376
1377def _get_ptext_to_endchars(value, endchars):
1378 """Scan printables/quoted-pairs until endchars and return unquoted ptext.
1379
1380 This function turns a run of qcontent, ccontent-without-comments, or
1381 dtext-with-quoted-printables into a single string by unquoting any
1382 quoted printables. It returns the string, the remaining value, and
1383 a flag that is True iff there were any quoted printables decoded.
1384
1385 """
1386 fragment, *remainder = _wsp_splitter(value, 1)
1387 vchars = []
1388 escape = False
1389 had_qp = False
1390 for pos in range(len(fragment)):
1391 if fragment[pos] == '\\':
1392 if escape:
1393 escape = False
1394 had_qp = True
1395 else:
1396 escape = True
1397 continue
1398 if escape:
1399 escape = False
1400 elif fragment[pos] in endchars:
1401 break
1402 vchars.append(fragment[pos])
1403 else:
1404 pos = pos + 1
1405 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1406
R David Murray0b6f6c82012-05-25 18:42:14 -04001407def get_fws(value):
1408 """FWS = 1*WSP
1409
1410 This isn't the RFC definition. We're using fws to represent tokens where
1411 folding can be done, but when we are parsing the *un*folding has already
1412 been done so we don't need to watch out for CRLF.
1413
1414 """
1415 newvalue = value.lstrip()
1416 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
1417 return fws, newvalue
1418
1419def get_encoded_word(value):
1420 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
1421
1422 """
1423 ew = EncodedWord()
1424 if not value.startswith('=?'):
1425 raise errors.HeaderParseError(
1426 "expected encoded word but found {}".format(value))
1427 tok, *remainder = value[2:].split('?=', 1)
1428 if tok == value[2:]:
1429 raise errors.HeaderParseError(
1430 "expected encoded word but found {}".format(value))
1431 remstr = ''.join(remainder)
    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
1435 tok = tok + '?=' + rest
1436 if len(tok.split()) > 1:
1437 ew.defects.append(errors.InvalidHeaderDefect(
1438 "whitespace inside encoded word"))
1439 ew.cte = value
1440 value = ''.join(remainder)
1441 try:
1442 text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
1443 except ValueError:
1444 raise errors.HeaderParseError(
1445 "encoded word format invalid: '{}'".format(ew.cte))
1446 ew.charset = charset
1447 ew.lang = lang
1448 ew.defects.extend(defects)
1449 while text:
1450 if text[0] in WSP:
1451 token, text = get_fws(text)
1452 ew.append(token)
1453 continue
1454 chars, *remainder = _wsp_splitter(text, 1)
1455 vtext = ValueTerminal(chars, 'vtext')
1456 _validate_xtext(vtext)
1457 ew.append(vtext)
1458 text = ''.join(remainder)
1459 return ew, value
1460
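# Added example for get_encoded_word above (not part of the original source);
# output is shown approximately:
#
#   >>> ew, rest = get_encoded_word('=?utf-8?q?caf=C3=A9?= tail')
#   >>> ew.charset, str(ew), rest
#   ('utf-8', 'café', ' tail')
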
1461def get_unstructured(value):
1462 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
1463 obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
1464 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
1465
1466 obs-NO-WS-CTL is control characters except WSP/CR/LF.
1467
1468 So, basically, we have printable runs, plus control characters or nulls in
1469 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
1470 obsolete syntax in its specification, but requires whitespace on either
1471 side of the encoded words, I can see no reason to need to separate the
1472 non-printable-non-whitespace from the printable runs if they occur, so we
1473 parse this into xtext tokens separated by WSP tokens.
1474
1475 Because an 'unstructured' value must by definition constitute the entire
1476 value, this 'get' routine does not return a remaining value, only the
1477 parsed TokenList.
1478
1479 """
1480 # XXX: but what about bare CR and LF? They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
1485 while value:
1486 if value[0] in WSP:
1487 token, value = get_fws(value)
1488 unstructured.append(token)
1489 continue
1490 if value.startswith('=?'):
1491 try:
1492 token, value = get_encoded_word(value)
1493 except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
1497 else:
1498 have_ws = True
1499 if len(unstructured) > 0:
1500 if unstructured[-1].token_type != 'fws':
1501 unstructured.defects.append(errors.InvalidHeaderDefect(
1502 "missing whitespace before encoded word"))
1503 have_ws = False
1504 if have_ws and len(unstructured) > 1:
1505 if unstructured[-2].token_type == 'encoded-word':
1506 unstructured[-1] = EWWhiteSpaceTerminal(
1507 unstructured[-1], 'fws')
1508 unstructured.append(token)
1509 continue
1510 tok, *remainder = _wsp_splitter(value, 1)
1511 vtext = ValueTerminal(tok, 'vtext')
1512 _validate_xtext(vtext)
1513 unstructured.append(vtext)
1514 value = ''.join(remainder)
1515 return unstructured
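
# Added example for get_unstructured above (not part of the original source);
# the decoded value is the point, exact token structure is abbreviated:
#
#   >>> get_unstructured('Hello =?utf-8?q?W=C3=B6rld?=').value
#   'Hello Wörld'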
1516
1517def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext, since we are handling nested comments in comment
    and unquoting quoted-pairs here.  We allow anything except the '()'
    characters, but if we find any ASCII other than the RFC defined printable
    ASCII, a NonPrintableDefect is added to the token's defects list.  Since
    quoted pairs are converted to their unquoted values, what is returned is
    a 'ptext' token.  In this case it is a WhiteSpaceTerminal, so its value
    is ' '.
1527
1528 """
1529 ptext, value, _ = _get_ptext_to_endchars(value, '()')
1530 ptext = WhiteSpaceTerminal(ptext, 'ptext')
1531 _validate_xtext(ptext)
1532 return ptext, value
1533
1534def get_qcontent(value):
1535 """qcontent = qtext / quoted-pair
1536
1537 We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to their
1540 unquoted values, so what is returned is a 'ptext' token. In this case it
1541 is a ValueTerminal.
1542
1543 """
1544 ptext, value, _ = _get_ptext_to_endchars(value, '"')
1545 ptext = ValueTerminal(ptext, 'ptext')
1546 _validate_xtext(ptext)
1547 return ptext, value
1548
1549def get_atext(value):
1550 """atext = <matches _atext_matcher>
1551
1552 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
1553 the token's defects list if we find non-atext characters.
1554 """
1555 m = _non_atom_end_matcher(value)
1556 if not m:
1557 raise errors.HeaderParseError(
1558 "expected atext but found '{}'".format(value))
1559 atext = m.group()
1560 value = value[len(atext):]
1561 atext = ValueTerminal(atext, 'atext')
1562 _validate_xtext(atext)
1563 return atext, value
1564
1565def get_bare_quoted_string(value):
1566 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
1567
1568 A quoted-string without the leading or trailing white space. Its
1569 value is the text between the quote marks, with whitespace
1570 preserved and quoted pairs decoded.
1571 """
1572 if value[0] != '"':
1573 raise errors.HeaderParseError(
1574 "expected '\"' but found '{}'".format(value))
1575 bare_quoted_string = BareQuotedString()
1576 value = value[1:]
1577 while value and value[0] != '"':
1578 if value[0] in WSP:
1579 token, value = get_fws(value)
        elif value[:2] == '=?':
1581 try:
1582 token, value = get_encoded_word(value)
1583 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1584 "encoded word inside quoted string"))
1585 except errors.HeaderParseError:
1586 token, value = get_qcontent(value)
        else:
1588 token, value = get_qcontent(value)
1589 bare_quoted_string.append(token)
1590 if not value:
1591 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1592 "end of header inside quoted string"))
1593 return bare_quoted_string, value
1594 return bare_quoted_string, value[1:]
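
# Added example for get_bare_quoted_string above (not part of the original
# source); note that, unlike other whitespace tokens, internal whitespace is
# preserved in the value:
#
#   >>> bqs, rest = get_bare_quoted_string('"two  spaces" x')
#   >>> bqs.value, rest
#   ('two  spaces', ' x')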
1595
1596def get_comment(value):
1597 """comment = "(" *([FWS] ccontent) [FWS] ")"
1598 ccontent = ctext / quoted-pair / comment
1599
1600 We handle nested comments here, and quoted-pair in our qp-ctext routine.
1601 """
1602 if value and value[0] != '(':
1603 raise errors.HeaderParseError(
1604 "expected '(' but found '{}'".format(value))
1605 comment = Comment()
1606 value = value[1:]
1607 while value and value[0] != ")":
1608 if value[0] in WSP:
1609 token, value = get_fws(value)
1610 elif value[0] == '(':
1611 token, value = get_comment(value)
1612 else:
1613 token, value = get_qp_ctext(value)
1614 comment.append(token)
1615 if not value:
1616 comment.defects.append(errors.InvalidHeaderDefect(
1617 "end of header inside comment"))
1618 return comment, value
1619 return comment, value[1:]
1620
1621def get_cfws(value):
1622 """CFWS = (1*([FWS] comment) [FWS]) / FWS
1623
1624 """
1625 cfws = CFWSList()
1626 while value and value[0] in CFWS_LEADER:
1627 if value[0] in WSP:
1628 token, value = get_fws(value)
1629 else:
1630 token, value = get_comment(value)
1631 cfws.append(token)
1632 return cfws, value
1633
1634def get_quoted_string(value):
1635 """quoted-string = [CFWS] <bare-quoted-string> [CFWS]
1636
1637 'bare-quoted-string' is an intermediate class defined by this
1638 parser and not by the RFC grammar. It is the quoted string
1639 without any attached CFWS.
1640 """
1641 quoted_string = QuotedString()
1642 if value and value[0] in CFWS_LEADER:
1643 token, value = get_cfws(value)
1644 quoted_string.append(token)
1645 token, value = get_bare_quoted_string(value)
1646 quoted_string.append(token)
1647 if value and value[0] in CFWS_LEADER:
1648 token, value = get_cfws(value)
1649 quoted_string.append(token)
1650 return quoted_string, value
1651
1652def get_atom(value):
1653 """atom = [CFWS] 1*atext [CFWS]
1654
    An atom could be an rfc2047 encoded word.
    """
1657 atom = Atom()
1658 if value and value[0] in CFWS_LEADER:
1659 token, value = get_cfws(value)
1660 atom.append(token)
1661 if value and value[0] in ATOM_ENDS:
1662 raise errors.HeaderParseError(
1663 "expected atom but found '{}'".format(value))
    if value.startswith('=?'):
1665 try:
1666 token, value = get_encoded_word(value)
1667 except errors.HeaderParseError:
1668 # XXX: need to figure out how to register defects when
1669 # appropriate here.
1670 token, value = get_atext(value)
1671 else:
1672 token, value = get_atext(value)
    atom.append(token)
1674 if value and value[0] in CFWS_LEADER:
1675 token, value = get_cfws(value)
1676 atom.append(token)
1677 return atom, value
1678
1679def get_dot_atom_text(value):
1680 """ dot-text = 1*atext *("." 1*atext)
1681
1682 """
1683 dot_atom_text = DotAtomText()
1684 if not value or value[0] in ATOM_ENDS:
1685 raise errors.HeaderParseError("expected atom at a start of "
1686 "dot-atom-text but found '{}'".format(value))
1687 while value and value[0] not in ATOM_ENDS:
1688 token, value = get_atext(value)
1689 dot_atom_text.append(token)
1690 if value and value[0] == '.':
1691 dot_atom_text.append(DOT)
1692 value = value[1:]
1693 if dot_atom_text[-1] is DOT:
1694 raise errors.HeaderParseError("expected atom at end of dot-atom-text "
1695 "but found '{}'".format('.'+value))
1696 return dot_atom_text, value
1697
1698def get_dot_atom(value):
1699 """ dot-atom = [CFWS] dot-atom-text [CFWS]
1700
    Any place we can have a dot atom, we could instead have an rfc2047 encoded
    word.
    """
1704 dot_atom = DotAtom()
1705 if value[0] in CFWS_LEADER:
1706 token, value = get_cfws(value)
1707 dot_atom.append(token)
    if value.startswith('=?'):
1709 try:
1710 token, value = get_encoded_word(value)
1711 except errors.HeaderParseError:
1712 # XXX: need to figure out how to register defects when
1713 # appropriate here.
1714 token, value = get_dot_atom_text(value)
1715 else:
1716 token, value = get_dot_atom_text(value)
    dot_atom.append(token)
1718 if value and value[0] in CFWS_LEADER:
1719 token, value = get_cfws(value)
1720 dot_atom.append(token)
1721 return dot_atom, value
1722
1723def get_word(value):
1724 """word = atom / quoted-string
1725
1726 Either atom or quoted-string may start with CFWS. We have to peel off this
1727 CFWS first to determine which type of word to parse. Afterward we splice
1728 the leading CFWS, if any, into the parsed sub-token.
1729
1730 If neither an atom or a quoted-string is found before the next special, a
1731 HeaderParseError is raised.
1732
1733 The token returned is either an Atom or a QuotedString, as appropriate.
1734 This means the 'word' level of the formal grammar is not represented in the
1735 parse tree; this is because having that extra layer when manipulating the
1736 parse tree is more confusing than it is helpful.
1737
1738 """
1739 if value[0] in CFWS_LEADER:
1740 leader, value = get_cfws(value)
1741 else:
1742 leader = None
1743 if value[0]=='"':
1744 token, value = get_quoted_string(value)
1745 elif value[0] in SPECIALS:
1746 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
1747 "but found '{}'".format(value))
1748 else:
1749 token, value = get_atom(value)
1750 if leader is not None:
1751 token[:0] = [leader]
1752 return token, value
1753
1754def get_phrase(value):
1755 """ phrase = 1*word / obs-phrase
1756 obs-phrase = word *(word / "." / CFWS)
1757
1758 This means a phrase can be a sequence of words, periods, and CFWS in any
1759 order as long as it starts with at least one word. If anything other than
1760 words is detected, an ObsoleteHeaderDefect is added to the token's defect
1761 list. We also accept a phrase that starts with CFWS followed by a dot;
1762 this is registered as an InvalidHeaderDefect, since it is not supported by
1763 even the obsolete grammar.
1764
1765 """
1766 phrase = Phrase()
1767 try:
1768 token, value = get_word(value)
1769 phrase.append(token)
1770 except errors.HeaderParseError:
1771 phrase.defects.append(errors.InvalidHeaderDefect(
1772 "phrase does not start with word"))
1773 while value and value[0] not in PHRASE_ENDS:
1774 if value[0]=='.':
1775 phrase.append(DOT)
1776 phrase.defects.append(errors.ObsoleteHeaderDefect(
1777 "period in 'phrase'"))
1778 value = value[1:]
1779 else:
1780 try:
1781 token, value = get_word(value)
1782 except errors.HeaderParseError:
1783 if value[0] in CFWS_LEADER:
1784 token, value = get_cfws(value)
1785 phrase.defects.append(errors.ObsoleteHeaderDefect(
1786 "comment found without atom"))
1787 else:
1788 raise
1789 phrase.append(token)
1790 return phrase, value
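
# Illustrative sketch only (not used by the parser): a phrase containing a
# period exercises the obs-phrase handling described above.  Expected results
# are a hedged reading of the code.
def _sketch_get_phrase():
    phrase, rest = get_phrase('Foo B. Bar <foo@example.com>')
    # The '.' should be accepted but register an ObsoleteHeaderDefect on
    # phrase.defects, and the scan should stop at '<', which is in
    # PHRASE_ENDS, leaving rest as '<foo@example.com>'.
    return phrase, rest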
1791
1792def get_local_part(value):
1793 """ local-part = dot-atom / quoted-string / obs-local-part
1794
1795 """
1796 local_part = LocalPart()
1797 leader = None
1798 if value[0] in CFWS_LEADER:
1799 leader, value = get_cfws(value)
1800 if not value:
1801 raise errors.HeaderParseError(
1802 "expected local-part but found '{}'".format(value))
1803 try:
1804 token, value = get_dot_atom(value)
1805 except errors.HeaderParseError:
1806 try:
1807 token, value = get_word(value)
1808 except errors.HeaderParseError:
1809 if value[0] != '\\' and value[0] in PHRASE_ENDS:
1810 raise
1811 token = TokenList()
1812 if leader is not None:
1813 token[:0] = [leader]
1814 local_part.append(token)
1815 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1816 obs_local_part, value = get_obs_local_part(str(local_part) + value)
1817 if obs_local_part.token_type == 'invalid-obs-local-part':
1818 local_part.defects.append(errors.InvalidHeaderDefect(
1819 "local-part is not dot-atom, quoted-string, or obs-local-part"))
1820 else:
1821 local_part.defects.append(errors.ObsoleteHeaderDefect(
1822 "local-part is not a dot-atom (contains CFWS)"))
1823 local_part[0] = obs_local_part
1824 try:
1825 local_part.value.encode('ascii')
1826 except UnicodeEncodeError:
1827 local_part.defects.append(errors.NonASCIILocalPartDefect(
1828 "local-part contains non-ASCII characters)"))
1829 return local_part, value
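
# Illustrative sketch only (not used by the parser): the common case where the
# local part is a plain dot-atom.  Expected results are a hedged reading of
# the code above.
def _sketch_get_local_part():
    local_part, rest = get_local_part('john.q.public@example.com')
    # local_part should wrap a dot-atom with no defects; rest should start
    # with '@', which is where get_addr_spec takes over.
    return local_part, rest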
1830
1831def get_obs_local_part(value):
1832 """ obs-local-part = word *("." word)
1833 """
1834 obs_local_part = ObsLocalPart()
1835 last_non_ws_was_dot = False
1836 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1837 if value[0] == '.':
1838 if last_non_ws_was_dot:
1839 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1840 "invalid repeated '.'"))
1841 obs_local_part.append(DOT)
1842 last_non_ws_was_dot = True
1843 value = value[1:]
1844 continue
1845 elif value[0]=='\\':
1846 obs_local_part.append(ValueTerminal(value[0],
1847 'misplaced-special'))
1848 value = value[1:]
1849 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1850 "'\\' character outside of quoted-string/ccontent"))
1851 last_non_ws_was_dot = False
1852 continue
1853 if obs_local_part and obs_local_part[-1].token_type != 'dot':
1854 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1855 "missing '.' between words"))
1856 try:
1857 token, value = get_word(value)
1858 last_non_ws_was_dot = False
1859 except errors.HeaderParseError:
1860 if value[0] not in CFWS_LEADER:
1861 raise
1862 token, value = get_cfws(value)
1863 obs_local_part.append(token)
1864 if (obs_local_part[0].token_type == 'dot' or
1865 obs_local_part[0].token_type=='cfws' and
1866 obs_local_part[1].token_type=='dot'):
1867 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1868 "Invalid leading '.' in local part"))
1869 if (obs_local_part[-1].token_type == 'dot' or
1870 obs_local_part[-1].token_type=='cfws' and
1871 obs_local_part[-2].token_type=='dot'):
1872 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1873 "Invalid trailing '.' in local part"))
1874 if obs_local_part.defects:
1875 obs_local_part.token_type = 'invalid-obs-local-part'
1876 return obs_local_part, value
1877
1878def get_dtext(value):
1879    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
1880        obs-dtext = obs-NO-WS-CTL / quoted-pair
1881
1882    We allow anything except the excluded characters, but if we find any
1883    ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
1884    added to the token's defects list.  Quoted pairs are converted to their
1885 unquoted values, so what is returned is a ptext token, in this case a
1886 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is
1887 added to the returned token's defect list.
1888
1889 """
1890 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
1891 ptext = ValueTerminal(ptext, 'ptext')
1892 if had_qp:
1893 ptext.defects.append(errors.ObsoleteHeaderDefect(
1894 "quoted printable found in domain-literal"))
1895 _validate_xtext(ptext)
1896 return ptext, value
1897
1898def _check_for_early_dl_end(value, domain_literal):
1899 if value:
1900 return False
1901 domain_literal.append(errors.InvalidHeaderDefect(
1902 "end of input inside domain-literal"))
1903 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1904 return True
1905
1906def get_domain_literal(value):
1907 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
1908
1909 """
1910 domain_literal = DomainLiteral()
1911 if value[0] in CFWS_LEADER:
1912 token, value = get_cfws(value)
1913 domain_literal.append(token)
1914 if not value:
1915 raise errors.HeaderParseError("expected domain-literal")
1916 if value[0] != '[':
1917 raise errors.HeaderParseError("expected '[' at start of domain-literal "
1918 "but found '{}'".format(value))
1919 value = value[1:]
1920 if _check_for_early_dl_end(value, domain_literal):
1921 return domain_literal, value
1922 domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
1923 if value[0] in WSP:
1924 token, value = get_fws(value)
1925 domain_literal.append(token)
1926 token, value = get_dtext(value)
1927 domain_literal.append(token)
1928 if _check_for_early_dl_end(value, domain_literal):
1929 return domain_literal, value
1930 if value[0] in WSP:
1931 token, value = get_fws(value)
1932 domain_literal.append(token)
1933 if _check_for_early_dl_end(value, domain_literal):
1934 return domain_literal, value
1935 if value[0] != ']':
1936 raise errors.HeaderParseError("expected ']' at end of domain-literal "
1937 "but found '{}'".format(value))
1938 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1939 value = value[1:]
1940 if value and value[0] in CFWS_LEADER:
1941 token, value = get_cfws(value)
1942 domain_literal.append(token)
1943 return domain_literal, value
1944
1945def get_domain(value):
1946 """ domain = dot-atom / domain-literal / obs-domain
1947 obs-domain = atom *("." atom))
1948
1949 """
1950 domain = Domain()
1951 leader = None
1952 if value[0] in CFWS_LEADER:
1953 leader, value = get_cfws(value)
1954 if not value:
1955 raise errors.HeaderParseError(
1956 "expected domain but found '{}'".format(value))
1957 if value[0] == '[':
1958 token, value = get_domain_literal(value)
1959 if leader is not None:
1960 token[:0] = [leader]
1961 domain.append(token)
1962 return domain, value
1963 try:
1964 token, value = get_dot_atom(value)
1965 except errors.HeaderParseError:
1966 token, value = get_atom(value)
1967 if leader is not None:
1968 token[:0] = [leader]
1969 domain.append(token)
1970 if value and value[0] == '.':
1971 domain.defects.append(errors.ObsoleteHeaderDefect(
1972 "domain is not a dot-atom (contains CFWS)"))
1973 if domain[0].token_type == 'dot-atom':
1974 domain[:] = domain[0]
1975 while value and value[0] == '.':
1976 domain.append(DOT)
1977 token, value = get_atom(value[1:])
1978 domain.append(token)
1979 return domain, value
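
# Illustrative sketch only (not used by the parser): the two main branches of
# get_domain, dot-atom and domain-literal.  Expected results are a hedged
# reading of the code above.
def _sketch_get_domain():
    domain, rest = get_domain('example.com')
    # domain should wrap a dot-atom covering 'example.com', with rest ''.
    literal, rest2 = get_domain('[127.0.0.1]')
    # literal should wrap a DomainLiteral token covering '[127.0.0.1]'.
    return domain, literal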
1980
1981def get_addr_spec(value):
1982 """ addr-spec = local-part "@" domain
1983
1984 """
1985 addr_spec = AddrSpec()
1986 token, value = get_local_part(value)
1987 addr_spec.append(token)
1988 if not value or value[0] != '@':
1989 addr_spec.defects.append(errors.InvalidHeaderDefect(
1990 "add-spec local part with no domain"))
1991 return addr_spec, value
1992 addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
1993 token, value = get_domain(value[1:])
1994 addr_spec.append(token)
1995 return addr_spec, value
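
# Illustrative sketch only (not used by the parser): a complete addr-spec.
# Expected results are a hedged reading of the code above.
def _sketch_get_addr_spec():
    addr_spec, rest = get_addr_spec('dinsdale@python.org')
    # addr_spec should contain the local part, the '@' terminal, and the
    # domain, with rest ''.
    return addr_spec, rest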
1996
1997def get_obs_route(value):
1998 """ obs-route = obs-domain-list ":"
1999 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
2000
2001 Returns an obs-route token with the appropriate sub-tokens (that is,
2002 there is no obs-domain-list in the parse tree).
2003 """
2004 obs_route = ObsRoute()
2005 while value and (value[0]==',' or value[0] in CFWS_LEADER):
2006 if value[0] in CFWS_LEADER:
2007 token, value = get_cfws(value)
2008 obs_route.append(token)
2009 elif value[0] == ',':
2010 obs_route.append(ListSeparator)
2011 value = value[1:]
2012 if not value or value[0] != '@':
2013 raise errors.HeaderParseError(
2014 "expected obs-route domain but found '{}'".format(value))
2015 obs_route.append(RouteComponentMarker)
2016 token, value = get_domain(value[1:])
2017 obs_route.append(token)
2018 while value and value[0]==',':
2019 obs_route.append(ListSeparator)
2020 value = value[1:]
2021 if not value:
2022 break
2023 if value[0] in CFWS_LEADER:
2024 token, value = get_cfws(value)
2025 obs_route.append(token)
2026 if value[0] == '@':
2027 obs_route.append(RouteComponentMarker)
2028 token, value = get_domain(value[1:])
2029 obs_route.append(token)
2030 if not value:
2031 raise errors.HeaderParseError("end of header while parsing obs-route")
2032 if value[0] != ':':
2033 raise errors.HeaderParseError( "expected ':' marking end of "
2034 "obs-route but found '{}'".format(value))
2035 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
2036 return obs_route, value[1:]
2037
2038def get_angle_addr(value):
2039 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
2040 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
2041
2042 """
2043 angle_addr = AngleAddr()
2044 if value[0] in CFWS_LEADER:
2045 token, value = get_cfws(value)
2046 angle_addr.append(token)
2047 if not value or value[0] != '<':
2048 raise errors.HeaderParseError(
2049 "expected angle-addr but found '{}'".format(value))
2050 angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
2051 value = value[1:]
2052    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
2053 # circumstances.
2054 if value[0] == '>':
2055 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2056 angle_addr.defects.append(errors.InvalidHeaderDefect(
2057 "null addr-spec in angle-addr"))
2058 value = value[1:]
2059 return angle_addr, value
2060    try:
2061 token, value = get_addr_spec(value)
2062 except errors.HeaderParseError:
2063 try:
2064 token, value = get_obs_route(value)
2065 angle_addr.defects.append(errors.ObsoleteHeaderDefect(
2066 "obsolete route specification in angle-addr"))
2067 except errors.HeaderParseError:
2068 raise errors.HeaderParseError(
2069                "expected addr-spec or obs-route but found '{}'".format(value))
2070        angle_addr.append(token)
2071 token, value = get_addr_spec(value)
2072 angle_addr.append(token)
2073 if value and value[0] == '>':
2074 value = value[1:]
2075 else:
2076 angle_addr.defects.append(errors.InvalidHeaderDefect(
2077 "missing trailing '>' on angle-addr"))
2078 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2079 if value and value[0] in CFWS_LEADER:
2080 token, value = get_cfws(value)
2081 angle_addr.append(token)
2082 return angle_addr, value
2083
2084def get_display_name(value):
2085 """ display-name = phrase
2086
2087 Because this is simply a name-rule, we don't return a display-name
2088 token containing a phrase, but rather a display-name token with
2089 the content of the phrase.
2090
2091 """
2092 display_name = DisplayName()
2093 token, value = get_phrase(value)
2094 display_name.extend(token[:])
2095 display_name.defects = token.defects[:]
2096 return display_name, value
2097
2098
2099def get_name_addr(value):
2100 """ name-addr = [display-name] angle-addr
2101
2102 """
2103 name_addr = NameAddr()
2104 # Both the optional display name and the angle-addr can start with cfws.
2105 leader = None
2106 if value[0] in CFWS_LEADER:
2107 leader, value = get_cfws(value)
2108 if not value:
2109 raise errors.HeaderParseError(
2110 "expected name-addr but found '{}'".format(leader))
2111 if value[0] != '<':
2112 if value[0] in PHRASE_ENDS:
2113 raise errors.HeaderParseError(
2114 "expected name-addr but found '{}'".format(value))
2115 token, value = get_display_name(value)
2116 if not value:
2117 raise errors.HeaderParseError(
2118 "expected name-addr but found '{}'".format(token))
2119 if leader is not None:
2120 token[0][:0] = [leader]
2121 leader = None
2122 name_addr.append(token)
2123 token, value = get_angle_addr(value)
2124 if leader is not None:
2125 token[:0] = [leader]
2126 name_addr.append(token)
2127 return name_addr, value
2128
2129def get_mailbox(value):
2130 """ mailbox = name-addr / addr-spec
2131
2132 """
2133 # The only way to figure out if we are dealing with a name-addr or an
2134 # addr-spec is to try parsing each one.
2135 mailbox = Mailbox()
2136 try:
2137 token, value = get_name_addr(value)
2138 except errors.HeaderParseError:
2139 try:
2140 token, value = get_addr_spec(value)
2141 except errors.HeaderParseError:
2142 raise errors.HeaderParseError(
2143 "expected mailbox but found '{}'".format(value))
2144 if any(isinstance(x, errors.InvalidHeaderDefect)
2145 for x in token.all_defects):
2146 mailbox.token_type = 'invalid-mailbox'
2147 mailbox.append(token)
2148 return mailbox, value
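
# Illustrative sketch only (not used by the parser): get_mailbox tries the
# name-addr form first and falls back to a bare addr-spec.  Expected results
# are a hedged reading of the code above.
def _sketch_get_mailbox():
    named, rest = get_mailbox('Fred Bloggs <fred@example.com>')
    # named should wrap a NameAddr: a display-name phrase plus an angle-addr.
    bare, rest2 = get_mailbox('fred@example.com')
    # bare should wrap a plain AddrSpec, since there is no '<' in the input.
    return named, bare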
2149
2150def get_invalid_mailbox(value, endchars):
2151 """ Read everything up to one of the chars in endchars.
2152
2153 This is outside the formal grammar. The InvalidMailbox TokenList that is
2154 returned acts like a Mailbox, but the data attributes are None.
2155
2156 """
2157 invalid_mailbox = InvalidMailbox()
2158 while value and value[0] not in endchars:
2159 if value[0] in PHRASE_ENDS:
2160 invalid_mailbox.append(ValueTerminal(value[0],
2161 'misplaced-special'))
2162 value = value[1:]
2163 else:
2164 token, value = get_phrase(value)
2165 invalid_mailbox.append(token)
2166 return invalid_mailbox, value
2167
2168def get_mailbox_list(value):
2169 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
2170 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
2171
2172 For this routine we go outside the formal grammar in order to improve error
2173 handling. We recognize the end of the mailbox list only at the end of the
2174 value or at a ';' (the group terminator). This is so that we can turn
2175 invalid mailboxes into InvalidMailbox tokens and continue parsing any
2176 remaining valid mailboxes. We also allow all mailbox entries to be null,
2177 and this condition is handled appropriately at a higher level.
2178
2179 """
2180 mailbox_list = MailboxList()
2181 while value and value[0] != ';':
2182 try:
2183 token, value = get_mailbox(value)
2184 mailbox_list.append(token)
2185 except errors.HeaderParseError:
2186 leader = None
2187 if value[0] in CFWS_LEADER:
2188 leader, value = get_cfws(value)
2189 if not value or value[0] in ',;':
2190 mailbox_list.append(leader)
2191 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2192 "empty element in mailbox-list"))
2193 else:
2194 token, value = get_invalid_mailbox(value, ',;')
2195 if leader is not None:
2196 token[:0] = [leader]
2197 mailbox_list.append(token)
2198 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2199 "invalid mailbox in mailbox-list"))
2200 elif value[0] == ',':
2201 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2202 "empty element in mailbox-list"))
2203 else:
2204 token, value = get_invalid_mailbox(value, ',;')
2205 if leader is not None:
2206 token[:0] = [leader]
2207 mailbox_list.append(token)
2208 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2209 "invalid mailbox in mailbox-list"))
2210 if value and value[0] not in ',;':
2211 # Crap after mailbox; treat it as an invalid mailbox.
2212 # The mailbox info will still be available.
2213 mailbox = mailbox_list[-1]
2214 mailbox.token_type = 'invalid-mailbox'
2215 token, value = get_invalid_mailbox(value, ',;')
2216 mailbox.extend(token)
2217 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2218 "invalid mailbox in mailbox-list"))
2219 if value and value[0] == ',':
2220 mailbox_list.append(ListSeparator)
2221 value = value[1:]
2222 return mailbox_list, value
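
# Illustrative sketch only (not used by the parser): error recovery in
# get_mailbox_list.  Expected results are a hedged reading of the code above.
def _sketch_get_mailbox_list():
    mbox_list, rest = get_mailbox_list('ann@example.com, , bob@example.com;')
    # The empty element between the commas should register an
    # ObsoleteHeaderDefect, both addresses should still be parsed, and the
    # scan should stop at the group terminator, leaving ';' in rest.
    return mbox_list, rest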
2223
2224
2225def get_group_list(value):
2226 """ group-list = mailbox-list / CFWS / obs-group-list
2227 obs-group-list = 1*([CFWS] ",") [CFWS]
2228
2229 """
2230 group_list = GroupList()
2231 if not value:
2232 group_list.defects.append(errors.InvalidHeaderDefect(
2233 "end of header before group-list"))
2234 return group_list, value
2235 leader = None
2236 if value and value[0] in CFWS_LEADER:
2237 leader, value = get_cfws(value)
2238 if not value:
2239 # This should never happen in email parsing, since CFWS-only is a
2240 # legal alternative to group-list in a group, which is the only
2241 # place group-list appears.
2242 group_list.defects.append(errors.InvalidHeaderDefect(
2243 "end of header in group-list"))
2244 group_list.append(leader)
2245 return group_list, value
2246 if value[0] == ';':
2247 group_list.append(leader)
2248 return group_list, value
2249 token, value = get_mailbox_list(value)
2250 if len(token.all_mailboxes)==0:
2251 if leader is not None:
2252 group_list.append(leader)
2253 group_list.extend(token)
2254 group_list.defects.append(errors.ObsoleteHeaderDefect(
2255 "group-list with empty entries"))
2256 return group_list, value
2257 if leader is not None:
2258 token[:0] = [leader]
2259 group_list.append(token)
2260 return group_list, value
2261
2262def get_group(value):
2263 """ group = display-name ":" [group-list] ";" [CFWS]
2264
2265 """
2266 group = Group()
2267 token, value = get_display_name(value)
2268 if not value or value[0] != ':':
2269 raise errors.HeaderParseError("expected ':' at end of group "
2270 "display name but found '{}'".format(value))
2271 group.append(token)
2272 group.append(ValueTerminal(':', 'group-display-name-terminator'))
2273 value = value[1:]
2274 if value and value[0] == ';':
2275 group.append(ValueTerminal(';', 'group-terminator'))
2276 return group, value[1:]
2277 token, value = get_group_list(value)
2278 group.append(token)
2279 if not value:
2280 group.defects.append(errors.InvalidHeaderDefect(
2281 "end of header in group"))
2282    elif value[0] != ';':
2283 raise errors.HeaderParseError(
2284 "expected ';' at end of group but found {}".format(value))
2285 group.append(ValueTerminal(';', 'group-terminator'))
2286 value = value[1:]
2287 if value and value[0] in CFWS_LEADER:
2288 token, value = get_cfws(value)
2289 group.append(token)
2290 return group, value
2291
2292def get_address(value):
2293 """ address = mailbox / group
2294
2295 Note that counter-intuitively, an address can be either a single address or
2296 a list of addresses (a group). This is why the returned Address object has
2297 a 'mailboxes' attribute which treats a single address as a list of length
2298    one.  When you need to differentiate between the two cases, extract the single
2299 element, which is either a mailbox or a group token.
2300
2301 """
2302 # The formal grammar isn't very helpful when parsing an address. mailbox
2303 # and group, especially when allowing for obsolete forms, start off very
2304 # similarly. It is only when you reach one of @, <, or : that you know
2305 # what you've got. So, we try each one in turn, starting with the more
2306 # likely of the two. We could perhaps make this more efficient by looking
2307 # for a phrase and then branching based on the next character, but that
2308 # would be a premature optimization.
2309 address = Address()
2310 try:
2311 token, value = get_group(value)
2312 except errors.HeaderParseError:
2313 try:
2314 token, value = get_mailbox(value)
2315 except errors.HeaderParseError:
2316 raise errors.HeaderParseError(
2317 "expected address but found '{}'".format(value))
2318 address.append(token)
2319 return address, value
2320
2321def get_address_list(value):
2322 """ address_list = (address *("," address)) / obs-addr-list
2323 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
2324
2325 We depart from the formal grammar here by continuing to parse until the end
2326 of the input, assuming the input to be entirely composed of an
2327 address-list. This is always true in email parsing, and allows us
2328 to skip invalid addresses to parse additional valid ones.
2329
2330 """
2331 address_list = AddressList()
2332 while value:
2333 try:
2334 token, value = get_address(value)
2335 address_list.append(token)
2336 except errors.HeaderParseError as err:
2337 leader = None
2338 if value[0] in CFWS_LEADER:
2339 leader, value = get_cfws(value)
2340 if not value or value[0] == ',':
2341 address_list.append(leader)
2342 address_list.defects.append(errors.ObsoleteHeaderDefect(
2343 "address-list entry with no content"))
2344 else:
2345 token, value = get_invalid_mailbox(value, ',')
2346 if leader is not None:
2347 token[:0] = [leader]
2348 address_list.append(Address([token]))
2349 address_list.defects.append(errors.InvalidHeaderDefect(
2350 "invalid address in address-list"))
2351 elif value[0] == ',':
2352 address_list.defects.append(errors.ObsoleteHeaderDefect(
2353 "empty element in address-list"))
2354 else:
2355 token, value = get_invalid_mailbox(value, ',')
2356 if leader is not None:
2357 token[:0] = [leader]
2358 address_list.append(Address([token]))
2359 address_list.defects.append(errors.InvalidHeaderDefect(
2360 "invalid address in address-list"))
2361 if value and value[0] != ',':
2362 # Crap after address; treat it as an invalid mailbox.
2363 # The mailbox info will still be available.
2364 mailbox = address_list[-1][0]
2365 mailbox.token_type = 'invalid-mailbox'
2366 token, value = get_invalid_mailbox(value, ',')
2367 mailbox.extend(token)
2368 address_list.defects.append(errors.InvalidHeaderDefect(
2369 "invalid address in address-list"))
2370 if value: # Must be a , at this point.
2371 address_list.append(ValueTerminal(',', 'list-separator'))
2372 value = value[1:]
2373 return address_list, value
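
# Illustrative sketch only (not used by the parser): a header value mixing a
# mailbox and a group, as handled by get_address_list.  Expected results are
# a hedged reading of the code above.
def _sketch_get_address_list():
    address_list, rest = get_address_list(
        'Alice <alice@example.com>, undisclosed-recipients:;')
    # The first address should wrap a mailbox and the second a group with no
    # mailboxes; rest should be '' since the whole value is consumed.
    return address_list, rest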
2374
2375#
2376# XXX: As I begin to add additional header parsers, I'm realizing we probably
2377# have two level of parser routines: the get_XXX methods that get a token in
2378# the grammar, and parse_XXX methods that parse an entire field value. So
2379# get_address_list above should really be a parse_ method, as probably should
2380# be get_unstructured.
2381#
2382
2383def parse_mime_version(value):
2384 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
2385
2386 """
2387 # The [CFWS] is implicit in the RFC 2045 BNF.
2388 # XXX: This routine is a bit verbose, should factor out a get_int method.
2389 mime_version = MIMEVersion()
2390 if not value:
2391 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2392 "Missing MIME version number (eg: 1.0)"))
2393 return mime_version
2394 if value[0] in CFWS_LEADER:
2395 token, value = get_cfws(value)
2396 mime_version.append(token)
2397 if not value:
2398 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2399 "Expected MIME version number but found only CFWS"))
2400 digits = ''
2401 while value and value[0] != '.' and value[0] not in CFWS_LEADER:
2402 digits += value[0]
2403 value = value[1:]
2404 if not digits.isdigit():
2405 mime_version.defects.append(errors.InvalidHeaderDefect(
2406 "Expected MIME major version number but found {!r}".format(digits)))
2407 mime_version.append(ValueTerminal(digits, 'xtext'))
2408 else:
2409 mime_version.major = int(digits)
2410 mime_version.append(ValueTerminal(digits, 'digits'))
2411 if value and value[0] in CFWS_LEADER:
2412 token, value = get_cfws(value)
2413 mime_version.append(token)
2414 if not value or value[0] != '.':
2415 if mime_version.major is not None:
2416 mime_version.defects.append(errors.InvalidHeaderDefect(
2417 "Incomplete MIME version; found only major number"))
2418 if value:
2419 mime_version.append(ValueTerminal(value, 'xtext'))
2420 return mime_version
2421 mime_version.append(ValueTerminal('.', 'version-separator'))
2422 value = value[1:]
2423 if value and value[0] in CFWS_LEADER:
2424 token, value = get_cfws(value)
2425 mime_version.append(token)
2426 if not value:
2427 if mime_version.major is not None:
2428 mime_version.defects.append(errors.InvalidHeaderDefect(
2429 "Incomplete MIME version; found only major number"))
2430 return mime_version
2431 digits = ''
2432 while value and value[0] not in CFWS_LEADER:
2433 digits += value[0]
2434 value = value[1:]
2435 if not digits.isdigit():
2436 mime_version.defects.append(errors.InvalidHeaderDefect(
2437 "Expected MIME minor version number but found {!r}".format(digits)))
2438 mime_version.append(ValueTerminal(digits, 'xtext'))
2439 else:
2440 mime_version.minor = int(digits)
2441 mime_version.append(ValueTerminal(digits, 'digits'))
2442 if value and value[0] in CFWS_LEADER:
2443 token, value = get_cfws(value)
2444 mime_version.append(token)
2445 if value:
2446 mime_version.defects.append(errors.InvalidHeaderDefect(
2447 "Excess non-CFWS text after MIME version"))
2448 mime_version.append(ValueTerminal(value, 'xtext'))
2449 return mime_version
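
# Illustrative sketch only (not used by the parser): a typical MIME-Version
# value.  Expected results are a hedged reading of the code above.
def _sketch_parse_mime_version():
    mime_version = parse_mime_version('1.0 (generated by example)')
    # mime_version.major should be 1 and mime_version.minor 0; the trailing
    # comment should be absorbed as CFWS rather than reported as a defect.
    return mime_version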
2450
2451def get_invalid_parameter(value):
2452 """ Read everything up to the next ';'.
2453
2454 This is outside the formal grammar. The InvalidParameter TokenList that is
2455 returned acts like a Parameter, but the data attributes are None.
2456
2457 """
2458 invalid_parameter = InvalidParameter()
2459 while value and value[0] != ';':
2460 if value[0] in PHRASE_ENDS:
2461 invalid_parameter.append(ValueTerminal(value[0],
2462 'misplaced-special'))
2463 value = value[1:]
2464 else:
2465 token, value = get_phrase(value)
2466 invalid_parameter.append(token)
2467 return invalid_parameter, value
2468
2469def get_ttext(value):
2470 """ttext = <matches _ttext_matcher>
2471
2472 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
2473 defects list if we find non-ttext characters. We also register defects for
2474 *any* non-printables even though the RFC doesn't exclude all of them,
2475 because we follow the spirit of RFC 5322.
2476
2477 """
2478 m = _non_token_end_matcher(value)
2479 if not m:
2480 raise errors.HeaderParseError(
2481 "expected ttext but found '{}'".format(value))
2482 ttext = m.group()
2483 value = value[len(ttext):]
2484 ttext = ValueTerminal(ttext, 'ttext')
2485 _validate_xtext(ttext)
2486 return ttext, value
2487
2488def get_token(value):
2489 """token = [CFWS] 1*ttext [CFWS]
2490
2491 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
2492 tspecials. We also exclude tabs even though the RFC doesn't.
2493
2494 The RFC implies the CFWS but is not explicit about it in the BNF.
2495
2496 """
2497 mtoken = Token()
2498 if value and value[0] in CFWS_LEADER:
2499 token, value = get_cfws(value)
2500 mtoken.append(token)
2501 if value and value[0] in TOKEN_ENDS:
2502 raise errors.HeaderParseError(
2503 "expected token but found '{}'".format(value))
2504 token, value = get_ttext(value)
2505 mtoken.append(token)
2506 if value and value[0] in CFWS_LEADER:
2507 token, value = get_cfws(value)
2508 mtoken.append(token)
2509 return mtoken, value
2510
2511def get_attrtext(value):
2512 """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
2513
2514 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
2515 token's defects list if we find non-attrtext characters. We also register
2516 defects for *any* non-printables even though the RFC doesn't exclude all of
2517 them, because we follow the spirit of RFC 5322.
2518
2519 """
2520 m = _non_attribute_end_matcher(value)
2521 if not m:
2522 raise errors.HeaderParseError(
2523 "expected attrtext but found {!r}".format(value))
2524 attrtext = m.group()
2525 value = value[len(attrtext):]
2526 attrtext = ValueTerminal(attrtext, 'attrtext')
2527 _validate_xtext(attrtext)
2528 return attrtext, value
2529
2530def get_attribute(value):
2531 """ [CFWS] 1*attrtext [CFWS]
2532
2533 This version of the BNF makes the CFWS explicit, and as usual we use a
2534 value terminal for the actual run of characters. The RFC equivalent of
2535 attrtext is the token characters, with the subtraction of '*', "'", and '%'.
2536 We include tab in the excluded set just as we do for token.
2537
2538 """
2539 attribute = Attribute()
2540 if value and value[0] in CFWS_LEADER:
2541 token, value = get_cfws(value)
2542 attribute.append(token)
2543 if value and value[0] in ATTRIBUTE_ENDS:
2544 raise errors.HeaderParseError(
2545 "expected token but found '{}'".format(value))
2546 token, value = get_attrtext(value)
2547 attribute.append(token)
2548 if value and value[0] in CFWS_LEADER:
2549 token, value = get_cfws(value)
2550 attribute.append(token)
2551 return attribute, value
2552
2553def get_extended_attrtext(value):
2554 """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
2555
2556 This is a special parsing routine so that we get a value that
2557 includes % escapes as a single string (which we decode as a single
2558 string later).
2559
2560 """
2561 m = _non_extended_attribute_end_matcher(value)
2562 if not m:
2563 raise errors.HeaderParseError(
2564 "expected extended attrtext but found {!r}".format(value))
2565 attrtext = m.group()
2566 value = value[len(attrtext):]
2567 attrtext = ValueTerminal(attrtext, 'extended-attrtext')
2568 _validate_xtext(attrtext)
2569 return attrtext, value
2570
2571def get_extended_attribute(value):
2572 """ [CFWS] 1*extended_attrtext [CFWS]
2573
2574 This is like the non-extended version except we allow % characters, so that
2575 we can pick up an encoded value as a single string.
2576
2577 """
2578 # XXX: should we have an ExtendedAttribute TokenList?
2579 attribute = Attribute()
2580 if value and value[0] in CFWS_LEADER:
2581 token, value = get_cfws(value)
2582 attribute.append(token)
2583 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
2584 raise errors.HeaderParseError(
2585 "expected token but found '{}'".format(value))
2586 token, value = get_extended_attrtext(value)
2587 attribute.append(token)
2588 if value and value[0] in CFWS_LEADER:
2589 token, value = get_cfws(value)
2590 attribute.append(token)
2591 return attribute, value
2592
2593def get_section(value):
2594 """ '*' digits
2595
2596 The formal BNF is more complicated because leading 0s are not allowed. We
2597 check for that and add a defect. We also assume no CFWS is allowed between
2598 the '*' and the digits, though the RFC is not crystal clear on that.
2599 The caller should already have dealt with leading CFWS.
2600
2601 """
2602 section = Section()
2603 if not value or value[0] != '*':
2604 raise errors.HeaderParseError("Expected section but found {}".format(
2605 value))
2606 section.append(ValueTerminal('*', 'section-marker'))
2607 value = value[1:]
2608 if not value or not value[0].isdigit():
2609 raise errors.HeaderParseError("Expected section number but "
2610 "found {}".format(value))
2611 digits = ''
2612 while value and value[0].isdigit():
2613 digits += value[0]
2614 value = value[1:]
2615 if digits[0] == '0' and digits != '0':
2616        section.defects.append(errors.InvalidHeaderDefect("section number "
2617            "has an invalid leading 0"))
2618 section.number = int(digits)
2619 section.append(ValueTerminal(digits, 'digits'))
2620 return section, value
2621
2622
2623def get_value(value):
2624 """ quoted-string / attribute
2625
2626 """
2627 v = Value()
2628 if not value:
2629 raise errors.HeaderParseError("Expected value but found end of string")
2630 leader = None
2631 if value[0] in CFWS_LEADER:
2632 leader, value = get_cfws(value)
2633 if not value:
2634 raise errors.HeaderParseError("Expected value but found "
2635 "only {}".format(leader))
2636 if value[0] == '"':
2637 token, value = get_quoted_string(value)
2638 else:
2639 token, value = get_extended_attribute(value)
2640 if leader is not None:
2641 token[:0] = [leader]
2642 v.append(token)
2643 return v, value
2644
2645def get_parameter(value):
2646 """ attribute [section] ["*"] [CFWS] "=" value
2647
2648 The CFWS is implied by the RFC but not made explicit in the BNF. This
2649 simplified form of the BNF from the RFC is made to conform with the RFC BNF
2650 through some extra checks. We do it this way because it makes both error
2651 recovery and working with the resulting parse tree easier.
2652 """
2653 # It is possible CFWS would also be implicitly allowed between the section
2654 # and the 'extended-attribute' marker (the '*') , but we've never seen that
2655 # in the wild and we will therefore ignore the possibility.
2656 param = Parameter()
2657 token, value = get_attribute(value)
2658 param.append(token)
2659 if not value or value[0] == ';':
2660 param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2661 "name ({}) but no value".format(token)))
2662 return param, value
2663 if value[0] == '*':
2664 try:
2665 token, value = get_section(value)
2666 param.sectioned = True
2667 param.append(token)
2668 except errors.HeaderParseError:
2669 pass
2670 if not value:
2671 raise errors.HeaderParseError("Incomplete parameter")
2672 if value[0] == '*':
2673 param.append(ValueTerminal('*', 'extended-parameter-marker'))
2674 value = value[1:]
2675 param.extended = True
2676 if value[0] != '=':
2677 raise errors.HeaderParseError("Parameter not followed by '='")
2678 param.append(ValueTerminal('=', 'parameter-separator'))
2679 value = value[1:]
2680 leader = None
2681 if value and value[0] in CFWS_LEADER:
2682 token, value = get_cfws(value)
2683 param.append(token)
2684 remainder = None
2685 appendto = param
2686 if param.extended and value and value[0] == '"':
2687 # Now for some serious hackery to handle the common invalid case of
2688 # double quotes around an extended value. We also accept (with defect)
2689 # a value marked as encoded that isn't really.
2690 qstring, remainder = get_quoted_string(value)
2691 inner_value = qstring.stripped_value
2692 semi_valid = False
2693 if param.section_number == 0:
2694 if inner_value and inner_value[0] == "'":
2695 semi_valid = True
2696 else:
2697 token, rest = get_attrtext(inner_value)
2698 if rest and rest[0] == "'":
2699 semi_valid = True
2700 else:
2701 try:
2702 token, rest = get_extended_attrtext(inner_value)
2703                except errors.HeaderParseError:
2704 pass
2705 else:
2706 if not rest:
2707 semi_valid = True
2708 if semi_valid:
2709 param.defects.append(errors.InvalidHeaderDefect(
2710 "Quoted string value for extended parameter is invalid"))
2711 param.append(qstring)
2712 for t in qstring:
2713 if t.token_type == 'bare-quoted-string':
2714 t[:] = []
2715 appendto = t
2716 break
2717 value = inner_value
2718 else:
2719 remainder = None
2720 param.defects.append(errors.InvalidHeaderDefect(
2721 "Parameter marked as extended but appears to have a "
2722 "quoted string value that is non-encoded"))
2723 if value and value[0] == "'":
2724 token = None
2725 else:
2726 token, value = get_value(value)
2727 if not param.extended or param.section_number > 0:
2728 if not value or value[0] != "'":
2729 appendto.append(token)
2730 if remainder is not None:
2731 assert not value, value
2732 value = remainder
2733 return param, value
2734 param.defects.append(errors.InvalidHeaderDefect(
2735 "Apparent initial-extended-value but attribute "
2736 "was not marked as extended or was not initial section"))
2737 if not value:
2738 # Assume the charset/lang is missing and the token is the value.
2739 param.defects.append(errors.InvalidHeaderDefect(
2740 "Missing required charset/lang delimiters"))
2741 appendto.append(token)
2742 if remainder is None:
2743 return param, value
2744 else:
2745 if token is not None:
2746 for t in token:
2747 if t.token_type == 'extended-attrtext':
2748 break
2749            t.token_type = 'attrtext'
2750 appendto.append(t)
2751 param.charset = t.value
2752 if value[0] != "'":
2753 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2754 "delimiter, but found {!r}".format(value))
2755 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2756 value = value[1:]
2757 if value and value[0] != "'":
2758 token, value = get_attrtext(value)
2759 appendto.append(token)
2760 param.lang = token.value
2761 if not value or value[0] != "'":
2762 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2763 "delimiter, but found {}".format(value))
2764 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2765 value = value[1:]
2766 if remainder is not None:
2767 # Treat the rest of value as bare quoted string content.
2768 v = Value()
2769 while value:
2770 if value[0] in WSP:
2771 token, value = get_fws(value)
2772 else:
2773 token, value = get_qcontent(value)
2774 v.append(token)
2775 token = v
2776 else:
2777 token, value = get_value(value)
2778 appendto.append(token)
2779 if remainder is not None:
2780 assert not value, value
2781 value = remainder
2782 return param, value
2783
2784def parse_mime_parameters(value):
2785 """ parameter *( ";" parameter )
2786
2787 That BNF is meant to indicate this routine should only be called after
2788 finding and handling the leading ';'. There is no corresponding rule in
2789 the formal RFC grammar, but it is more convenient for us for the set of
2790 parameters to be treated as its own TokenList.
2791
2792    This is a 'parse' routine because it consumes the remaining value, but it
2793 would never be called to parse a full header. Instead it is called to
2794 parse everything after the non-parameter value of a specific MIME header.
2795
2796 """
2797 mime_parameters = MimeParameters()
2798 while value:
2799 try:
2800 token, value = get_parameter(value)
2801 mime_parameters.append(token)
2802 except errors.HeaderParseError as err:
2803 leader = None
2804 if value[0] in CFWS_LEADER:
2805 leader, value = get_cfws(value)
2806 if not value:
2807 mime_parameters.append(leader)
2808 return mime_parameters
2809 if value[0] == ';':
2810 if leader is not None:
2811 mime_parameters.append(leader)
2812 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2813 "parameter entry with no content"))
2814 else:
2815 token, value = get_invalid_parameter(value)
2816 if leader:
2817 token[:0] = [leader]
2818 mime_parameters.append(token)
2819 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2820 "invalid parameter {!r}".format(token)))
2821 if value and value[0] != ';':
2822 # Junk after the otherwise valid parameter. Mark it as
2823 # invalid, but it will have a value.
2824 param = mime_parameters[-1]
2825 param.token_type = 'invalid-parameter'
2826 token, value = get_invalid_parameter(value)
2827 param.extend(token)
2828 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2829 "parameter with invalid trailing text {!r}".format(token)))
2830 if value:
2831 # Must be a ';' at this point.
2832 mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2833 value = value[1:]
2834 return mime_parameters
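
# Illustrative sketch only (not used by the parser): parse_mime_parameters is
# called with the text that follows the ';' after the main header value.
# Expected results are a hedged reading of the code above.
def _sketch_parse_mime_parameters():
    params = parse_mime_parameters(' charset="utf-8"; format=flowed')
    # params should be a MimeParameters token containing two Parameter
    # tokens, one for charset and one for format.
    return params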
2835
2836def _find_mime_parameters(tokenlist, value):
2837 """Do our best to find the parameters in an invalid MIME header
2838
2839 """
2840 while value and value[0] != ';':
2841 if value[0] in PHRASE_ENDS:
2842 tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2843 value = value[1:]
2844 else:
2845 token, value = get_phrase(value)
2846 tokenlist.append(token)
2847 if not value:
2848 return
2849 tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2850 tokenlist.append(parse_mime_parameters(value[1:]))
2851
2852def parse_content_type_header(value):
2853 """ maintype "/" subtype *( ";" parameter )
2854
2855    The maintype and subtype are tokens.  Theoretically they could
2856 be checked against the official IANA list + x-token, but we
2857 don't do that.
2858 """
2859 ctype = ContentType()
2860 recover = False
2861 if not value:
2862 ctype.defects.append(errors.HeaderMissingRequiredValue(
2863 "Missing content type specification"))
2864 return ctype
2865 try:
2866 token, value = get_token(value)
2867 except errors.HeaderParseError:
2868 ctype.defects.append(errors.InvalidHeaderDefect(
2869 "Expected content maintype but found {!r}".format(value)))
2870 _find_mime_parameters(ctype, value)
2871 return ctype
2872 ctype.append(token)
2873    # XXX: If we really want to follow the formal grammar we should make
2874    # maintype and subtype specialized TokenLists here.  Probably not worth it.
2875 if not value or value[0] != '/':
2876 ctype.defects.append(errors.InvalidHeaderDefect(
2877 "Invalid content type"))
2878 if value:
2879 _find_mime_parameters(ctype, value)
2880 return ctype
2881 ctype.maintype = token.value.strip().lower()
2882 ctype.append(ValueTerminal('/', 'content-type-separator'))
2883 value = value[1:]
2884 try:
2885 token, value = get_token(value)
2886 except errors.HeaderParseError:
2887 ctype.defects.append(errors.InvalidHeaderDefect(
2888 "Expected content subtype but found {!r}".format(value)))
2889 _find_mime_parameters(ctype, value)
2890 return ctype
2891 ctype.append(token)
2892 ctype.subtype = token.value.strip().lower()
2893 if not value:
2894 return ctype
2895 if value[0] != ';':
2896 ctype.defects.append(errors.InvalidHeaderDefect(
2897 "Only parameters are valid after content type, but "
2898 "found {!r}".format(value)))
2899 # The RFC requires that a syntactically invalid content-type be treated
2900 # as text/plain. Perhaps we should postel this, but we should probably
2901 # only do that if we were checking the subtype value against IANA.
2902 del ctype.maintype, ctype.subtype
2903 _find_mime_parameters(ctype, value)
2904 return ctype
2905 ctype.append(ValueTerminal(';', 'parameter-separator'))
2906 ctype.append(parse_mime_parameters(value[1:]))
2907 return ctype
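
# Illustrative sketch only (not used by the parser): a routine Content-Type
# value.  Expected results are a hedged reading of the code above.
def _sketch_parse_content_type_header():
    ctype = parse_content_type_header('text/plain; charset="us-ascii"')
    # ctype.maintype should be 'text' and ctype.subtype 'plain', with the
    # charset parameter carried in the parameter list that follows the ';'.
    return ctype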
2908
2909def parse_content_disposition_header(value):
2910 """ disposition-type *( ";" parameter )
2911
2912 """
2913 disp_header = ContentDisposition()
2914 if not value:
2915 disp_header.defects.append(errors.HeaderMissingRequiredValue(
2916 "Missing content disposition"))
2917 return disp_header
2918 try:
2919 token, value = get_token(value)
2920 except errors.HeaderParseError:
2921        disp_header.defects.append(errors.InvalidHeaderDefect(
2922            "Expected content disposition but found {!r}".format(value)))
2923 _find_mime_parameters(disp_header, value)
2924 return disp_header
2925 disp_header.append(token)
2926 disp_header.content_disposition = token.value.strip().lower()
2927 if not value:
2928 return disp_header
2929 if value[0] != ';':
2930 disp_header.defects.append(errors.InvalidHeaderDefect(
2931 "Only parameters are valid after content disposition, but "
2932 "found {!r}".format(value)))
2933 _find_mime_parameters(disp_header, value)
2934 return disp_header
2935 disp_header.append(ValueTerminal(';', 'parameter-separator'))
2936 disp_header.append(parse_mime_parameters(value[1:]))
2937 return disp_header
2938
2939def parse_content_transfer_encoding_header(value):
2940 """ mechanism
2941
2942 """
2943 # We should probably validate the values, since the list is fixed.
2944 cte_header = ContentTransferEncoding()
2945 if not value:
2946 cte_header.defects.append(errors.HeaderMissingRequiredValue(
2947 "Missing content transfer encoding"))
2948 return cte_header
2949 try:
2950 token, value = get_token(value)
2951 except errors.HeaderParseError:
2952        cte_header.defects.append(errors.InvalidHeaderDefect(
2953            "Expected content transfer encoding but found {!r}".format(value)))
2954    else:
2955 cte_header.append(token)
2956 cte_header.cte = token.value.strip().lower()
2957 if not value:
2958 return cte_header
2959 while value:
2960 cte_header.defects.append(errors.InvalidHeaderDefect(
2961 "Extra text after content transfer encoding"))
2962 if value[0] in PHRASE_ENDS:
2963 cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2964 value = value[1:]
2965 else:
2966 token, value = get_phrase(value)
2967 cte_header.append(token)
2968 return cte_header