blob: 63f2fb61f0d43664c774650da809ef473505f7cf [file] [log] [blame]
# RFC-822 message manipulation class.
#
# XXX This is only a very rough sketch of a full RFC-822 parser;
# additional methods are needed to parse addresses and dates, and to
# tokenize lines according to various other syntax rules.
#
# Directions for use:
#
# To create a Message object: first open a file, e.g.:
#     fp = open(file, 'r')
# (or use any other legal way of getting an open file object, e.g. use
# sys.stdin or call os.popen()).
# Then pass the open file object to the init() method of Message:
#     m = Message().init(fp)
#
# To get the text of a particular header there are several methods:
#     str = m.getheader(name)
#     str = m.getrawheader(name)
# where name is the name of the header, e.g. 'Subject'.
# The difference is that getheader() strips the leading and trailing
# whitespace, while getrawheader() doesn't.  Both functions retain
# embedded whitespace (including newlines) exactly as they are
# specified in the header, and leave the case of the text unchanged.
#
# See the class definition for lower level access methods.
#
# There are also some utility functions here.
28
29
30import regex
31import string
32
33
class Message:
    """The header section of an RFC-822 message read from an open file.

    Create an instance with ``Message().init(fp)`` where *fp* is an open
    text file object positioned at the first header line.

    Attributes set by init():

    fp              -- the file object that was passed in
    headers         -- list of the raw, uninterpreted header lines
                       (continuation lines included), so printing them
                       reproduces the header exactly as it was read
    status          -- '' if the headers were read without trouble,
                       otherwise an error message
    startofheaders  -- fp.tell() before the headers were read, or None
                       if the file cannot report its position
    startofbody     -- fp.tell() after the headers were read, or None
                       if the file cannot report its position
    """

    def init(self, fp):
        """Remember *fp*, read the headers from it and return self."""
        self.fp = fp
        self.startofheaders = self._tell()
        self.readheaders()
        self.startofbody = self._tell()
        return self

    def _tell(self):
        """Return the current position of self.fp, or None if unavailable."""
        try:
            return self.fp.tell()
        except IOError:
            return None

    def rewindbody(self):
        """Rewind the file to the start of the body.

        Raises IOError if the position of the body is unknown because
        the file could not report its position when it was read.
        """
        if self.startofbody is None:
            # seek(None) would only produce an unhelpful TypeError.
            raise IOError('unseekable file')
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines up to the line that terminates them.

        The (normally blank) terminating line is consumed but not stored.
        If a non-header line ends the headers (which is an error), an
        attempt is made to seek back to the start of that line; it is
        never stored.

        Sets self.headers to the list of raw header lines and
        self.status to '' on success or to an error message:
        'EOF in headers', 'No headers' or 'Bad header', the latter two
        followed by '; bad seek' if the offending line could not be
        pushed back.
        """
        # Imported locally so the class does not depend on the module's
        # import block.
        import re

        # A field name is one or more printable ASCII characters other
        # than space and ':', immediately followed by ':'.
        isheader = re.compile(r'[!-9;-~]+:').match

        self.headers = headers = []
        self.status = ''
        headerseen = False
        cantell = True
        while True:
            # Remember where this line starts so that a bad line can be
            # un-read with an absolute seek; text streams reject the
            # relative seek(-len(line), 1).
            linestart = None
            if cantell:
                try:
                    linestart = self.fp.tell()
                except IOError:
                    cantell = False  # don't keep asking an unseekable file
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            if self.islast(line):
                break
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                headers.append(line)
            elif isheader(line):
                # It's a header line.
                headers.append(line)
                headerseen = True
            else:
                # It's not a header line; stop here.
                self.status = 'Bad header' if headerseen else 'No headers'
                # Try to undo the read.
                undone = False
                if linestart is not None:
                    try:
                        self.fp.seek(linestart)
                        undone = True
                    except IOError:
                        pass
                if not undone:
                    self.status = self.status + '; bad seek'
                break

    def islast(self, line):
        """Return true if *line* is a legal end of RFC-822 headers.

        You may override this method if your application wants to bend
        the rules, e.g. to accept lines ending in '\\r\\n', to strip
        trailing whitespace, or to recognise MH template separators
        ('--------').
        """
        return line == '\n'

    def getallmatchingheaders(self, name):
        """Return every header line for *name*, with continuation lines.

        The lines are returned as a list, without interpretation.  If
        the header does not occur the list is empty; if it occurs
        several times all occurrences are returned.  Case is not
        important in the header name.
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        matches = []
        hit = False
        for line in self.headers:
            if line[:n].lower() == prefix:
                hit = True
            elif line[:1] not in string.whitespace:
                # A different header starts here.
                hit = False
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Return the lines of the first header for *name* only.

        Like getallmatchingheaders(), but stops at the end of the first
        occurrence (its continuation lines are included).
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        matches = []
        hit = False
        for line in self.headers:
            if hit:
                # Once inside the first occurrence, any line that is not
                # a continuation ends it -- including a second
                # occurrence of the same header.
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == prefix:
                hit = True
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """Return the literal text of the first header for *name*.

        The field name and its colon are removed; all leading, trailing
        and embedded whitespace (including newlines) is kept.  Returns
        None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop the field name and the ':' from the first line.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name):
        """Like getrawheader(), with leading/trailing whitespace stripped.

        Returns None if the header does not occur.
        """
        text = self.getrawheader(name)
        if text is None:
            return None
        return text.strip()

    # XXX The next step would be to define self.getaddr(name)
    # and self.getaddrlist(name) which would parse a header
    # consisting of a single mail address and a number of mail
    # addresses, respectively.  Lower level functions would be
    # parseaddr(string) and parseaddrlist(string).

    # XXX Similar, there would be a function self.getdate(name) to
    # return a date in canonical form (perhaps a number compatible
    # to time.time()) and a function parsedate(string).

    # XXX The inverses of the parse functions may also be useful.
194
195
196
197
# Utility functions
# -----------------
200
201
# Remove quotes from a string.

def unquote(str):
    """Return *str* with one pair of enclosing quotes removed.

    If *str* begins and ends with double quotes they are stripped and
    the backslash escapes for a backslash and a double quote inside
    the quoted string are undone.  If it begins and ends with angle
    brackets, those are stripped and the contents are left untouched.
    Any other string, including one shorter than two characters, is
    returned unchanged.
    """
    if len(str) > 1:
        if str[0] == '"' and str[-1:] == '"':
            # Undo the escaped backslash before the escaped quote.
            return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if str[0] == '<' and str[-1:] == '>':
            return str[1:-1]
    return str