blob: 39ab6a608c4b8c745d016995383675d6459f6b19 [file] [log] [blame]
# RFC-822 message manipulation class.
#
# XXX This is only a very rough sketch of a full RFC-822 parser;
# additional methods are needed to parse addresses and dates, and to
# tokenize lines according to various other syntax rules.
#
# Directions for use:
#
# To create a Message object: first open a file, e.g.:
#     fp = open(file, 'r')
# (or use any other legal way of getting an open file object, e.g. use
# sys.stdin or call os.popen()).
# Then pass the open file object to the Message() constructor:
#     m = Message(fp)
#
# To get the text of a particular header there are several methods:
#     str = m.getheader(name)
#     str = m.getrawheader(name)
# where name is the name of the header, e.g. 'Subject'.
# The difference is that getheader() strips the leading and trailing
# whitespace, while getrawheader() doesn't.  Both functions retain
# embedded whitespace (including newlines) exactly as they are
# specified in the header, and leave the case of the text unchanged.
#
# See the class definition for lower level access methods.
#
# There are also some utility functions here.
28
29
30import regex
31import string
32
33
class Message:
    """Header lines of an RFC-822 message read from an open file.

    Instance attributes:

    fp              the file object passed to the constructor.
    headers         list of the raw header lines, uninterpreted, so
                    printing them reproduces the header exactly as it
                    appears in the file.
    status          '' if the headers were read without problems,
                    otherwise an error message.
    startofheaders  value of fp.tell() before the headers were read,
                    or None if fp.tell() raised IOError.
    startofbody     value of fp.tell() after the headers were read,
                    or None if fp.tell() raised IOError.
    """

    def __init__(self, fp):
        """Initialize the instance and read the headers from fp."""
        self.fp = fp
        self.startofheaders = self._tell()
        self.readheaders()
        self.startofbody = self._tell()

    def _tell(self):
        """Return self.fp.tell(), or None if that raises IOError."""
        try:
            return self.fp.tell()
        except IOError:
            return None

    def rewindbody(self):
        """Rewind the file to the start of the body.

        Raises IOError if the start of the body is unknown because
        fp.tell() failed when the message was read.
        """
        if self.startofbody is None:
            # Passing None to seek() would fail with an unrelated
            # error; report the real cause instead.
            raise IOError('unseekable file')
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines up to the blank line that terminates them.

        The (normally blank) line that ends the headers is skipped,
        but not included in self.headers.  If a non-header line ends
        the headers (which is an error), an attempt is made to
        backspace over it; it is never included in self.headers.

        Sets self.headers to the list of raw header lines and
        self.status to '' if all went well, otherwise to an error
        message.
        """
        self.headers = lines = []
        self.status = ''
        headerseen = False
        while True:
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            if self.islast(line):
                break
            elif headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
            elif self._isheaderline(line):
                # It's a header line.
                lines.append(line)
                headerseen = True
            else:
                # It's not a header line; stop here.
                if not headerseen:
                    self.status = 'No headers'
                else:
                    self.status = 'Bad header'
                # Try to undo the read.
                try:
                    self.fp.seek(-len(line), 1)
                except IOError:
                    self.status = self.status + '; bad seek'
                break

    def _isheaderline(self, line):
        """Return True if line starts with a header field name and ':'.

        A field name is one or more characters in the range '!'..'~'
        other than ':' (the same set as the pattern '^[!-9;-~]+:').
        This is checked with plain string operations: the old
        regex.match() returns -1 (a true value) when nothing matches,
        so testing its result for truth accepted every line.
        """
        colon = line.find(':')
        if colon <= 0:
            # No colon at all, or an empty field name.
            return False
        # Everything before the first colon is colon-free by
        # construction, so only the printable range needs checking.
        return all('!' <= ch <= '~' for ch in line[:colon])

    def islast(self, line):
        """Return true if line is a legal end of RFC-822 headers.

        You may override this method if your application wants to
        bend the rules, e.g. to accept lines ending in '\\r\\n', to
        strip trailing whitespace, or to recognise MH template
        separators ('--------').
        """
        return line == '\n'

    def getallmatchingheaders(self, name):
        """Return all header lines matching name, with continuations.

        The lines are returned as a list, without interpretation.  If
        the header does not occur, an empty list is returned.  If the
        header occurs multiple times, all occurrences are returned.
        Case is not important in the header name.
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        matched = []
        hit = False
        for line in self.headers:
            if line[:n].lower() == prefix:
                hit = True
            elif line[:1] not in string.whitespace:
                # A different header starts here.
                hit = False
            if hit:
                matched.append(line)
        return matched

    def getfirstmatchingheader(self, name):
        """Return the first header matching name, with continuations.

        Like getallmatchingheaders(), but only the first occurrence
        of the header (and its continuation lines) is returned.
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        matched = []
        hit = False
        for line in self.headers:
            if hit:
                # Already inside the first occurrence: any line that
                # is not a continuation ends it, even when it is the
                # same header name again.
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == prefix:
                hit = True
            if hit:
                matched.append(line)
        return matched

    def getrawheader(self, name):
        """Return the literal text of the first header matching name.

        The keyword and its colon are stripped; all leading, trailing
        and embedded whitespace is kept.  Return None if the header
        does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop "name:" from the first line; getfirstmatchingheader()
        # returns a fresh list, so self.headers is not modified.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name):
        """Like getrawheader(), but strip leading/trailing whitespace.

        Return None if the header does not occur.
        """
        text = self.getrawheader(name)
        if text is None:
            return None
        return text.strip()

    # XXX The next step would be to define self.getaddr(name)
    # and self.getaddrlist(name) which would parse a header
    # consisting of a single mail address and a number of mail
    # addresses, respectively.  Lower level functions would be
    # parseaddr(string) and parseaddrlist(string).

    # XXX Similar, there would be a function self.getdate(name) to
    # return a date in canonical form (perhaps a number compatible
    # to time.time()) and a function parsedate(string).

    # XXX The inverses of the parse functions may also be useful.
192
193
194
195
# Utility functions
# -----------------


# Remove quotes from a string.
# XXX Should fix this to be really conformant.
202
def unquote(str):
    """Remove one pair of enclosing quotes from a string.

    If str is longer than one character and is wrapped in double
    quotes ("...") or angle brackets (<...>), return it without the
    first and last character; otherwise return str unchanged.

    XXX Should fix this to be really conformant.
    """
    if len(str) <= 1:
        # Too short to hold both an opening and a closing delimiter.
        return str
    first, last = str[0], str[-1:]
    for opener, closer in (('"', '"'), ('<', '>')):
        if first == opener and last == closer:
            return str[1:-1]
    return str