blob: 695530a2b00ed0cf4c200c8415a2baf49be45a21 [file] [log] [blame]
# A parser for SGML, using the derived class as a static DTD.
2
3# XXX This only supports those SGML features used by HTML.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import regex
12import string
13
14
15# Regular expressions used for parsing
16
Guido van Rossum1dba24e1995-03-04 22:28:49 +000017incomplete = regex.compile(
18 '<!-?\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*\|</?\|' +
Guido van Rossum7c750e11995-02-27 13:16:55 +000019 '&#[a-zA-Z0-9]*\|&[a-zA-Z][a-zA-Z0-9]*\|&')
20entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]')
21charref = regex.compile('&#[a-zA-Z0-9]+;')
22starttagopen = regex.compile('<[a-zA-Z]')
23endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>')
Guido van Rossum145b2e01995-08-04 04:22:39 +000024special = regex.compile('<![^<>]*>')
Guido van Rossum7c750e11995-02-27 13:16:55 +000025commentopen = regex.compile('<!--')
Guido van Rossum145b2e01995-08-04 04:22:39 +000026commentclose = regex.compile('--[ \t\n]*>')
Guido van Rossum7c750e11995-02-27 13:16:55 +000027
28
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
39
40class SGMLParser:
41
42 # Interface -- initialize and reset this instance
43 def __init__(self):
44 self.reset()
45
46 # Interface -- reset this instance. Loses all unprocessed data
47 def reset(self):
48 self.rawdata = ''
49 self.stack = []
50 self.nomoretags = 0
51 self.literal = 0
52
53 # For derived classes only -- enter literal mode (CDATA) till EOF
54 def setnomoretags(self):
55 self.nomoretags = self.literal = 1
56
57 # For derived classes only -- enter literal mode (CDATA)
58 def setliteral(self, *args):
59 self.literal = 1
60
61 # Interface -- feed some data to the parser. Call this as
62 # often as you want, with as little or as much text as you
63 # want (may include '\n'). (This just saves the text, all the
Guido van Rossum1dba24e1995-03-04 22:28:49 +000064 # processing is done by goahead().)
Guido van Rossum7c750e11995-02-27 13:16:55 +000065 def feed(self, data):
66 self.rawdata = self.rawdata + data
67 self.goahead(0)
68
69 # Interface -- handle the remaining data
70 def close(self):
71 self.goahead(1)
72
73 # Internal -- handle data as far as reasonable. May leave state
74 # and data to be processed by a subsequent call. If 'end' is
75 # true, force handling all data as if followed by EOF marker.
76 def goahead(self, end):
77 rawdata = self.rawdata
78 i = 0
79 n = len(rawdata)
80 while i < n:
81 if self.nomoretags:
82 self.handle_data(rawdata[i:n])
83 i = n
84 break
85 j = incomplete.search(rawdata, i)
86 if j < 0: j = n
87 if i < j: self.handle_data(rawdata[i:j])
88 i = j
89 if i == n: break
90 if rawdata[i] == '<':
91 if starttagopen.match(rawdata, i) >= 0:
92 if self.literal:
93 self.handle_data(rawdata[i])
94 i = i+1
95 continue
96 k = self.parse_starttag(i)
97 if k < 0: break
98 i = i + k
99 continue
100 k = endtag.match(rawdata, i)
101 if k >= 0:
102 j = i+k
103 self.parse_endtag(rawdata[i:j])
104 i = j
105 self.literal = 0
106 continue
107 if commentopen.match(rawdata, i) >= 0:
108 if self.literal:
109 self.handle_data(rawdata[i])
110 i = i+1
111 continue
112 k = self.parse_comment(i)
113 if k < 0: break
114 i = i+k
115 continue
Guido van Rossum145b2e01995-08-04 04:22:39 +0000116 k = special.match(rawdata, i)
117 if k >= 0:
118 if self.literal:
119 self.handle_data(rawdata[i])
120 i = i+1
121 continue
122 i = i+k
123 continue
Guido van Rossum7c750e11995-02-27 13:16:55 +0000124 elif rawdata[i] == '&':
125 k = charref.match(rawdata, i)
126 if k >= 0:
127 j = i+k
128 self.handle_charref(rawdata[i+2:j-1])
129 i = j
130 continue
131 k = entityref.match(rawdata, i)
132 if k >= 0:
133 j = i+k
134 self.handle_entityref(rawdata[i+1:j-1])
135 i = j
136 continue
137 else:
138 raise RuntimeError, 'neither < nor & ??'
139 # We get here only if incomplete matches but
140 # nothing else
141 k = incomplete.match(rawdata, i)
142 if k < 0: raise RuntimeError, 'no incomplete match ??'
143 j = i+k
144 if j == n: break # Really incomplete
145 self.handle_data(rawdata[i:j])
146 i = j
147 # end while
148 if end and i < n:
149 self.handle_data(rawdata[i:n])
150 i = n
151 self.rawdata = rawdata[i:]
152 # XXX if end: check for empty stack
153
Guido van Rossum145b2e01995-08-04 04:22:39 +0000154 # Internal -- parse comment, return length or -1 if not terminated
Guido van Rossum7c750e11995-02-27 13:16:55 +0000155 def parse_comment(self, i):
156 rawdata = self.rawdata
157 if rawdata[i:i+4] <> '<!--':
158 raise RuntimeError, 'unexpected call to handle_comment'
Guido van Rossum145b2e01995-08-04 04:22:39 +0000159 j = commentclose.search(rawdata, i+4)
160 if j < 0:
Guido van Rossum7c750e11995-02-27 13:16:55 +0000161 return -1
162 self.handle_comment(rawdata[i+4: j])
Guido van Rossum145b2e01995-08-04 04:22:39 +0000163 j = j+commentclose.match(rawdata, j)
Guido van Rossum7c750e11995-02-27 13:16:55 +0000164 return j-i
165
166 # Internal -- handle starttag, return length or -1 if not terminated
167 def parse_starttag(self, i):
168 rawdata = self.rawdata
169 try:
170 j = string.index(rawdata, '>', i)
171 except string.index_error:
172 return -1
173 # Now parse the data between i+1 and j into a tag and attrs
174 attrs = []
175 tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*')
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000176 attrfind = regex.compile(
177 '[ \t\n]+\([a-zA-Z][a-zA-Z0-9]*\)' +
178 '\([ \t\n]*=[ \t\n]*' +
Guido van Rossum7c750e11995-02-27 13:16:55 +0000179 '\(\'[^\']*\';\|"[^"]*"\|[-a-zA-Z0-9./:+*%?!()_#]+\)\)?')
180 k = tagfind.match(rawdata, i+1)
181 if k < 0:
182 raise RuntimeError, 'unexpected call to parse_starttag'
183 k = i+1+k
184 tag = string.lower(rawdata[i+1:k])
185 while k < j:
186 l = attrfind.match(rawdata, k)
187 if l < 0: break
188 regs = attrfind.regs
189 a1, b1 = regs[1]
190 a2, b2 = regs[2]
191 a3, b3 = regs[3]
192 attrname = rawdata[a1:b1]
193 if '=' in rawdata[k:k+l]:
194 attrvalue = rawdata[a3:b3]
195 if attrvalue[:1] == '\'' == attrvalue[-1:] or \
196 attrvalue[:1] == '"' == attrvalue[-1:]:
197 attrvalue = attrvalue[1:-1]
198 else:
199 attrvalue = ''
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000200 attrs.append((string.lower(attrname), attrvalue))
Guido van Rossum7c750e11995-02-27 13:16:55 +0000201 k = k + l
202 j = j+1
203 try:
204 method = getattr(self, 'start_' + tag)
205 except AttributeError:
206 try:
207 method = getattr(self, 'do_' + tag)
208 except AttributeError:
209 self.unknown_starttag(tag, attrs)
210 return j-i
211 method(attrs)
212 return j-i
213 self.stack.append(tag)
214 method(attrs)
215 return j-i
216
217 # Internal -- parse endtag
218 def parse_endtag(self, data):
219 if data[:2] <> '</' or data[-1:] <> '>':
220 raise RuntimeError, 'unexpected call to parse_endtag'
221 tag = string.lower(string.strip(data[2:-1]))
222 try:
223 method = getattr(self, 'end_' + tag)
224 except AttributeError:
225 self.unknown_endtag(tag)
226 return
227 if self.stack and self.stack[-1] == tag:
228 del self.stack[-1]
229 else:
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000230 self.report_unbalanced(tag)
231 # Now repair it
Guido van Rossum7c750e11995-02-27 13:16:55 +0000232 found = None
233 for i in range(len(self.stack)):
234 if self.stack[i] == tag: found = i
235 if found <> None:
236 del self.stack[found:]
237 method()
238
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000239 # Example -- report an unbalanced </...> tag.
240 def report_unbalanced(self, tag):
241 print '*** Unbalanced </' + tag + '>'
242 print '*** Stack:', self.stack
243
Guido van Rossum7c750e11995-02-27 13:16:55 +0000244 # Example -- handle character reference, no need to override
245 def handle_charref(self, name):
246 try:
247 n = string.atoi(name)
248 except string.atoi_error:
249 self.unknown_charref(name)
250 return
251 if not 0 <= n <= 255:
252 self.unknown_charref(name)
253 return
254 self.handle_data(chr(n))
255
256 # Definition of entities -- derived classes may override
257 entitydefs = \
258 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
259
260 # Example -- handle entity reference, no need to override
261 def handle_entityref(self, name):
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000262 table = self.entitydefs
Guido van Rossum7c750e11995-02-27 13:16:55 +0000263 name = string.lower(name)
264 if table.has_key(name):
265 self.handle_data(table[name])
266 else:
267 self.unknown_entityref(name)
268 return
269
270 # Example -- handle data, should be overridden
271 def handle_data(self, data):
272 pass
273
274 # Example -- handle comment, could be overridden
275 def handle_comment(self, data):
276 pass
277
278 # To be overridden -- handlers for unknown objects
279 def unknown_starttag(self, tag, attrs): pass
280 def unknown_endtag(self, tag): pass
281 def unknown_charref(self, ref): pass
282 def unknown_entityref(self, ref): pass
283
284
285class TestSGML(SGMLParser):
286
287 def handle_data(self, data):
288 r = repr(data)
289 if len(r) > 72:
290 r = r[:35] + '...' + r[-35:]
291 print 'data:', r
292
293 def handle_comment(self, data):
294 r = repr(data)
295 if len(r) > 68:
296 r = r[:32] + '...' + r[-32:]
297 print 'comment:', r
298
299 def unknown_starttag(self, tag, attrs):
300 print 'start tag: <' + tag,
301 for name, value in attrs:
302 print name + '=' + '"' + value + '"',
303 print '>'
304
305 def unknown_endtag(self, tag):
306 print 'end tag: </' + tag + '>'
307
308 def unknown_entityref(self, ref):
309 print '*** unknown entity ref: &' + ref + ';'
310
311 def unknown_charref(self, ref):
312 print '*** unknown char ref: &#' + ref + ';'
313
314
# Test driver -- feed a file to a TestSGML parser one line at a time
# and print what the parser reports.  'filename' names the file to
# read; it defaults to 'test.html'.  The file is closed again even
# if the parser raises an exception.
def test(filename = 'test.html'):
    f = open(filename, 'r')
    try:
        x = TestSGML()
        while 1:
            line = f.readline()
            if not line:
                break
            x.feed(line)
        # End of file: let the parser handle what is left over
        x.close()
    finally:
        f.close()
325
326
# Run the test driver when this file is executed as a script
if __name__ == '__main__':
    test()