blob: 3eed7a8fd7b7145f64abdcc725d9107dd936ead5 [file] [log] [blame]
# A parser for SGML, using the derived class as static DTD.

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
11import regex
12import string
13
14
15# Regular expressions used for parsing
16
# All patterns use the old 'regex' module syntax, where \| is
# alternation and \( \) delimit groups.

# Any prefix of a markup construct: '<!' or '<!-', the start of an end
# tag, a bare '<' or '</', the start of a character or entity
# reference, or a bare '&'.  goahead() searches for this to find where
# plain data stops.
incomplete = regex.compile(
    '<!-?\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*\|</?\|' +
    '&#[a-zA-Z0-9]*\|&[a-zA-Z][a-zA-Z0-9]*\|&')
# A complete entity reference; either ';' or '.' is accepted as the
# terminating character.
entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]')
# A complete character reference, '&#' ... ';'.
charref = regex.compile('&#[a-zA-Z0-9]+;')
# The first two characters of a start tag.
starttagopen = regex.compile('<[a-zA-Z]')
# A complete end tag, with optional whitespace before the '>'.
endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>')
# A complete '<!...>' construct containing no '<' or '>' inside.
special = regex.compile('<![^<>]*>')
# Comment delimiters; whitespace is allowed between '--' and '>'.
commentopen = regex.compile('<!--')
commentclose = regex.compile('--[ \t\n]*>')
Guido van Rossum7c750e11995-02-27 13:16:55 +000027
28
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
39
40class SGMLParser:
41
42 # Interface -- initialize and reset this instance
43 def __init__(self):
44 self.reset()
45
46 # Interface -- reset this instance. Loses all unprocessed data
47 def reset(self):
48 self.rawdata = ''
49 self.stack = []
50 self.nomoretags = 0
51 self.literal = 0
52
53 # For derived classes only -- enter literal mode (CDATA) till EOF
54 def setnomoretags(self):
55 self.nomoretags = self.literal = 1
56
57 # For derived classes only -- enter literal mode (CDATA)
58 def setliteral(self, *args):
59 self.literal = 1
60
61 # Interface -- feed some data to the parser. Call this as
62 # often as you want, with as little or as much text as you
63 # want (may include '\n'). (This just saves the text, all the
Guido van Rossum1dba24e1995-03-04 22:28:49 +000064 # processing is done by goahead().)
Guido van Rossum7c750e11995-02-27 13:16:55 +000065 def feed(self, data):
66 self.rawdata = self.rawdata + data
67 self.goahead(0)
68
69 # Interface -- handle the remaining data
70 def close(self):
71 self.goahead(1)
72
73 # Internal -- handle data as far as reasonable. May leave state
74 # and data to be processed by a subsequent call. If 'end' is
75 # true, force handling all data as if followed by EOF marker.
76 def goahead(self, end):
77 rawdata = self.rawdata
78 i = 0
79 n = len(rawdata)
80 while i < n:
81 if self.nomoretags:
82 self.handle_data(rawdata[i:n])
83 i = n
84 break
85 j = incomplete.search(rawdata, i)
86 if j < 0: j = n
87 if i < j: self.handle_data(rawdata[i:j])
88 i = j
89 if i == n: break
90 if rawdata[i] == '<':
91 if starttagopen.match(rawdata, i) >= 0:
92 if self.literal:
93 self.handle_data(rawdata[i])
94 i = i+1
95 continue
96 k = self.parse_starttag(i)
97 if k < 0: break
98 i = i + k
99 continue
100 k = endtag.match(rawdata, i)
101 if k >= 0:
102 j = i+k
103 self.parse_endtag(rawdata[i:j])
104 i = j
105 self.literal = 0
106 continue
107 if commentopen.match(rawdata, i) >= 0:
108 if self.literal:
109 self.handle_data(rawdata[i])
110 i = i+1
111 continue
112 k = self.parse_comment(i)
113 if k < 0: break
114 i = i+k
115 continue
Guido van Rossum145b2e01995-08-04 04:22:39 +0000116 k = special.match(rawdata, i)
117 if k >= 0:
118 if self.literal:
119 self.handle_data(rawdata[i])
120 i = i+1
121 continue
122 i = i+k
123 continue
Guido van Rossum7c750e11995-02-27 13:16:55 +0000124 elif rawdata[i] == '&':
125 k = charref.match(rawdata, i)
126 if k >= 0:
127 j = i+k
128 self.handle_charref(rawdata[i+2:j-1])
129 i = j
130 continue
131 k = entityref.match(rawdata, i)
132 if k >= 0:
133 j = i+k
134 self.handle_entityref(rawdata[i+1:j-1])
135 i = j
136 continue
137 else:
138 raise RuntimeError, 'neither < nor & ??'
139 # We get here only if incomplete matches but
140 # nothing else
141 k = incomplete.match(rawdata, i)
142 if k < 0: raise RuntimeError, 'no incomplete match ??'
143 j = i+k
144 if j == n: break # Really incomplete
145 self.handle_data(rawdata[i:j])
146 i = j
147 # end while
148 if end and i < n:
149 self.handle_data(rawdata[i:n])
150 i = n
151 self.rawdata = rawdata[i:]
152 # XXX if end: check for empty stack
153
Guido van Rossum145b2e01995-08-04 04:22:39 +0000154 # Internal -- parse comment, return length or -1 if not terminated
Guido van Rossum7c750e11995-02-27 13:16:55 +0000155 def parse_comment(self, i):
156 rawdata = self.rawdata
157 if rawdata[i:i+4] <> '<!--':
158 raise RuntimeError, 'unexpected call to handle_comment'
Guido van Rossum145b2e01995-08-04 04:22:39 +0000159 j = commentclose.search(rawdata, i+4)
160 if j < 0:
Guido van Rossum7c750e11995-02-27 13:16:55 +0000161 return -1
162 self.handle_comment(rawdata[i+4: j])
Guido van Rossum145b2e01995-08-04 04:22:39 +0000163 j = j+commentclose.match(rawdata, j)
Guido van Rossum7c750e11995-02-27 13:16:55 +0000164 return j-i
165
166 # Internal -- handle starttag, return length or -1 if not terminated
167 def parse_starttag(self, i):
168 rawdata = self.rawdata
169 try:
170 j = string.index(rawdata, '>', i)
171 except string.index_error:
172 return -1
173 # Now parse the data between i+1 and j into a tag and attrs
174 attrs = []
175 tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*')
Guido van Rossumeae892d1995-08-10 19:43:53 +0000176 # XXX Should also support value-less attributes (e.g. ISMAP)
177 # XXX Should use regex.group()
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000178 attrfind = regex.compile(
179 '[ \t\n]+\([a-zA-Z][a-zA-Z0-9]*\)' +
180 '\([ \t\n]*=[ \t\n]*' +
Guido van Rossum7c750e11995-02-27 13:16:55 +0000181 '\(\'[^\']*\';\|"[^"]*"\|[-a-zA-Z0-9./:+*%?!()_#]+\)\)?')
182 k = tagfind.match(rawdata, i+1)
183 if k < 0:
184 raise RuntimeError, 'unexpected call to parse_starttag'
185 k = i+1+k
186 tag = string.lower(rawdata[i+1:k])
187 while k < j:
188 l = attrfind.match(rawdata, k)
189 if l < 0: break
190 regs = attrfind.regs
191 a1, b1 = regs[1]
192 a2, b2 = regs[2]
193 a3, b3 = regs[3]
194 attrname = rawdata[a1:b1]
195 if '=' in rawdata[k:k+l]:
196 attrvalue = rawdata[a3:b3]
197 if attrvalue[:1] == '\'' == attrvalue[-1:] or \
198 attrvalue[:1] == '"' == attrvalue[-1:]:
199 attrvalue = attrvalue[1:-1]
200 else:
201 attrvalue = ''
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000202 attrs.append((string.lower(attrname), attrvalue))
Guido van Rossum7c750e11995-02-27 13:16:55 +0000203 k = k + l
204 j = j+1
205 try:
206 method = getattr(self, 'start_' + tag)
207 except AttributeError:
208 try:
209 method = getattr(self, 'do_' + tag)
210 except AttributeError:
211 self.unknown_starttag(tag, attrs)
212 return j-i
213 method(attrs)
214 return j-i
215 self.stack.append(tag)
216 method(attrs)
217 return j-i
218
219 # Internal -- parse endtag
220 def parse_endtag(self, data):
221 if data[:2] <> '</' or data[-1:] <> '>':
222 raise RuntimeError, 'unexpected call to parse_endtag'
223 tag = string.lower(string.strip(data[2:-1]))
224 try:
225 method = getattr(self, 'end_' + tag)
226 except AttributeError:
227 self.unknown_endtag(tag)
228 return
229 if self.stack and self.stack[-1] == tag:
230 del self.stack[-1]
231 else:
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000232 self.report_unbalanced(tag)
233 # Now repair it
Guido van Rossum7c750e11995-02-27 13:16:55 +0000234 found = None
235 for i in range(len(self.stack)):
236 if self.stack[i] == tag: found = i
237 if found <> None:
238 del self.stack[found:]
239 method()
240
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000241 # Example -- report an unbalanced </...> tag.
242 def report_unbalanced(self, tag):
243 print '*** Unbalanced </' + tag + '>'
244 print '*** Stack:', self.stack
245
Guido van Rossum7c750e11995-02-27 13:16:55 +0000246 # Example -- handle character reference, no need to override
247 def handle_charref(self, name):
248 try:
249 n = string.atoi(name)
250 except string.atoi_error:
251 self.unknown_charref(name)
252 return
253 if not 0 <= n <= 255:
254 self.unknown_charref(name)
255 return
256 self.handle_data(chr(n))
257
258 # Definition of entities -- derived classes may override
259 entitydefs = \
260 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
261
262 # Example -- handle entity reference, no need to override
263 def handle_entityref(self, name):
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000264 table = self.entitydefs
Guido van Rossum7c750e11995-02-27 13:16:55 +0000265 name = string.lower(name)
266 if table.has_key(name):
267 self.handle_data(table[name])
268 else:
269 self.unknown_entityref(name)
270 return
271
272 # Example -- handle data, should be overridden
273 def handle_data(self, data):
274 pass
275
276 # Example -- handle comment, could be overridden
277 def handle_comment(self, data):
278 pass
279
280 # To be overridden -- handlers for unknown objects
281 def unknown_starttag(self, tag, attrs): pass
282 def unknown_endtag(self, tag): pass
283 def unknown_charref(self, ref): pass
284 def unknown_entityref(self, ref): pass
285
286
287class TestSGML(SGMLParser):
288
289 def handle_data(self, data):
290 r = repr(data)
291 if len(r) > 72:
292 r = r[:35] + '...' + r[-35:]
293 print 'data:', r
294
295 def handle_comment(self, data):
296 r = repr(data)
297 if len(r) > 68:
298 r = r[:32] + '...' + r[-32:]
299 print 'comment:', r
300
301 def unknown_starttag(self, tag, attrs):
302 print 'start tag: <' + tag,
303 for name, value in attrs:
304 print name + '=' + '"' + value + '"',
305 print '>'
306
307 def unknown_endtag(self, tag):
308 print 'end tag: </' + tag + '>'
309
310 def unknown_entityref(self, ref):
311 print '*** unknown entity ref: &' + ref + ';'
312
313 def unknown_charref(self, ref):
314 print '*** unknown char ref: &#' + ref + ';'
315
316
# Test driver -- feed a file (by default 'test.html') to a TestSGML
# parser one line at a time, and close the parser at end of file.
def test(file = 'test.html'):
    f = open(file, 'r')
    try:
        x = TestSGML()
        while 1:
            line = f.readline()
            if not line:
                x.close()
                break
            x.feed(line)
    finally:
        # Release the file even when a handler raises
        f.close()


if __name__ == '__main__':
    test()