blob: 0169ea31d513f7f97405efb87be45febc5183132 [file] [log] [blame]
# A parser for SGML, using the derived class as static DTD.

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
11import regex
12import string
13
14
15# Regular expressions used for parsing
16
# Text that may still grow into a tag or a reference once more data
# arrives.  goahead() searches for it to locate the next '<' or '&'.
incomplete = regex.compile(
    '<!-?'                                  # '<!' or '<!-'
    + '\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*'    # end tag without its '>'
    + '\|</?'                               # '<' or '</'
    + '\|&#[a-zA-Z0-9]*'                    # character reference without ';'
    + '\|&[a-zA-Z][a-zA-Z0-9]*'             # entity reference without ';'
    + '\|&')                                # a single '&'

# Complete references: '&name' closed by ';' or '.', and '&#...;'.
entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]')
charref = regex.compile('&#[a-zA-Z0-9]+;')

# Tags: the first two characters of a start tag, and a whole end tag.
starttagopen = regex.compile('<[a-zA-Z]')
endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>')

# Markup declarations ('<!' up to the next '>') and comment delimiters.
special = regex.compile('<![^<>]*>')
commentopen = regex.compile('<!--')
commentclose = regex.compile('--[ \t\n]*>')
Guido van Rossum7c750e11995-02-27 13:16:55 +000027
28
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
39
40class SGMLParser:
41
42 # Interface -- initialize and reset this instance
43 def __init__(self):
44 self.reset()
45
46 # Interface -- reset this instance. Loses all unprocessed data
47 def reset(self):
48 self.rawdata = ''
49 self.stack = []
50 self.nomoretags = 0
51 self.literal = 0
52
53 # For derived classes only -- enter literal mode (CDATA) till EOF
54 def setnomoretags(self):
55 self.nomoretags = self.literal = 1
56
57 # For derived classes only -- enter literal mode (CDATA)
58 def setliteral(self, *args):
59 self.literal = 1
60
61 # Interface -- feed some data to the parser. Call this as
62 # often as you want, with as little or as much text as you
63 # want (may include '\n'). (This just saves the text, all the
Guido van Rossum1dba24e1995-03-04 22:28:49 +000064 # processing is done by goahead().)
Guido van Rossum7c750e11995-02-27 13:16:55 +000065 def feed(self, data):
66 self.rawdata = self.rawdata + data
67 self.goahead(0)
68
69 # Interface -- handle the remaining data
70 def close(self):
71 self.goahead(1)
72
73 # Internal -- handle data as far as reasonable. May leave state
74 # and data to be processed by a subsequent call. If 'end' is
75 # true, force handling all data as if followed by EOF marker.
76 def goahead(self, end):
77 rawdata = self.rawdata
78 i = 0
79 n = len(rawdata)
80 while i < n:
81 if self.nomoretags:
82 self.handle_data(rawdata[i:n])
83 i = n
84 break
85 j = incomplete.search(rawdata, i)
86 if j < 0: j = n
87 if i < j: self.handle_data(rawdata[i:j])
88 i = j
89 if i == n: break
90 if rawdata[i] == '<':
91 if starttagopen.match(rawdata, i) >= 0:
92 if self.literal:
93 self.handle_data(rawdata[i])
94 i = i+1
95 continue
96 k = self.parse_starttag(i)
97 if k < 0: break
98 i = i + k
99 continue
100 k = endtag.match(rawdata, i)
101 if k >= 0:
102 j = i+k
103 self.parse_endtag(rawdata[i:j])
104 i = j
105 self.literal = 0
106 continue
107 if commentopen.match(rawdata, i) >= 0:
108 if self.literal:
109 self.handle_data(rawdata[i])
110 i = i+1
111 continue
112 k = self.parse_comment(i)
113 if k < 0: break
114 i = i+k
115 continue
Guido van Rossum145b2e01995-08-04 04:22:39 +0000116 k = special.match(rawdata, i)
117 if k >= 0:
118 if self.literal:
119 self.handle_data(rawdata[i])
120 i = i+1
121 continue
122 i = i+k
123 continue
Guido van Rossum7c750e11995-02-27 13:16:55 +0000124 elif rawdata[i] == '&':
125 k = charref.match(rawdata, i)
126 if k >= 0:
127 j = i+k
128 self.handle_charref(rawdata[i+2:j-1])
129 i = j
130 continue
131 k = entityref.match(rawdata, i)
132 if k >= 0:
133 j = i+k
134 self.handle_entityref(rawdata[i+1:j-1])
135 i = j
136 continue
137 else:
138 raise RuntimeError, 'neither < nor & ??'
139 # We get here only if incomplete matches but
140 # nothing else
141 k = incomplete.match(rawdata, i)
142 if k < 0: raise RuntimeError, 'no incomplete match ??'
143 j = i+k
144 if j == n: break # Really incomplete
145 self.handle_data(rawdata[i:j])
146 i = j
147 # end while
148 if end and i < n:
149 self.handle_data(rawdata[i:n])
150 i = n
151 self.rawdata = rawdata[i:]
152 # XXX if end: check for empty stack
153
Guido van Rossum145b2e01995-08-04 04:22:39 +0000154 # Internal -- parse comment, return length or -1 if not terminated
Guido van Rossum7c750e11995-02-27 13:16:55 +0000155 def parse_comment(self, i):
156 rawdata = self.rawdata
157 if rawdata[i:i+4] <> '<!--':
158 raise RuntimeError, 'unexpected call to handle_comment'
Guido van Rossum145b2e01995-08-04 04:22:39 +0000159 j = commentclose.search(rawdata, i+4)
160 if j < 0:
Guido van Rossum7c750e11995-02-27 13:16:55 +0000161 return -1
162 self.handle_comment(rawdata[i+4: j])
Guido van Rossum145b2e01995-08-04 04:22:39 +0000163 j = j+commentclose.match(rawdata, j)
Guido van Rossum7c750e11995-02-27 13:16:55 +0000164 return j-i
165
166 # Internal -- handle starttag, return length or -1 if not terminated
167 def parse_starttag(self, i):
168 rawdata = self.rawdata
169 try:
170 j = string.index(rawdata, '>', i)
171 except string.index_error:
172 return -1
173 # Now parse the data between i+1 and j into a tag and attrs
174 attrs = []
175 tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*')
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000176 attrfind = regex.compile(
177 '[ \t\n]+\([a-zA-Z][a-zA-Z0-9]*\)' +
178 '\([ \t\n]*=[ \t\n]*' +
Guido van Rossum7c750e11995-02-27 13:16:55 +0000179 '\(\'[^\']*\';\|"[^"]*"\|[-a-zA-Z0-9./:+*%?!()_#]+\)\)?')
180 k = tagfind.match(rawdata, i+1)
181 if k < 0:
182 raise RuntimeError, 'unexpected call to parse_starttag'
183 k = i+1+k
184 tag = string.lower(rawdata[i+1:k])
185 while k < j:
186 l = attrfind.match(rawdata, k)
187 if l < 0: break
Guido van Rossumcf9e27c1995-09-01 20:34:29 +0000188 attrname, rest, attrvalue = attrfind.group(1, 2, 3)
189 if not rest:
190 attrvalue = attrname
191 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
192 attrvalue[:1] == '"' == attrvalue[-1:]:
193 attrvalue = attrvalue[1:-1]
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000194 attrs.append((string.lower(attrname), attrvalue))
Guido van Rossum7c750e11995-02-27 13:16:55 +0000195 k = k + l
196 j = j+1
197 try:
198 method = getattr(self, 'start_' + tag)
199 except AttributeError:
200 try:
201 method = getattr(self, 'do_' + tag)
202 except AttributeError:
203 self.unknown_starttag(tag, attrs)
204 return j-i
205 method(attrs)
206 return j-i
207 self.stack.append(tag)
208 method(attrs)
209 return j-i
210
211 # Internal -- parse endtag
212 def parse_endtag(self, data):
213 if data[:2] <> '</' or data[-1:] <> '>':
214 raise RuntimeError, 'unexpected call to parse_endtag'
215 tag = string.lower(string.strip(data[2:-1]))
216 try:
217 method = getattr(self, 'end_' + tag)
218 except AttributeError:
219 self.unknown_endtag(tag)
220 return
Guido van Rossumcf9e27c1995-09-01 20:34:29 +0000221 # XXX Should invoke end methods when popping their
222 # XXX stack entry, not when encountering the tag!
Guido van Rossum7c750e11995-02-27 13:16:55 +0000223 if self.stack and self.stack[-1] == tag:
224 del self.stack[-1]
225 else:
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000226 self.report_unbalanced(tag)
227 # Now repair it
Guido van Rossum7c750e11995-02-27 13:16:55 +0000228 found = None
229 for i in range(len(self.stack)):
230 if self.stack[i] == tag: found = i
231 if found <> None:
232 del self.stack[found:]
233 method()
234
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000235 # Example -- report an unbalanced </...> tag.
236 def report_unbalanced(self, tag):
237 print '*** Unbalanced </' + tag + '>'
238 print '*** Stack:', self.stack
239
Guido van Rossum7c750e11995-02-27 13:16:55 +0000240 # Example -- handle character reference, no need to override
241 def handle_charref(self, name):
242 try:
243 n = string.atoi(name)
244 except string.atoi_error:
245 self.unknown_charref(name)
246 return
247 if not 0 <= n <= 255:
248 self.unknown_charref(name)
249 return
250 self.handle_data(chr(n))
251
252 # Definition of entities -- derived classes may override
253 entitydefs = \
254 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
255
256 # Example -- handle entity reference, no need to override
257 def handle_entityref(self, name):
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000258 table = self.entitydefs
Guido van Rossum7c750e11995-02-27 13:16:55 +0000259 name = string.lower(name)
260 if table.has_key(name):
261 self.handle_data(table[name])
262 else:
263 self.unknown_entityref(name)
264 return
265
266 # Example -- handle data, should be overridden
267 def handle_data(self, data):
268 pass
269
270 # Example -- handle comment, could be overridden
271 def handle_comment(self, data):
272 pass
273
274 # To be overridden -- handlers for unknown objects
275 def unknown_starttag(self, tag, attrs): pass
276 def unknown_endtag(self, tag): pass
277 def unknown_charref(self, ref): pass
278 def unknown_entityref(self, ref): pass
279
280
281class TestSGML(SGMLParser):
282
283 def handle_data(self, data):
284 r = repr(data)
285 if len(r) > 72:
286 r = r[:35] + '...' + r[-35:]
287 print 'data:', r
288
289 def handle_comment(self, data):
290 r = repr(data)
291 if len(r) > 68:
292 r = r[:32] + '...' + r[-32:]
293 print 'comment:', r
294
295 def unknown_starttag(self, tag, attrs):
296 print 'start tag: <' + tag,
297 for name, value in attrs:
298 print name + '=' + '"' + value + '"',
299 print '>'
300
301 def unknown_endtag(self, tag):
302 print 'end tag: </' + tag + '>'
303
304 def unknown_entityref(self, ref):
305 print '*** unknown entity ref: &' + ref + ';'
306
307 def unknown_charref(self, ref):
308 print '*** unknown char ref: &#' + ref + ';'
309
310
# Test program -- feed the file 'filename' (default 'test.html') line
# by line to a TestSGML parser, which prints what it finds.  The file
# is closed again even when the parser raises an exception.
def test(filename = 'test.html'):
    f = open(filename, 'r')
    try:
        x = TestSGML()
        while 1:
            line = f.readline()
            if not line:
                break
            x.feed(line)
        # End of file: let the parser flush what is left
        x.close()
    finally:
        f.close()
321
322
# Run the test program when this file is executed as a script
if __name__ == '__main__':
    test()