blob: b46f82985a810514913eeefe8de2b7626b02e2d5 [file] [log] [blame]
Guido van Rossum7c750e11995-02-27 13:16:55 +00001# A parser for SGML, using the derived class as static DTD.
2
3# XXX This only supports those SGML features used by HTML.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import regex
12import string
13
14
15# Regular expressions used for parsing
16
# NOTE: these use the syntax of the old `regex' module, in which
# \| is alternation and \( ... \) is grouping.

# Anything that might be the start of markup: every alternative
# begins with '<' or '&'.  goahead() searches for this to find where
# plain data stops, and matches it again to measure a construct that
# none of the more specific patterns below accepted.
incomplete = regex.compile(
    '<!-?\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*\|</?\|' +
    '&#[a-zA-Z0-9]*\|&[a-zA-Z][a-zA-Z0-9]*\|&')
# Entity reference: '&' + name, terminated by ';' or '.'
entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]')
# Character reference: '&#' + letters/digits, terminated by ';'
charref = regex.compile('&#[a-zA-Z0-9]+;')
# First two characters of a start tag: '<' followed by a letter
starttagopen = regex.compile('<[a-zA-Z]')
# Complete end tag: '</' + name + optional whitespace + '>'
endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>')
# Complete '<!...>' construct containing no nested '<' or '>'
special = regex.compile('<![^<>]*>')
# Start of a comment
commentopen = regex.compile('<!--')
# End of a comment: '--', optional whitespace, '>'
commentclose = regex.compile('--[ \t\n]*>')
Guido van Rossum7c750e11995-02-27 13:16:55 +000027
28
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
39
40class SGMLParser:
41
42 # Interface -- initialize and reset this instance
Guido van Rossum3c0bfd01995-09-22 00:54:32 +000043 def __init__(self, verbose=0):
44 self.verbose = verbose
Guido van Rossum7c750e11995-02-27 13:16:55 +000045 self.reset()
46
47 # Interface -- reset this instance. Loses all unprocessed data
48 def reset(self):
49 self.rawdata = ''
50 self.stack = []
51 self.nomoretags = 0
52 self.literal = 0
53
54 # For derived classes only -- enter literal mode (CDATA) till EOF
55 def setnomoretags(self):
56 self.nomoretags = self.literal = 1
57
58 # For derived classes only -- enter literal mode (CDATA)
59 def setliteral(self, *args):
60 self.literal = 1
61
62 # Interface -- feed some data to the parser. Call this as
63 # often as you want, with as little or as much text as you
64 # want (may include '\n'). (This just saves the text, all the
Guido van Rossum1dba24e1995-03-04 22:28:49 +000065 # processing is done by goahead().)
Guido van Rossum7c750e11995-02-27 13:16:55 +000066 def feed(self, data):
67 self.rawdata = self.rawdata + data
68 self.goahead(0)
69
70 # Interface -- handle the remaining data
71 def close(self):
72 self.goahead(1)
73
74 # Internal -- handle data as far as reasonable. May leave state
75 # and data to be processed by a subsequent call. If 'end' is
76 # true, force handling all data as if followed by EOF marker.
77 def goahead(self, end):
78 rawdata = self.rawdata
79 i = 0
80 n = len(rawdata)
81 while i < n:
82 if self.nomoretags:
83 self.handle_data(rawdata[i:n])
84 i = n
85 break
86 j = incomplete.search(rawdata, i)
87 if j < 0: j = n
88 if i < j: self.handle_data(rawdata[i:j])
89 i = j
90 if i == n: break
91 if rawdata[i] == '<':
92 if starttagopen.match(rawdata, i) >= 0:
93 if self.literal:
94 self.handle_data(rawdata[i])
95 i = i+1
96 continue
97 k = self.parse_starttag(i)
98 if k < 0: break
99 i = i + k
100 continue
101 k = endtag.match(rawdata, i)
102 if k >= 0:
103 j = i+k
104 self.parse_endtag(rawdata[i:j])
105 i = j
106 self.literal = 0
107 continue
108 if commentopen.match(rawdata, i) >= 0:
109 if self.literal:
110 self.handle_data(rawdata[i])
111 i = i+1
112 continue
113 k = self.parse_comment(i)
114 if k < 0: break
115 i = i+k
116 continue
Guido van Rossum145b2e01995-08-04 04:22:39 +0000117 k = special.match(rawdata, i)
118 if k >= 0:
119 if self.literal:
120 self.handle_data(rawdata[i])
121 i = i+1
122 continue
123 i = i+k
124 continue
Guido van Rossum7c750e11995-02-27 13:16:55 +0000125 elif rawdata[i] == '&':
126 k = charref.match(rawdata, i)
127 if k >= 0:
128 j = i+k
129 self.handle_charref(rawdata[i+2:j-1])
130 i = j
131 continue
132 k = entityref.match(rawdata, i)
133 if k >= 0:
134 j = i+k
135 self.handle_entityref(rawdata[i+1:j-1])
136 i = j
137 continue
138 else:
139 raise RuntimeError, 'neither < nor & ??'
140 # We get here only if incomplete matches but
141 # nothing else
142 k = incomplete.match(rawdata, i)
143 if k < 0: raise RuntimeError, 'no incomplete match ??'
144 j = i+k
Guido van Rossum3c0bfd01995-09-22 00:54:32 +0000145 if j == n or rawdata[i:i+2] == '<!':
146 break # Really incomplete
Guido van Rossum7c750e11995-02-27 13:16:55 +0000147 self.handle_data(rawdata[i:j])
148 i = j
149 # end while
150 if end and i < n:
151 self.handle_data(rawdata[i:n])
152 i = n
153 self.rawdata = rawdata[i:]
154 # XXX if end: check for empty stack
155
Guido van Rossum145b2e01995-08-04 04:22:39 +0000156 # Internal -- parse comment, return length or -1 if not terminated
Guido van Rossum7c750e11995-02-27 13:16:55 +0000157 def parse_comment(self, i):
158 rawdata = self.rawdata
159 if rawdata[i:i+4] <> '<!--':
160 raise RuntimeError, 'unexpected call to handle_comment'
Guido van Rossum145b2e01995-08-04 04:22:39 +0000161 j = commentclose.search(rawdata, i+4)
162 if j < 0:
Guido van Rossum7c750e11995-02-27 13:16:55 +0000163 return -1
164 self.handle_comment(rawdata[i+4: j])
Guido van Rossum145b2e01995-08-04 04:22:39 +0000165 j = j+commentclose.match(rawdata, j)
Guido van Rossum7c750e11995-02-27 13:16:55 +0000166 return j-i
167
168 # Internal -- handle starttag, return length or -1 if not terminated
169 def parse_starttag(self, i):
170 rawdata = self.rawdata
171 try:
172 j = string.index(rawdata, '>', i)
173 except string.index_error:
174 return -1
175 # Now parse the data between i+1 and j into a tag and attrs
176 attrs = []
177 tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*')
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000178 attrfind = regex.compile(
Guido van Rossume3d93201995-09-30 16:49:36 +0000179 '[ \t\n]+\([a-zA-Z_][a-zA-Z_0-9]*\)' +
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000180 '\([ \t\n]*=[ \t\n]*' +
Guido van Rossum650ba371995-10-06 15:30:28 +0000181 '\(\'[^\']*\'\|"[^"]*"\|[-a-zA-Z0-9./:+*%?!()_#]*\)\)?')
Guido van Rossum7c750e11995-02-27 13:16:55 +0000182 k = tagfind.match(rawdata, i+1)
183 if k < 0:
184 raise RuntimeError, 'unexpected call to parse_starttag'
185 k = i+1+k
186 tag = string.lower(rawdata[i+1:k])
187 while k < j:
188 l = attrfind.match(rawdata, k)
189 if l < 0: break
Guido van Rossumcf9e27c1995-09-01 20:34:29 +0000190 attrname, rest, attrvalue = attrfind.group(1, 2, 3)
191 if not rest:
192 attrvalue = attrname
193 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
194 attrvalue[:1] == '"' == attrvalue[-1:]:
195 attrvalue = attrvalue[1:-1]
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000196 attrs.append((string.lower(attrname), attrvalue))
Guido van Rossum7c750e11995-02-27 13:16:55 +0000197 k = k + l
198 j = j+1
199 try:
200 method = getattr(self, 'start_' + tag)
201 except AttributeError:
202 try:
203 method = getattr(self, 'do_' + tag)
204 except AttributeError:
205 self.unknown_starttag(tag, attrs)
206 return j-i
207 method(attrs)
208 return j-i
209 self.stack.append(tag)
210 method(attrs)
211 return j-i
212
213 # Internal -- parse endtag
214 def parse_endtag(self, data):
215 if data[:2] <> '</' or data[-1:] <> '>':
216 raise RuntimeError, 'unexpected call to parse_endtag'
217 tag = string.lower(string.strip(data[2:-1]))
218 try:
219 method = getattr(self, 'end_' + tag)
220 except AttributeError:
221 self.unknown_endtag(tag)
222 return
Guido van Rossumcf9e27c1995-09-01 20:34:29 +0000223 # XXX Should invoke end methods when popping their
224 # XXX stack entry, not when encountering the tag!
Guido van Rossum7c750e11995-02-27 13:16:55 +0000225 if self.stack and self.stack[-1] == tag:
226 del self.stack[-1]
227 else:
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000228 self.report_unbalanced(tag)
229 # Now repair it
Guido van Rossum7c750e11995-02-27 13:16:55 +0000230 found = None
231 for i in range(len(self.stack)):
232 if self.stack[i] == tag: found = i
233 if found <> None:
234 del self.stack[found:]
235 method()
236
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000237 # Example -- report an unbalanced </...> tag.
238 def report_unbalanced(self, tag):
Guido van Rossum3c0bfd01995-09-22 00:54:32 +0000239 if self.verbose:
240 print '*** Unbalanced </' + tag + '>'
241 print '*** Stack:', self.stack
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000242
Guido van Rossum7c750e11995-02-27 13:16:55 +0000243 # Example -- handle character reference, no need to override
244 def handle_charref(self, name):
245 try:
246 n = string.atoi(name)
247 except string.atoi_error:
248 self.unknown_charref(name)
249 return
250 if not 0 <= n <= 255:
251 self.unknown_charref(name)
252 return
253 self.handle_data(chr(n))
254
255 # Definition of entities -- derived classes may override
256 entitydefs = \
257 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
258
259 # Example -- handle entity reference, no need to override
260 def handle_entityref(self, name):
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000261 table = self.entitydefs
Guido van Rossum7c750e11995-02-27 13:16:55 +0000262 if table.has_key(name):
263 self.handle_data(table[name])
264 else:
265 self.unknown_entityref(name)
266 return
267
268 # Example -- handle data, should be overridden
269 def handle_data(self, data):
270 pass
271
272 # Example -- handle comment, could be overridden
273 def handle_comment(self, data):
274 pass
275
276 # To be overridden -- handlers for unknown objects
277 def unknown_starttag(self, tag, attrs): pass
278 def unknown_endtag(self, tag): pass
279 def unknown_charref(self, ref): pass
280 def unknown_entityref(self, ref): pass
281
282
283class TestSGML(SGMLParser):
284
285 def handle_data(self, data):
286 r = repr(data)
287 if len(r) > 72:
288 r = r[:35] + '...' + r[-35:]
289 print 'data:', r
290
291 def handle_comment(self, data):
292 r = repr(data)
293 if len(r) > 68:
294 r = r[:32] + '...' + r[-32:]
295 print 'comment:', r
296
297 def unknown_starttag(self, tag, attrs):
298 print 'start tag: <' + tag,
299 for name, value in attrs:
300 print name + '=' + '"' + value + '"',
301 print '>'
302
303 def unknown_endtag(self, tag):
304 print 'end tag: </' + tag + '>'
305
306 def unknown_entityref(self, ref):
307 print '*** unknown entity ref: &' + ref + ';'
308
309 def unknown_charref(self, ref):
310 print '*** unknown char ref: &#' + ref + ';'
311
312
# Test program -- feed the named file (default 'test.html') line by
# line to a TestSGML parser, which prints everything it recognizes.
# The file is closed again even when the parser raises an exception
def test(filename = 'test.html'):
    f = open(filename, 'r')
    try:
        x = TestSGML()
        while 1:
            line = f.readline()
            if not line:
                break
            x.feed(line)
        # End of file: flush whatever the parser is still holding
        x.close()
    finally:
        f.close()
323
324
# Run the test program when this file is executed as a script
# (and not when it is imported as a module)
if __name__ == '__main__':
    test()