blob: 29e26c2f157cef4015f9554bc375dade49266139 [file] [log] [blame]
Guido van Rossuma12ef941995-02-27 17:53:25 +00001\section{Built-in module \sectcode{sgmllib}}
2\stmodindex{sgmllib}
Guido van Rossum86751151995-02-28 17:14:32 +00003\index{SGML}
4
5\renewcommand{\indexsubitem}{(in module sgmllib)}
6
7This module defines a class \code{SGMLParser} which serves as the
8basis for parsing text files formatted in SGML (Standard Generalized
9Mark-up Language). In fact, it does not provide a full SGML parser
10--- it only parses SGML insofar as it is used by HTML, and the module only
11exists as a basis for the \code{htmllib} module.
12\stmodindex{htmllib}
13
14In particular, the parser is hardcoded to recognize the following
15elements:
16
17\begin{itemize}
18
19\item
20Opening and closing tags of the form
21``\code{<\var{tag} \var{attr}="\var{value}" ...>}'' and
22``\code{</\var{tag}>}'', respectively.
23
24\item
25Character references of the form ``\code{\&\#\var{name};}''.
26
27\item
28Entity references of the form ``\code{\&\var{name};}''.
29
30\item
31SGML comments of the form ``\code{<!--\var{text}-->}''.
32
33\end{itemize}
34
35The \code{SGMLParser} class must be instantiated without arguments.
36It has the following interface methods:
37
38\begin{funcdesc}{reset}{}
39Reset the instance. Loses all unprocessed data. This is called
40implicitly at instantiation time.
41\end{funcdesc}
42
43\begin{funcdesc}{setnomoretags}{}
44Stop processing tags. Treat all following input as literal input
45(CDATA). (This is only provided so the HTML tag \code{<PLAINTEXT>}
46can be implemented.)
47\end{funcdesc}
48
49\begin{funcdesc}{setliteral}{}
50Enter literal mode (CDATA mode).
51\end{funcdesc}
52
53\begin{funcdesc}{feed}{data}
54Feed some text to the parser. It is processed insofar as it consists
55of complete elements; incomplete data is buffered until more data is
56fed or \code{close()} is called.
57\end{funcdesc}
58
59\begin{funcdesc}{close}{}
60Force processing of all buffered data as if it were followed by an
61end-of-file mark. This method may be redefined by a derived class to
62define additional processing at the end of the input, but the
63redefined version should always call \code{SGMLParser.close()}.
64\end{funcdesc}
65
66\begin{funcdesc}{handle_charref}{ref}
67This method is called to process a character reference of the form
68``\code{\&\#\var{ref};}'' where \var{ref} is a decimal number in the
69range 0--255. It translates the character to ASCII and calls the
70method \code{handle_data()} with the character as argument. If
71\var{ref} is invalid or out of range, the method
72\code{unknown_charref(\var{ref})} is called instead.
73\end{funcdesc}
74
75\begin{funcdesc}{handle_entityref}{ref}
76This method is called to process an entity reference of the form
77``\code{\&\var{ref};}'' where \var{ref} is an alphabetic entity
78reference. It looks for \var{ref} in the instance (or class)
79variable \code{entitydefs} which should give the entity's translation.
80If a translation is found, it calls the method \code{handle_data()}
81with the translation; otherwise, it calls the method
82\code{unknown_entityref(\var{ref})}.
83\end{funcdesc}
84
85\begin{funcdesc}{handle_data}{data}
86This method is called to process arbitrary data. It is intended to be
87overridden by a derived class; the base class implementation does
88nothing.
89\end{funcdesc}
90
91\begin{funcdesc}{unknown_starttag}{tag\, attributes}
92This method is called to process an unknown start tag. It is intended
93to be overridden by a derived class; the base class implementation
94does nothing. The \var{attributes} argument is a list of
95(\var{name}, \var{value}) pairs containing the attributes found inside
96the tag's \code{<>} brackets. The \var{name} has been translated to
97lower case and double quotes and backslashes in the \var{value} have
98been interpreted. For instance, for the tag
99\code{<A HREF="http://www.cwi.nl/">}, this method would be
100called as \code{unknown_starttag('a', [('href', 'http://www.cwi.nl/')])}.
101\end{funcdesc}
102
103\begin{funcdesc}{unknown_endtag}{tag}
104This method is called to process an unknown end tag. It is intended
105to be overridden by a derived class; the base class implementation
106does nothing.
107\end{funcdesc}
108
109\begin{funcdesc}{unknown_charref}{ref}
110This method is called to process an unknown character reference. It
111is intended to be overridden by a derived class; the base class
112implementation does nothing.
113\end{funcdesc}
114
115\begin{funcdesc}{unknown_entityref}{ref}
116This method is called to process an unknown entity reference. It is
117intended to be overridden by a derived class; the base class
118implementation does nothing.
119\end{funcdesc}
120
121Apart from overriding or extending the methods listed above, derived
122classes may also define methods of the following form to define
123processing of specific tags. Tag names in the input stream are case
124independent; the \var{tag} occurring in method names must be in lower
125case:
126
127\begin{funcdesc}{start_\var{tag}}{attributes}
128This method is called to process an opening tag \var{tag}. It takes
129precedence over \code{do_\var{tag}()}. The \var{attributes} argument
130has the same meaning as described for \code{unknown_starttag()} above.
131\end{funcdesc}
132
133\begin{funcdesc}{do_\var{tag}}{attributes}
134This method is called to process an opening tag \var{tag} that does
135not come with a matching closing tag. The \var{attributes} argument
136has the same meaning as described for \code{unknown_starttag()} above.
137\end{funcdesc}
138
139\begin{funcdesc}{end_\var{tag}}{}
140This method is called to process a closing tag \var{tag}.
141\end{funcdesc}
142
143Note that the parser maintains a stack of opening tags for which no
144matching closing tag has been found yet. Only tags processed by
145\code{start_\var{tag}()} are pushed on this stack. Definition of an
146\code{end_\var{tag}()} method is optional for these tags. For tags
147processed by \code{do_\var{tag}()} or by \code{unknown_starttag()}, no
148\code{end_\var{tag}()} method must be defined.