blob: db4d7509ee5fb221b73f9512d6dd149a85442193 [file] [log] [blame]
Guido van Rossuma10768a1997-11-18 15:11:22 +00001\section{Standard Module \sectcode{xmllib}}
2% Author: Sjoerd Mullender
3\label{module-xmllib}
4\stmodindex{xmllib}
5\index{XML}
6
7This module defines a class \code{XMLParser} which serves as the basis
for parsing text files formatted in XML (Extensible Markup Language).
9
10The \code{XMLParser} class must be instantiated without arguments. It
11has the following interface methods:
12
Fred Drake0add4c11997-12-12 05:32:31 +000013\renewcommand{\indexsubitem}{(XMLParser method)}
Guido van Rossuma10768a1997-11-18 15:11:22 +000014
15\begin{funcdesc}{reset}{}
16Reset the instance. Loses all unprocessed data. This is called
implicitly at instantiation time.
18\end{funcdesc}
19
20\begin{funcdesc}{setnomoretags}{}
21Stop processing tags. Treat all following input as literal input
22(CDATA).
23\end{funcdesc}
24
25\begin{funcdesc}{setliteral}{}
26Enter literal mode (CDATA mode).
27\end{funcdesc}
28
29\begin{funcdesc}{feed}{data}
30Feed some text to the parser. It is processed insofar as it consists
31of complete elements; incomplete data is buffered until more data is
32fed or \code{close()} is called.
33\end{funcdesc}
34
35\begin{funcdesc}{close}{}
36Force processing of all buffered data as if it were followed by an
37end-of-file mark. This method may be redefined by a derived class to
38define additional processing at the end of the input, but the
39redefined version should always call \code{XMLParser.close()}.
40\end{funcdesc}
41
42\begin{funcdesc}{handle_starttag}{tag\, method\, attributes}
43This method is called to handle start tags for which a
44\code{start_\var{tag}()} method has been defined. The \code{tag}
45argument is the name of the tag, and the \code{method} argument is the
46bound method which should be used to support semantic interpretation
47of the start tag. The \var{attributes} argument is a dictionary of
48attributes, the key being the \var{name} and the value being the
49\var{value} of the attribute found inside the tag's \code{<>} brackets.
Names are passed in their original case; double quotes and backslashes
in the \var{value} have been interpreted. For instance, for the tag
52\code{<A HREF="http://www.cwi.nl/">}, this method would be called as
Fred Drakeb0744c51997-12-29 19:59:38 +000053\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}.
Guido van Rossuma10768a1997-11-18 15:11:22 +000054The base implementation simply calls \code{method} with \code{attributes}
55as the only argument.
56\end{funcdesc}
57
58\begin{funcdesc}{handle_endtag}{tag\, method}
This method is called to handle end tags for which an
60\code{end_\var{tag}()} method has been defined. The \code{tag}
61argument is the name of the tag, and the
62\code{method} argument is the bound method which should be used to
63support semantic interpretation of the end tag. If no
64\code{end_\var{tag}()} method is defined for the closing element, this
65handler is not called. The base implementation simply calls
66\code{method}.
67\end{funcdesc}
68
69\begin{funcdesc}{handle_data}{data}
70This method is called to process arbitrary data. It is intended to be
71overridden by a derived class; the base class implementation does
72nothing.
73\end{funcdesc}
74
75\begin{funcdesc}{handle_charref}{ref}
76This method is called to process a character reference of the form
77``\code{\&\#\var{ref};}''. \var{ref} can either be a decimal number,
78or a hexadecimal number when preceded by \code{x}.
79In the base implementation, \var{ref} must be a number in the
range 0--255. It translates the character to \ASCII{} and calls the
81method \code{handle_data()} with the character as argument. If
82\var{ref} is invalid or out of range, the method
83\code{unknown_charref(\var{ref})} is called to handle the error. A
84subclass must override this method to provide support for character
85references outside of the \ASCII{} range.
86\end{funcdesc}
87
88\begin{funcdesc}{handle_entityref}{ref}
89This method is called to process a general entity reference of the form
``\code{\&\var{ref};}'' where \var{ref} is a general entity
91reference. It looks for \var{ref} in the instance (or class)
92variable \code{entitydefs} which should be a mapping from entity names
93to corresponding translations.
94If a translation is found, it calls the method \code{handle_data()}
95with the translation; otherwise, it calls the method
96\code{unknown_entityref(\var{ref})}. The default \code{entitydefs}
defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
98\code{\&lt;}, and \code{\&quot;}.
99\end{funcdesc}
100
101\begin{funcdesc}{handle_comment}{comment}
102This method is called when a comment is encountered. The
103\code{comment} argument is a string containing the text between the
104``\code{<!--}'' and ``\code{-->}'' delimiters, but not the delimiters
105themselves. For example, the comment ``\code{<!--text-->}'' will
106cause this method to be called with the argument \code{'text'}. The
107default method does nothing.
108\end{funcdesc}
109
110\begin{funcdesc}{handle_cdata}{data}
This method is called when a CDATA section is encountered. The
\code{data} argument is a string containing the text between the
``\code{<![CDATA[}'' and ``\code{]]>}'' delimiters, but not the delimiters
themselves. For example, the section ``\code{<![CDATA[text]]>}'' will
115cause this method to be called with the argument \code{'text'}. The
116default method does nothing.
117\end{funcdesc}
118
119\begin{funcdesc}{handle_proc}{name\, data}
120This method is called when a processing instruction (PI) is encountered. The
121\code{name} is the PI target, and the \code{data} argument is a
122string containing the text between the PI target and the closing delimiter,
123but not the delimiter itself. For example, the instruction
124``\code{<?XML text?>}'' will cause this method to be called with the
125arguments \code{'XML'} and \code{'text'}. The default method does
126nothing.
127\end{funcdesc}
128
129\begin{funcdesc}{handle_special}{data}
130This method is called when a declaration is encountered. The
131\code{data} argument is a string containing the text between the
132``\code{<!}'' and ``\code{>}'' delimiters, but not the delimiters
themselves. For example, the declaration ``\code{<!DOCTYPE text>}'' will
134cause this method to be called with the argument \code{'DOCTYPE text'}. The
135default method does nothing.
136\end{funcdesc}
137
138\begin{funcdesc}{syntax_error}{lineno\, message}
139This method is called when a syntax error is encountered. The
140\code{lineno} argument is the line number of the error, and the
141\code{message} is a description of what was wrong. The default method
142raises a \code{RuntimeError} exception. If this method is overridden,
it is permissible for it to return. This method is only called when
144the error can be recovered from.
145\end{funcdesc}
146
147\begin{funcdesc}{unknown_starttag}{tag\, attributes}
148This method is called to process an unknown start tag. It is intended
149to be overridden by a derived class; the base class implementation
150does nothing.
151\end{funcdesc}
152
153\begin{funcdesc}{unknown_endtag}{tag}
154This method is called to process an unknown end tag. It is intended
155to be overridden by a derived class; the base class implementation
156does nothing.
157\end{funcdesc}
158
159\begin{funcdesc}{unknown_charref}{ref}
160This method is called to process unresolvable numeric character
161references. It is intended to be overridden by a derived class; the
162base class implementation does nothing.
163\end{funcdesc}
164
165\begin{funcdesc}{unknown_entityref}{ref}
166This method is called to process an unknown entity reference. It is
167intended to be overridden by a derived class; the base class
168implementation does nothing.
169\end{funcdesc}
170
171Apart from overriding or extending the methods listed above, derived
172classes may also define methods of the following form to define
processing of specific tags. Tag names in the input stream are case
sensitive; the \var{tag} occurring in method names must be in the
175correct case:
176
177\begin{funcdesc}{start_\var{tag}}{attributes}
178This method is called to process an opening tag \var{tag}. The
179\var{attributes} argument has the same meaning as described for
180\code{handle_starttag()} above.
181\end{funcdesc}
182
183\begin{funcdesc}{end_\var{tag}}{}
184This method is called to process a closing tag \var{tag}.
185\end{funcdesc}