\section{Standard Module \sectcode{xmllib}}
% Author: Sjoerd Mullender
\label{module-xmllib}
\stmodindex{xmllib}
\index{XML}
6
This module defines a class \code{XMLParser} which serves as the basis
for parsing text files formatted in XML (Extensible Markup Language).
9
10The \code{XMLParser} class must be instantiated without arguments. It
11has the following interface methods:
12
Fred Drake0add4c11997-12-12 05:32:31 +000013\renewcommand{\indexsubitem}{(XMLParser method)}
Guido van Rossuma10768a1997-11-18 15:11:22 +000014
15\begin{funcdesc}{reset}{}
Reset the instance. Loses all unprocessed data. This is called
implicitly at instantiation time.
18\end{funcdesc}
19
20\begin{funcdesc}{setnomoretags}{}
21Stop processing tags. Treat all following input as literal input
22(CDATA).
23\end{funcdesc}
24
25\begin{funcdesc}{setliteral}{}
26Enter literal mode (CDATA mode).
27\end{funcdesc}
28
29\begin{funcdesc}{feed}{data}
30Feed some text to the parser. It is processed insofar as it consists
31of complete elements; incomplete data is buffered until more data is
32fed or \code{close()} is called.
33\end{funcdesc}
34
35\begin{funcdesc}{close}{}
36Force processing of all buffered data as if it were followed by an
37end-of-file mark. This method may be redefined by a derived class to
38define additional processing at the end of the input, but the
39redefined version should always call \code{XMLParser.close()}.
40\end{funcdesc}
41
\begin{funcdesc}{translate_references}{data}
Translate all entity and character references in \code{data} and
return the translated string.
45\end{funcdesc}
46
47\begin{funcdesc}{handle_xml}{encoding\, standalone}
48This method is called when the \code{<?xml ...?>} tag is processed.
49The arguments are the values of the encoding and standalone attributes
50in the tag. Both encoding and standalone are optional. The values
51passed to \code{handle_xml} default to \code{None} and the string
52\code{'no'} respectively.
53\end{funcdesc}
54
55\begin{funcdesc}{handle_doctype}{tag\, data}
56This method is called when the \code{<!DOCTYPE...>} tag is processed.
57The arguments are the name of the root element and the uninterpreted
58contents of the tag, starting after the white space after the name of
59the root element.
60\end{funcdesc}
61
Guido van Rossuma10768a1997-11-18 15:11:22 +000062\begin{funcdesc}{handle_starttag}{tag\, method\, attributes}
63This method is called to handle start tags for which a
64\code{start_\var{tag}()} method has been defined. The \code{tag}
65argument is the name of the tag, and the \code{method} argument is the
66bound method which should be used to support semantic interpretation
67of the start tag. The \var{attributes} argument is a dictionary of
68attributes, the key being the \var{name} and the value being the
69\var{value} of the attribute found inside the tag's \code{<>} brackets.
Guido van Rossum02505e41998-01-29 14:55:24 +000070Character and entity references in the \var{value} have
Guido van Rossuma10768a1997-11-18 15:11:22 +000071been interpreted. For instance, for the tag
72\code{<A HREF="http://www.cwi.nl/">}, this method would be called as
Fred Drakeb0744c51997-12-29 19:59:38 +000073\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}.
Guido van Rossuma10768a1997-11-18 15:11:22 +000074The base implementation simply calls \code{method} with \code{attributes}
75as the only argument.
76\end{funcdesc}
77
78\begin{funcdesc}{handle_endtag}{tag\, method}
This method is called to handle end tags for which an
80\code{end_\var{tag}()} method has been defined. The \code{tag}
81argument is the name of the tag, and the
82\code{method} argument is the bound method which should be used to
83support semantic interpretation of the end tag. If no
84\code{end_\var{tag}()} method is defined for the closing element, this
85handler is not called. The base implementation simply calls
86\code{method}.
87\end{funcdesc}
88
89\begin{funcdesc}{handle_data}{data}
90This method is called to process arbitrary data. It is intended to be
91overridden by a derived class; the base class implementation does
92nothing.
93\end{funcdesc}
94
95\begin{funcdesc}{handle_charref}{ref}
96This method is called to process a character reference of the form
97``\code{\&\#\var{ref};}''. \var{ref} can either be a decimal number,
98or a hexadecimal number when preceded by \code{x}.
In the base implementation, \var{ref} must be a number in the
range 0--255. It translates the character to \ASCII{} and calls the
101method \code{handle_data()} with the character as argument. If
102\var{ref} is invalid or out of range, the method
103\code{unknown_charref(\var{ref})} is called to handle the error. A
104subclass must override this method to provide support for character
105references outside of the \ASCII{} range.
106\end{funcdesc}
107
108\begin{funcdesc}{handle_entityref}{ref}
This method is called to process a general entity reference of the form
``\code{\&\var{ref};}'' where \var{ref} is a general entity
111reference. It looks for \var{ref} in the instance (or class)
112variable \code{entitydefs} which should be a mapping from entity names
113to corresponding translations.
114If a translation is found, it calls the method \code{handle_data()}
115with the translation; otherwise, it calls the method
\code{unknown_entityref(\var{ref})}. The default \code{entitydefs}
defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
\code{\&lt;}, and \code{\&quot;}.
119\end{funcdesc}
120
121\begin{funcdesc}{handle_comment}{comment}
122This method is called when a comment is encountered. The
123\code{comment} argument is a string containing the text between the
124``\code{<!--}'' and ``\code{-->}'' delimiters, but not the delimiters
125themselves. For example, the comment ``\code{<!--text-->}'' will
126cause this method to be called with the argument \code{'text'}. The
127default method does nothing.
128\end{funcdesc}
129
130\begin{funcdesc}{handle_cdata}{data}
This method is called when a CDATA section is encountered. The
\code{data} argument is a string containing the text between the
``\code{<![CDATA[}'' and ``\code{]]>}'' delimiters, but not the delimiters
themselves. For example, the CDATA section ``\code{<![CDATA[text]]>}'' will
cause this method to be called with the argument \code{'text'}. The
default method does nothing.
137\end{funcdesc}
138
139\begin{funcdesc}{handle_proc}{name\, data}
140This method is called when a processing instruction (PI) is encountered. The
141\code{name} is the PI target, and the \code{data} argument is a
142string containing the text between the PI target and the closing delimiter,
143but not the delimiter itself. For example, the instruction
144``\code{<?XML text?>}'' will cause this method to be called with the
145arguments \code{'XML'} and \code{'text'}. The default method does
Fred Drake8aad4c81998-02-03 23:12:13 +0000146nothing. Note that if a document starts with a \code{<?xml ...?>}
Guido van Rossum02505e41998-01-29 14:55:24 +0000147tag, \code{handle_xml} is called to handle it.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000148\end{funcdesc}
149
150\begin{funcdesc}{handle_special}{data}
151This method is called when a declaration is encountered. The
152\code{data} argument is a string containing the text between the
153``\code{<!}'' and ``\code{>}'' delimiters, but not the delimiters
themselves. For example, the declaration ``\code{<!ENTITY text>}'' will
155cause this method to be called with the argument \code{'ENTITY text'}. The
156default method does nothing. Note that \code{<!DOCTYPE ...>} is
157handled separately if it is located at the start of the document.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000158\end{funcdesc}
159
Guido van Rossum02505e41998-01-29 14:55:24 +0000160\begin{funcdesc}{syntax_error}{message}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000161This method is called when a syntax error is encountered. The
Guido van Rossuma10768a1997-11-18 15:11:22 +0000162\code{message} is a description of what was wrong. The default method
raises a \code{RuntimeError} exception. If this method is overridden,
it is permissible for it to return. This method is only called when
Guido van Rossum02505e41998-01-29 14:55:24 +0000165the error can be recovered from. Unrecoverable errors raise a
166\code{RuntimeError} without first calling \code{syntax_error}.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000167\end{funcdesc}
168
169\begin{funcdesc}{unknown_starttag}{tag\, attributes}
170This method is called to process an unknown start tag. It is intended
171to be overridden by a derived class; the base class implementation
172does nothing.
173\end{funcdesc}
174
175\begin{funcdesc}{unknown_endtag}{tag}
176This method is called to process an unknown end tag. It is intended
177to be overridden by a derived class; the base class implementation
178does nothing.
179\end{funcdesc}
180
181\begin{funcdesc}{unknown_charref}{ref}
182This method is called to process unresolvable numeric character
183references. It is intended to be overridden by a derived class; the
184base class implementation does nothing.
185\end{funcdesc}
186
187\begin{funcdesc}{unknown_entityref}{ref}
188This method is called to process an unknown entity reference. It is
189intended to be overridden by a derived class; the base class
190implementation does nothing.
191\end{funcdesc}
192
193Apart from overriding or extending the methods listed above, derived
Guido van Rossum02505e41998-01-29 14:55:24 +0000194classes may also define methods and variables of the following form to
define processing of specific tags. Tag names in the input stream are
case-sensitive; the \var{tag} occurring in method names must be in the
correct case:
198
199\begin{funcdesc}{start_\var{tag}}{attributes}
200This method is called to process an opening tag \var{tag}. The
201\var{attributes} argument has the same meaning as described for
Guido van Rossum02505e41998-01-29 14:55:24 +0000202\code{handle_starttag()} above. In fact, the base implementation of
203\code{handle_starttag} calls this method.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000204\end{funcdesc}
205
206\begin{funcdesc}{end_\var{tag}}{}
207This method is called to process a closing tag \var{tag}.
208\end{funcdesc}
Guido van Rossum02505e41998-01-29 14:55:24 +0000209
210\begin{datadesc}{\var{tag}_attributes}
211If a class or instance variable \code{\var{tag}_attributes} exists, it
212should be a list or a dictionary. If a list, the elements of the list
213are the valid attributes for the element \var{tag}; if a dictionary,
214the keys are the valid attributes for the element \var{tag}, and the
215values the default values of the attributes, or \code{None} if there
216is no default.
217In addition to the attributes that were present in the tag, the
218attribute dictionary that is passed to \code{handle_starttag} and
219\code{unknown_starttag} contains values for all attributes that have a
220default value.
221\end{datadesc}