blob: efc7f008acf720f36e9a296355ed9dc9e1aed901 [file] [log] [blame]
Guido van Rossuma10768a1997-11-18 15:11:22 +00001\section{Standard Module \sectcode{xmllib}}
2% Author: Sjoerd Mullender
3\label{module-xmllib}
4\stmodindex{xmllib}
5\index{XML}
6
Fred Drake3b5da761998-03-12 15:33:05 +00007This module defines a class \class{XMLParser} which serves as the basis
Guido van Rossuma10768a1997-11-18 15:11:22 +00008for parsing text files formatted in XML (Extensible Markup Language).
9
Fred Drake3b5da761998-03-12 15:33:05 +000010\begin{classdesc}{XMLParser}{}
11The \class{XMLParser} class must be instantiated without arguments.
12\end{classdesc}
13
14This class provides the following interface methods:
Guido van Rossuma10768a1997-11-18 15:11:22 +000015
Fred Drake19479911998-02-13 06:58:54 +000016\setindexsubitem{(XMLParser method)}
Guido van Rossuma10768a1997-11-18 15:11:22 +000017
18\begin{funcdesc}{reset}{}
19Reset the instance. Loses all unprocessed data. This is called
20implicitly at instantiation time.
21\end{funcdesc}
22
23\begin{funcdesc}{setnomoretags}{}
24Stop processing tags. Treat all following input as literal input
25(CDATA).
26\end{funcdesc}
27
28\begin{funcdesc}{setliteral}{}
29Enter literal mode (CDATA mode).
30\end{funcdesc}
31
32\begin{funcdesc}{feed}{data}
33Feed some text to the parser. It is processed insofar as it consists
34of complete elements; incomplete data is buffered until more data is
Fred Drake3b5da761998-03-12 15:33:05 +000035fed or \method{close()} is called.
Guido van Rossuma10768a1997-11-18 15:11:22 +000036\end{funcdesc}
37
38\begin{funcdesc}{close}{}
39Force processing of all buffered data as if it were followed by an
40end-of-file mark. This method may be redefined by a derived class to
41define additional processing at the end of the input, but the
Fred Drake3b5da761998-03-12 15:33:05 +000042redefined version should always call the base class \method{close()}.
Guido van Rossuma10768a1997-11-18 15:11:22 +000043\end{funcdesc}
44
Guido van Rossum02505e41998-01-29 14:55:24 +000045\begin{funcdesc}{translate_references}{data}
Fred Drake3b5da761998-03-12 15:33:05 +000046Translate all entity and character references in \var{data} and
Guido van Rossum02505e41998-01-29 14:55:24 +000047return the translated string.
48\end{funcdesc}
49
Fred Drake3b5da761998-03-12 15:33:05 +000050\begin{funcdesc}{handle_xml}{encoding, standalone}
Guido van Rossum02505e41998-01-29 14:55:24 +000051This method is called when the \code{<?xml ...?>} tag is processed.
52The arguments are the values of the encoding and standalone attributes
53in the tag. Both encoding and standalone are optional. The values
Fred Drake3b5da761998-03-12 15:33:05 +000054passed to \method{handle_xml()} default to \code{None} and the string
Guido van Rossum02505e41998-01-29 14:55:24 +000055\code{'no'} respectively.
56\end{funcdesc}
57
Fred Drake3b5da761998-03-12 15:33:05 +000058\begin{funcdesc}{handle_doctype}{tag, data}
Guido van Rossum02505e41998-01-29 14:55:24 +000059This method is called when the \code{<!DOCTYPE...>} tag is processed.
60The arguments are the name of the root element and the uninterpreted
61contents of the tag, starting after the white space after the name of
62the root element.
63\end{funcdesc}
64
Fred Drake3b5da761998-03-12 15:33:05 +000065\begin{funcdesc}{handle_starttag}{tag, method, attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +000066This method is called to handle start tags for which a
Fred Drake3b5da761998-03-12 15:33:05 +000067\code{start_\var{tag}()} method has been defined. The \var{tag}
68argument is the name of the tag, and the \var{method} argument is the
Guido van Rossuma10768a1997-11-18 15:11:22 +000069bound method which should be used to support semantic interpretation
70of the start tag. The \var{attributes} argument is a dictionary of
71attributes, the key being the \var{name} and the value being the
72\var{value} of the attribute found inside the tag's \code{<>} brackets.
Guido van Rossum02505e41998-01-29 14:55:24 +000073Character and entity references in the \var{value} have
Guido van Rossuma10768a1997-11-18 15:11:22 +000074been interpreted. For instance, for the tag
75\code{<A HREF="http://www.cwi.nl/">}, this method would be called as
Fred Drakeb0744c51997-12-29 19:59:38 +000076\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}.
Fred Drake3b5da761998-03-12 15:33:05 +000077The base implementation simply calls \var{method} with \var{attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +000078as the only argument.
79\end{funcdesc}
80
Fred Drakecce10901998-03-17 06:33:25 +000081\begin{funcdesc}{handle_endtag}{tag, method}
Guido van Rossuma10768a1997-11-18 15:11:22 +000082This method is called to handle end tags for which an
Fred Drake3b5da761998-03-12 15:33:05 +000083\code{end_\var{tag}()} method has been defined. The \var{tag}
Guido van Rossuma10768a1997-11-18 15:11:22 +000084argument is the name of the tag, and the
Fred Drake3b5da761998-03-12 15:33:05 +000085\var{method} argument is the bound method which should be used to
Guido van Rossuma10768a1997-11-18 15:11:22 +000086support semantic interpretation of the end tag. If no
87\code{end_\var{tag}()} method is defined for the closing element, this
88handler is not called. The base implementation simply calls
Fred Drake3b5da761998-03-12 15:33:05 +000089\var{method}.
Guido van Rossuma10768a1997-11-18 15:11:22 +000090\end{funcdesc}
91
92\begin{funcdesc}{handle_data}{data}
93This method is called to process arbitrary data. It is intended to be
94overridden by a derived class; the base class implementation does
95nothing.
96\end{funcdesc}
97
98\begin{funcdesc}{handle_charref}{ref}
99This method is called to process a character reference of the form
Fred Drake7f6e2c41998-02-13 14:38:23 +0000100\samp{\&\#\var{ref};}. \var{ref} can either be a decimal number,
Guido van Rossuma10768a1997-11-18 15:11:22 +0000101or a hexadecimal number when preceded by \code{x}.
102In the base implementation, \var{ref} must be a number in the
103range 0--255. It translates the character to \ASCII{} and calls the
Fred Drake3b5da761998-03-12 15:33:05 +0000104method \method{handle_data()} with the character as argument. If
Guido van Rossuma10768a1997-11-18 15:11:22 +0000105\var{ref} is invalid or out of range, the method
106\code{unknown_charref(\var{ref})} is called to handle the error. A
107subclass must override this method to provide support for character
108references outside of the \ASCII{} range.
109\end{funcdesc}
110
111\begin{funcdesc}{handle_entityref}{ref}
Fred Drake3b5da761998-03-12 15:33:05 +0000112This method is called to process a general entity reference of the
113form \samp{\&\var{ref};} where \var{ref} is a general entity
Guido van Rossuma10768a1997-11-18 15:11:22 +0000114reference. It looks for \var{ref} in the instance (or class)
Fred Drake3b5da761998-03-12 15:33:05 +0000115variable \member{entitydefs} which should be a mapping from entity
116names to corresponding translations.
117If a translation is found, it calls the method \method{handle_data()}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000118with the translation; otherwise, it calls the method
Fred Drake3b5da761998-03-12 15:33:05 +0000119\code{unknown_entityref(\var{ref})}. The default \member{entitydefs}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000120defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
121\code{\&lt;}, and \code{\&quot;}.
122\end{funcdesc}
123
124\begin{funcdesc}{handle_comment}{comment}
125This method is called when a comment is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000126\var{comment} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000127\samp{<!--} and \samp{-->} delimiters, but not the delimiters
128themselves. For example, the comment \samp{<!--text-->} will
Guido van Rossuma10768a1997-11-18 15:11:22 +0000129cause this method to be called with the argument \code{'text'}. The
130default method does nothing.
131\end{funcdesc}
132
133\begin{funcdesc}{handle_cdata}{data}
134This method is called when a CDATA element is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000135\var{data} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000136\samp{<![CDATA[} and \samp{]]>} delimiters, but not the delimiters
137themselves. For example, the entity \samp{<![CDATA[text]]>} will
Guido van Rossuma10768a1997-11-18 15:11:22 +0000138cause this method to be called with the argument \code{'text'}. The
Fred Drake3b5da761998-03-12 15:33:05 +0000139default method does nothing, and is intended to be overridden.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000140\end{funcdesc}
141
Fred Drake3b5da761998-03-12 15:33:05 +0000142\begin{funcdesc}{handle_proc}{name, data}
143This method is called when a processing instruction (PI) is
144encountered. The \var{name} is the PI target, and the \var{data}
145argument is a string containing the text between the PI target and the
146closing delimiter, but not the delimiter itself. For example, the
147instruction \samp{<?XML text?>} will cause this method to be called
148with the arguments \code{'XML'} and \code{'text'}. The default method
149does nothing. Note that if a document starts with \code{<?xml
150...?>}, \method{handle_xml()} is called to handle it.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000151\end{funcdesc}
152
153\begin{funcdesc}{handle_special}{data}
154This method is called when a declaration is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000155\var{data} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000156\samp{<!} and \samp{>} delimiters, but not the delimiters
157themselves. For example, the declaration \samp{<!ENTITY text>} will
Guido van Rossum02505e41998-01-29 14:55:24 +0000158cause this method to be called with the argument \code{'ENTITY text'}. The
159default method does nothing. Note that \code{<!DOCTYPE ...>} is
160handled separately if it is located at the start of the document.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000161\end{funcdesc}
162
Guido van Rossum02505e41998-01-29 14:55:24 +0000163\begin{funcdesc}{syntax_error}{message}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000164This method is called when a syntax error is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000165\var{message} is a description of what was wrong. The default method
166raises a \exception{RuntimeError} exception. If this method is
167overridden, it is permissible for it to return. This method is only
168called when the error can be recovered from. Unrecoverable errors
169raise a \exception{RuntimeError} without first calling
170\method{syntax_error()}.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000171\end{funcdesc}
172
Fred Drakecce10901998-03-17 06:33:25 +0000173\begin{funcdesc}{unknown_starttag}{tag, attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000174This method is called to process an unknown start tag. It is intended
175to be overridden by a derived class; the base class implementation
176does nothing.
177\end{funcdesc}
178
179\begin{funcdesc}{unknown_endtag}{tag}
180This method is called to process an unknown end tag. It is intended
181to be overridden by a derived class; the base class implementation
182does nothing.
183\end{funcdesc}
184
185\begin{funcdesc}{unknown_charref}{ref}
186This method is called to process unresolvable numeric character
187references. It is intended to be overridden by a derived class; the
188base class implementation does nothing.
189\end{funcdesc}
190
191\begin{funcdesc}{unknown_entityref}{ref}
192This method is called to process an unknown entity reference. It is
193intended to be overridden by a derived class; the base class
194implementation does nothing.
195\end{funcdesc}
196
197Apart from overriding or extending the methods listed above, derived
Guido van Rossum02505e41998-01-29 14:55:24 +0000198classes may also define methods and variables of the following form to
199define processing of specific tags. Tag names in the input stream are
200case dependent; the \var{tag} occurring in method names must be in the
Guido van Rossuma10768a1997-11-18 15:11:22 +0000201correct case:
202
Fred Drake7f6e2c41998-02-13 14:38:23 +0000203\begin{funcdescni}{start_\var{tag}}{attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000204This method is called to process an opening tag \var{tag}. The
205\var{attributes} argument has the same meaning as described for
Fred Drake3b5da761998-03-12 15:33:05 +0000206\method{handle_starttag()} above. In fact, the base implementation of
207\method{handle_starttag()} calls this method.
Fred Drake7f6e2c41998-02-13 14:38:23 +0000208\end{funcdescni}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000209
Fred Drake7f6e2c41998-02-13 14:38:23 +0000210\begin{funcdescni}{end_\var{tag}}{}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000211This method is called to process a closing tag \var{tag}.
Fred Drake7f6e2c41998-02-13 14:38:23 +0000212\end{funcdescni}
Guido van Rossum02505e41998-01-29 14:55:24 +0000213
Fred Drake7f6e2c41998-02-13 14:38:23 +0000214\begin{datadescni}{\var{tag}_attributes}
Guido van Rossum02505e41998-01-29 14:55:24 +0000215If a class or instance variable \code{\var{tag}_attributes} exists, it
216should be a list or a dictionary. If a list, the elements of the list
217are the valid attributes for the element \var{tag}; if a dictionary,
218the keys are the valid attributes for the element \var{tag}, and the
219values the default values of the attributes, or \code{None} if there
220is no default.
221In addition to the attributes that were present in the tag, the
Fred Drake3b5da761998-03-12 15:33:05 +0000222attribute dictionary that is passed to \method{handle_starttag()} and
223\method{unknown_starttag()} contains values for all attributes that
224have a default value.
Fred Drake7f6e2c41998-02-13 14:38:23 +0000225\end{datadescni}