blob: 3ec36e0c65e61927fd40cc3c6bfcfd9ae365a076 [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{xmllib} ---
Fred Drake34250111999-02-19 23:45:06 +00002 A parser for XML documents}
3
Fred Drakeb91e9341998-07-23 17:59:49 +00004\declaremodule{standard}{xmllib}
Fred Drake34250111999-02-19 23:45:06 +00005\modulesynopsis{A parser for XML documents.}
Fred Drake191f2851998-12-22 18:06:02 +00006\moduleauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}
7\sectionauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}
Fred Drakeb91e9341998-07-23 17:59:49 +00008
Fred Drakeb91e9341998-07-23 17:59:49 +00009
Guido van Rossuma10768a1997-11-18 15:11:22 +000010\index{XML}
Fred Drake5cb48a41998-12-22 18:46:13 +000011\index{Extensible Markup Language}
12
13\versionchanged{1.5.2}
Guido van Rossuma10768a1997-11-18 15:11:22 +000014
Fred Drake3b5da761998-03-12 15:33:05 +000015This module defines a class \class{XMLParser} which serves as the basis
Fred Drake5cb48a41998-12-22 18:46:13 +000016for parsing text files formatted in XML (Extensible Markup Language).
Guido van Rossuma10768a1997-11-18 15:11:22 +000017
Fred Drake3b5da761998-03-12 15:33:05 +000018\begin{classdesc}{XMLParser}{}
19The \class{XMLParser} class must be instantiated without arguments.
20\end{classdesc}
21
Guido van Rossumb083a9f1998-12-18 20:17:13 +000022This class provides the following interface methods and instance variables:
23
24\begin{memberdesc}{attributes}
25A mapping of element names to mappings. The latter mapping maps
26attribute names that are valid for the element to the default value of
27the attribute, or if there is no default to \code{None}. The default
Guido van Rossum09da65e1999-02-02 17:55:12 +000028value is the empty dictionary. This variable is meant to be
29overridden, not extended since the default is shared by all instances
30of \class{XMLParser}.
Guido van Rossumb083a9f1998-12-18 20:17:13 +000031\end{memberdesc}
32
33\begin{memberdesc}{elements}
34A mapping of element names to tuples. The tuples contain a function
35for handling the start and end tag respectively of the element, or
36\code{None} if the method \method{unknown_starttag()} or
37\method{unknown_endtag()} is to be called. The default value is the
Guido van Rossum09da65e1999-02-02 17:55:12 +000038empty dictionary. This variable is meant to be overridden, not
39extended since the default is shared by all instances of
40\class{XMLParser}.
Guido van Rossumb083a9f1998-12-18 20:17:13 +000041\end{memberdesc}
42
43\begin{memberdesc}{entitydefs}
44A mapping of entity names to their values. The default value contains
45definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'},
46and \code{'apos'}.
47\end{memberdesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000048
Fred Drakefc576191998-04-04 07:15:02 +000049\begin{methoddesc}{reset}{}
Guido van Rossuma10768a1997-11-18 15:11:22 +000050Reset the instance. Loses all unprocessed data. This is called
51implicitly at instantiation time.
Fred Drakefc576191998-04-04 07:15:02 +000052\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000053
Fred Drakefc576191998-04-04 07:15:02 +000054\begin{methoddesc}{setnomoretags}{}
Guido van Rossuma10768a1997-11-18 15:11:22 +000055Stop processing tags. Treat all following input as literal input
56(CDATA).
Fred Drakefc576191998-04-04 07:15:02 +000057\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000058
Fred Drakefc576191998-04-04 07:15:02 +000059\begin{methoddesc}{setliteral}{}
Guido van Rossumf484a331998-12-07 21:59:56 +000060Enter literal mode (CDATA mode). This mode is automatically exited
61when the close tag matching the last unclosed open tag is encountered.
Fred Drakefc576191998-04-04 07:15:02 +000062\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000063
Fred Drakefc576191998-04-04 07:15:02 +000064\begin{methoddesc}{feed}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +000065Feed some text to the parser. It is processed insofar as it consists
Guido van Rossumb083a9f1998-12-18 20:17:13 +000066of complete tags; incomplete data is buffered until more data is
Fred Drake3b5da761998-03-12 15:33:05 +000067fed or \method{close()} is called.
Fred Drakefc576191998-04-04 07:15:02 +000068\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000069
Fred Drakefc576191998-04-04 07:15:02 +000070\begin{methoddesc}{close}{}
Guido van Rossuma10768a1997-11-18 15:11:22 +000071Force processing of all buffered data as if it were followed by an
72end-of-file mark. This method may be redefined by a derived class to
73define additional processing at the end of the input, but the
Fred Drake3b5da761998-03-12 15:33:05 +000074redefined version should always call the base class \method{close()}.
Fred Drakefc576191998-04-04 07:15:02 +000075\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000076
Fred Drakefc576191998-04-04 07:15:02 +000077\begin{methoddesc}{translate_references}{data}
Fred Drake3b5da761998-03-12 15:33:05 +000078Translate all entity and character references in \var{data} and
Fred Draked8a41e61999-02-19 17:54:10 +000079return the translated string.
Fred Drakefc576191998-04-04 07:15:02 +000080\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +000081
Fred Drakefc576191998-04-04 07:15:02 +000082\begin{methoddesc}{handle_xml}{encoding, standalone}
83This method is called when the \samp{<?xml ...?>} tag is processed.
Guido van Rossum02505e41998-01-29 14:55:24 +000084The arguments are the values of the encoding and standalone attributes
85in the tag. Both encoding and standalone are optional. The values
Fred Drake3b5da761998-03-12 15:33:05 +000086passed to \method{handle_xml()} default to \code{None} and the string
Guido van Rossum02505e41998-01-29 14:55:24 +000087\code{'no'} respectively.
Fred Drakefc576191998-04-04 07:15:02 +000088\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +000089
Fred Drakefc576191998-04-04 07:15:02 +000090\begin{methoddesc}{handle_doctype}{tag, data}
91This method is called when the \samp{<!DOCTYPE...>} tag is processed.
Guido van Rossum02505e41998-01-29 14:55:24 +000092The arguments are the name of the root element and the uninterpreted
93contents of the tag, starting after the white space after the name of
94the root element.
Fred Drakefc576191998-04-04 07:15:02 +000095\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +000096
Fred Drakefc576191998-04-04 07:15:02 +000097\begin{methoddesc}{handle_starttag}{tag, method, attributes}
Guido van Rossumb083a9f1998-12-18 20:17:13 +000098This method is called to handle start tags for which a start tag
99handler is defined in the instance variable \member{elements}. The
100\var{tag} argument is the name of the tag, and the \var{method}
101argument is the function (method) which should be used to support semantic
102interpretation of the start tag. The \var{attributes} argument is a
103dictionary of attributes, the key being the \var{name} and the value
104being the \var{value} of the attribute found inside the tag's
105\code{<>} brackets. Character and entity references in the
106\var{value} have been interpreted. For instance, for the start tag
Guido van Rossuma10768a1997-11-18 15:11:22 +0000107\code{<A HREF="http://www.cwi.nl/">}, this method would be called as
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000108\code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}.
Fred Drake3b5da761998-03-12 15:33:05 +0000109The base implementation simply calls \var{method} with \var{attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000110as the only argument.
Fred Drakefc576191998-04-04 07:15:02 +0000111\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000112
Fred Drakefc576191998-04-04 07:15:02 +0000113\begin{methoddesc}{handle_endtag}{tag, method}
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000114This method is called to handle end tags for which an end tag handler
115is defined in the instance variable \member{elements}. The \var{tag}
116argument is the name of the tag, and the \var{method} argument is the
117function (method) which should be used to support semantic
118interpretation of the end tag. For instance, for the end tag
119\code{</A>}, this method would be called as \code{handle_endtag('A',
120self.elements['A'][1])}. The base implementation simply calls
Fred Drake3b5da761998-03-12 15:33:05 +0000121\var{method}.
Fred Drakefc576191998-04-04 07:15:02 +0000122\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000123
Fred Drakefc576191998-04-04 07:15:02 +0000124\begin{methoddesc}{handle_data}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000125This method is called to process arbitrary data. It is intended to be
126overridden by a derived class; the base class implementation does
127nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000128\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000129
Fred Drakefc576191998-04-04 07:15:02 +0000130\begin{methoddesc}{handle_charref}{ref}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000131This method is called to process a character reference of the form
Fred Drake7f6e2c41998-02-13 14:38:23 +0000132\samp{\&\#\var{ref};}. \var{ref} can either be a decimal number,
Fred Drakefc576191998-04-04 07:15:02 +0000133or a hexadecimal number when preceded by an \character{x}.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000134In the base implementation, \var{ref} must be a number in the
135range 0--255. It translates the character to \ASCII{} and calls the
Fred Drake3b5da761998-03-12 15:33:05 +0000136method \method{handle_data()} with the character as argument. If
Guido van Rossuma10768a1997-11-18 15:11:22 +0000137\var{ref} is invalid or out of range, the method
138\code{unknown_charref(\var{ref})} is called to handle the error. A
139subclass must override this method to provide support for character
140references outside of the \ASCII{} range.
Fred Drakefc576191998-04-04 07:15:02 +0000141\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000142
Fred Drakefc576191998-04-04 07:15:02 +0000143\begin{methoddesc}{handle_entityref}{ref}
Fred Drake3b5da761998-03-12 15:33:05 +0000144This method is called to process a general entity reference of the
145form \samp{\&\var{ref};} where \var{ref} is a general entity
Guido van Rossuma10768a1997-11-18 15:11:22 +0000146reference. It looks for \var{ref} in the instance (or class)
Fred Drake3b5da761998-03-12 15:33:05 +0000147variable \member{entitydefs} which should be a mapping from entity
148names to corresponding translations.
149If a translation is found, it calls the method \method{handle_data()}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000150with the translation; otherwise, it calls the method
Fred Drake3b5da761998-03-12 15:33:05 +0000151\code{unknown_entityref(\var{ref})}. The default \member{entitydefs}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000152defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
153\code{\&lt;}, and \code{\&quot;}.
Fred Drakefc576191998-04-04 07:15:02 +0000154\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000155
Fred Drakefc576191998-04-04 07:15:02 +0000156\begin{methoddesc}{handle_comment}{comment}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000157This method is called when a comment is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000158\var{comment} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000159\samp{<!--} and \samp{-->} delimiters, but not the delimiters
160themselves. For example, the comment \samp{<!--text-->} will
Guido van Rossuma10768a1997-11-18 15:11:22 +0000161cause this method to be called with the argument \code{'text'}. The
162default method does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000163\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000164
Fred Drakefc576191998-04-04 07:15:02 +0000165\begin{methoddesc}{handle_cdata}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000166This method is called when a CDATA section is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000167\var{data} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000168\samp{<![CDATA[} and \samp{]]>} delimiters, but not the delimiters
169themselves. For example, the section \samp{<![CDATA[text]]>} will
Guido van Rossuma10768a1997-11-18 15:11:22 +0000170cause this method to be called with the argument \code{'text'}. The
Fred Drake3b5da761998-03-12 15:33:05 +0000171default method does nothing, and is intended to be overridden.
Fred Drakefc576191998-04-04 07:15:02 +0000172\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000173
Fred Drakefc576191998-04-04 07:15:02 +0000174\begin{methoddesc}{handle_proc}{name, data}
Fred Drake3b5da761998-03-12 15:33:05 +0000175This method is called when a processing instruction (PI) is
176encountered. The \var{name} is the PI target, and the \var{data}
177argument is a string containing the text between the PI target and the
178closing delimiter, but not the delimiter itself. For example, the
179instruction \samp{<?XML text?>} will cause this method to be called
180with the arguments \code{'XML'} and \code{'text'}. The default method
Fred Drakefc576191998-04-04 07:15:02 +0000181does nothing. Note that if a document starts with \samp{<?xml
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000182...?>}, \method{handle_xml()} is called to handle it.
Fred Drakefc576191998-04-04 07:15:02 +0000183\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000184
Fred Drakefc576191998-04-04 07:15:02 +0000185\begin{methoddesc}{handle_special}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000186This method is called when a declaration is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000187\var{data} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000188\samp{<!} and \samp{>} delimiters, but not the delimiters
189themselves. For example, the declaration \samp{<!ENTITY text>} will
Guido van Rossum02505e41998-01-29 14:55:24 +0000190cause this method to be called with the argument \code{'ENTITY text'}. The
Fred Drakefc576191998-04-04 07:15:02 +0000191default method does nothing. Note that \samp{<!DOCTYPE ...>} is
Guido van Rossum02505e41998-01-29 14:55:24 +0000192handled separately if it is located at the start of the document.
Fred Drakefc576191998-04-04 07:15:02 +0000193\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000194
Fred Drakefc576191998-04-04 07:15:02 +0000195\begin{methoddesc}{syntax_error}{message}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000196This method is called when a syntax error is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000197\var{message} is a description of what was wrong. The default method
198raises a \exception{RuntimeError} exception. If this method is
199overridden, it is permissible for it to return. This method is only
200called when the error can be recovered from. Unrecoverable errors
201raise a \exception{RuntimeError} without first calling
202\method{syntax_error()}.
Fred Drakefc576191998-04-04 07:15:02 +0000203\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000204
Fred Drakefc576191998-04-04 07:15:02 +0000205\begin{methoddesc}{unknown_starttag}{tag, attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000206This method is called to process an unknown start tag. It is intended
207to be overridden by a derived class; the base class implementation
208does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000209\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000210
Fred Drakefc576191998-04-04 07:15:02 +0000211\begin{methoddesc}{unknown_endtag}{tag}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000212This method is called to process an unknown end tag. It is intended
213to be overridden by a derived class; the base class implementation
214does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000215\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000216
Fred Drakefc576191998-04-04 07:15:02 +0000217\begin{methoddesc}{unknown_charref}{ref}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000218This method is called to process unresolvable numeric character
219references. It is intended to be overridden by a derived class; the
220base class implementation does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000221\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000222
Fred Drakefc576191998-04-04 07:15:02 +0000223\begin{methoddesc}{unknown_entityref}{ref}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000224This method is called to process an unknown entity reference. It is
225intended to be overridden by a derived class; the base class
226implementation does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000227\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000228
Fred Drake34250111999-02-19 23:45:06 +0000229
230\subsection{XML Namespaces \label{xml-namespace}}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000231
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000232This module has support for XML namespaces as defined in the XML
233Namespaces proposed recommendation.
Fred Drake34250111999-02-19 23:45:06 +0000234\indexii{XML}{namespaces}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000235
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000236Tag and attribute names that are defined in an XML namespace are
237handled as if the name of the tag or attribute consisted of the
238namespace (i.e.\ the URL that defines the namespace) followed by a
239space and the name of the tag or attribute. For instance, the tag
240\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if
241the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and
242the tag \code{<html:a href='http://frob.com'>} inside the above
243mentioned element is treated as if the tag name were
244\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as
245if it were \code{'http://www.w3.org/TR/REC-html40 href'}.
Guido van Rossum02505e41998-01-29 14:55:24 +0000246
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000247An older draft of the XML Namespaces proposal is also recognized, but
248triggers a warning.