blob: 0a097f49bdc3add1cdfce6cddd64c1b7f08df5a8 [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{xmllib} ---
2 A parser for XML documents.}
Fred Drakeb91e9341998-07-23 17:59:49 +00003\declaremodule{standard}{xmllib}
Fred Drake191f2851998-12-22 18:06:02 +00004\moduleauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}
5\sectionauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}
Fred Drakeb91e9341998-07-23 17:59:49 +00006
7\modulesynopsis{A parser for XML documents.}
8
Guido van Rossuma10768a1997-11-18 15:11:22 +00009\index{XML}
Fred Drake5cb48a41998-12-22 18:46:13 +000010\index{Extensible Markup Language}
11
12\versionchanged{1.5.2}
Guido van Rossuma10768a1997-11-18 15:11:22 +000013
Fred Drake3b5da761998-03-12 15:33:05 +000014This module defines a class \class{XMLParser} which serves as the basis
Fred Drake5cb48a41998-12-22 18:46:13 +000015for parsing text files formatted in XML (Extensible Markup Language).
Guido van Rossuma10768a1997-11-18 15:11:22 +000016
Fred Drake3b5da761998-03-12 15:33:05 +000017\begin{classdesc}{XMLParser}{}
18The \class{XMLParser} class must be instantiated without arguments.
19\end{classdesc}
20
Guido van Rossumb083a9f1998-12-18 20:17:13 +000021This class provides the following interface methods and instance variables:
22
23\begin{memberdesc}{attributes}
24A mapping of element names to mappings. The latter mapping maps
25attribute names that are valid for the element to the default value of
26the attribute, or if there is no default to \code{None}. The default
Guido van Rossum09da65e1999-02-02 17:55:12 +000027value is the empty dictionary. This variable is meant to be
28overridden, not extended since the default is shared by all instances
29of \class{XMLParser}.
Guido van Rossumb083a9f1998-12-18 20:17:13 +000030\end{memberdesc}
31
32\begin{memberdesc}{elements}
33A mapping of element names to tuples. The tuples contain a function
34for handling the start and end tag respectively of the element, or
35\code{None} if the method \method{unknown_starttag()} or
36\method{unknown_endtag()} is to be called. The default value is the
Guido van Rossum09da65e1999-02-02 17:55:12 +000037empty dictionary. This variable is meant to be overridden, not
38extended since the default is shared by all instances of
39\class{XMLParser}.
Guido van Rossumb083a9f1998-12-18 20:17:13 +000040\end{memberdesc}
41
42\begin{memberdesc}{entitydefs}
A mapping of entity names to their values. The default value contains
definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'},
and \code{'apos'}.
46\end{memberdesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000047
Fred Drakefc576191998-04-04 07:15:02 +000048\begin{methoddesc}{reset}{}
Reset the instance. Loses all unprocessed data. This is called
implicitly at instantiation time.
Fred Drakefc576191998-04-04 07:15:02 +000051\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000052
Fred Drakefc576191998-04-04 07:15:02 +000053\begin{methoddesc}{setnomoretags}{}
Guido van Rossuma10768a1997-11-18 15:11:22 +000054Stop processing tags. Treat all following input as literal input
55(CDATA).
Fred Drakefc576191998-04-04 07:15:02 +000056\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000057
Fred Drakefc576191998-04-04 07:15:02 +000058\begin{methoddesc}{setliteral}{}
Guido van Rossumf484a331998-12-07 21:59:56 +000059Enter literal mode (CDATA mode). This mode is automatically exited
60when the close tag matching the last unclosed open tag is encountered.
Fred Drakefc576191998-04-04 07:15:02 +000061\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000062
Fred Drakefc576191998-04-04 07:15:02 +000063\begin{methoddesc}{feed}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +000064Feed some text to the parser. It is processed insofar as it consists
Guido van Rossumb083a9f1998-12-18 20:17:13 +000065of complete tags; incomplete data is buffered until more data is
Fred Drake3b5da761998-03-12 15:33:05 +000066fed or \method{close()} is called.
Fred Drakefc576191998-04-04 07:15:02 +000067\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000068
Fred Drakefc576191998-04-04 07:15:02 +000069\begin{methoddesc}{close}{}
Force processing of all buffered data as if it were followed by an
end-of-file mark. This method may be redefined by a derived class to
define additional processing at the end of the input, but the
redefined version should always call the base class's \method{close()}.
Fred Drakefc576191998-04-04 07:15:02 +000074\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000075
Fred Drakefc576191998-04-04 07:15:02 +000076\begin{methoddesc}{translate_references}{data}
Translate all entity and character references in \var{data} and
return the translated string.
Fred Drakefc576191998-04-04 07:15:02 +000079\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +000080
Fred Drakefc576191998-04-04 07:15:02 +000081\begin{methoddesc}{handle_xml}{encoding, standalone}
82This method is called when the \samp{<?xml ...?>} tag is processed.
Guido van Rossum02505e41998-01-29 14:55:24 +000083The arguments are the values of the encoding and standalone attributes
84in the tag. Both encoding and standalone are optional. The values
Fred Drake3b5da761998-03-12 15:33:05 +000085passed to \method{handle_xml()} default to \code{None} and the string
Guido van Rossum02505e41998-01-29 14:55:24 +000086\code{'no'} respectively.
Fred Drakefc576191998-04-04 07:15:02 +000087\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +000088
Fred Drakefc576191998-04-04 07:15:02 +000089\begin{methoddesc}{handle_doctype}{tag, data}
90This method is called when the \samp{<!DOCTYPE...>} tag is processed.
Guido van Rossum02505e41998-01-29 14:55:24 +000091The arguments are the name of the root element and the uninterpreted
92contents of the tag, starting after the white space after the name of
93the root element.
Fred Drakefc576191998-04-04 07:15:02 +000094\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +000095
Fred Drakefc576191998-04-04 07:15:02 +000096\begin{methoddesc}{handle_starttag}{tag, method, attributes}
Guido van Rossumb083a9f1998-12-18 20:17:13 +000097This method is called to handle start tags for which a start tag
98handler is defined in the instance variable \member{elements}. The
99\var{tag} argument is the name of the tag, and the \var{method}
100argument is the function (method) which should be used to support semantic
101interpretation of the start tag. The \var{attributes} argument is a
102dictionary of attributes, the key being the \var{name} and the value
103being the \var{value} of the attribute found inside the tag's
104\code{<>} brackets. Character and entity references in the
105\var{value} have been interpreted. For instance, for the start tag
Guido van Rossuma10768a1997-11-18 15:11:22 +0000106\code{<A HREF="http://www.cwi.nl/">}, this method would be called as
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000107\code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}.
Fred Drake3b5da761998-03-12 15:33:05 +0000108The base implementation simply calls \var{method} with \var{attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000109as the only argument.
Fred Drakefc576191998-04-04 07:15:02 +0000110\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000111
Fred Drakefc576191998-04-04 07:15:02 +0000112\begin{methoddesc}{handle_endtag}{tag, method}
This method is called to handle end tags for which an end tag handler
is defined in the instance variable \member{elements}. The \var{tag}
argument is the name of the tag, and the \var{method} argument is the
function (method) which should be used to support semantic
interpretation of the end tag. For instance, for the end tag
\code{</A>}, this method would be called as \code{handle_endtag('A',
self.elements['A'][1])}. The base implementation simply calls
\var{method}.
Fred Drakefc576191998-04-04 07:15:02 +0000121\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000122
Fred Drakefc576191998-04-04 07:15:02 +0000123\begin{methoddesc}{handle_data}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000124This method is called to process arbitrary data. It is intended to be
125overridden by a derived class; the base class implementation does
126nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000127\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000128
Fred Drakefc576191998-04-04 07:15:02 +0000129\begin{methoddesc}{handle_charref}{ref}
This method is called to process a character reference of the form
\samp{\&\#\var{ref};}. \var{ref} can be either a decimal number
or a hexadecimal number when preceded by an \character{x}.
In the base implementation, \var{ref} must be a number in the
range 0--255. It translates the character to \ASCII{} and calls the
method \method{handle_data()} with the character as argument. If
\var{ref} is invalid or out of range, the method
\code{unknown_charref(\var{ref})} is called to handle the error. A
subclass must override this method to provide support for character
references outside of the \ASCII{} range.
Fred Drakefc576191998-04-04 07:15:02 +0000140\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000141
Fred Drakefc576191998-04-04 07:15:02 +0000142\begin{methoddesc}{handle_entityref}{ref}
This method is called to process a general entity reference of the
form \samp{\&\var{ref};} where \var{ref} is a general entity
reference. It looks for \var{ref} in the instance (or class)
variable \member{entitydefs} which should be a mapping from entity
names to corresponding translations.
If a translation is found, it calls the method \method{handle_data()}
with the translation; otherwise, it calls the method
\code{unknown_entityref(\var{ref})}. The default \member{entitydefs}
defines translations for \code{\&amp;}, \code{\&apos;}, \code{\&gt;},
\code{\&lt;}, and \code{\&quot;}.
Fred Drakefc576191998-04-04 07:15:02 +0000153\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000154
Fred Drakefc576191998-04-04 07:15:02 +0000155\begin{methoddesc}{handle_comment}{comment}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000156This method is called when a comment is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000157\var{comment} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000158\samp{<!--} and \samp{-->} delimiters, but not the delimiters
159themselves. For example, the comment \samp{<!--text-->} will
Guido van Rossuma10768a1997-11-18 15:11:22 +0000160cause this method to be called with the argument \code{'text'}. The
161default method does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000162\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000163
Fred Drakefc576191998-04-04 07:15:02 +0000164\begin{methoddesc}{handle_cdata}{data}
This method is called when a CDATA section is encountered. The
\var{data} argument is a string containing the text between the
\samp{<![CDATA[} and \samp{]]>} delimiters, but not the delimiters
themselves. For example, the section \samp{<![CDATA[text]]>} will
cause this method to be called with the argument \code{'text'}. The
default method does nothing, and is intended to be overridden.
Fred Drakefc576191998-04-04 07:15:02 +0000171\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000172
Fred Drakefc576191998-04-04 07:15:02 +0000173\begin{methoddesc}{handle_proc}{name, data}
This method is called when a processing instruction (PI) is
encountered. The \var{name} is the PI target, and the \var{data}
argument is a string containing the text between the PI target and the
closing delimiter, but not the delimiter itself. For example, the
instruction \samp{<?XML text?>} will cause this method to be called
with the arguments \code{'XML'} and \code{'text'}. The default method
does nothing. Note that if a document starts with
\samp{<?xml ...?>}, \method{handle_xml()} is called to handle it.
Fred Drakefc576191998-04-04 07:15:02 +0000182\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000183
Fred Drakefc576191998-04-04 07:15:02 +0000184\begin{methoddesc}{handle_special}{data}
This method is called when a declaration is encountered. The
\var{data} argument is a string containing the text between the
\samp{<!} and \samp{>} delimiters, but not the delimiters
themselves. For example, the declaration \samp{<!ENTITY text>} will
cause this method to be called with the argument \code{'ENTITY text'}. The
default method does nothing. Note that \samp{<!DOCTYPE ...>} is
handled separately if it is located at the start of the document.
Fred Drakefc576191998-04-04 07:15:02 +0000192\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000193
Fred Drakefc576191998-04-04 07:15:02 +0000194\begin{methoddesc}{syntax_error}{message}
This method is called when a syntax error is encountered. The
\var{message} is a description of what was wrong. The default method
raises a \exception{RuntimeError} exception. If this method is
overridden, it is permissible for it to return. This method is only
called when the error can be recovered from. Unrecoverable errors
raise a \exception{RuntimeError} without first calling
\method{syntax_error()}.
Fred Drakefc576191998-04-04 07:15:02 +0000202\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000203
Fred Drakefc576191998-04-04 07:15:02 +0000204\begin{methoddesc}{unknown_starttag}{tag, attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000205This method is called to process an unknown start tag. It is intended
206to be overridden by a derived class; the base class implementation
207does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000208\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000209
Fred Drakefc576191998-04-04 07:15:02 +0000210\begin{methoddesc}{unknown_endtag}{tag}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000211This method is called to process an unknown end tag. It is intended
212to be overridden by a derived class; the base class implementation
213does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000214\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000215
Fred Drakefc576191998-04-04 07:15:02 +0000216\begin{methoddesc}{unknown_charref}{ref}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000217This method is called to process unresolvable numeric character
218references. It is intended to be overridden by a derived class; the
219base class implementation does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000220\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000221
Fred Drakefc576191998-04-04 07:15:02 +0000222\begin{methoddesc}{unknown_entityref}{ref}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000223This method is called to process an unknown entity reference. It is
224intended to be overridden by a derived class; the base class
225implementation does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000226\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000227
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000228\subsection{XML Namespaces}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000229
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000230This module has support for XML namespaces as defined in the XML
231Namespaces proposed recommendation.
Guido van Rossuma10768a1997-11-18 15:11:22 +0000232
Tag and attribute names that are defined in an XML namespace are
handled as if the name of the tag or attribute consisted of the
namespace (i.e. the URL that defines the namespace) followed by a
space and the name of the tag or attribute. For instance, the tag
\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if
the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and
the tag \code{<html:a href='http://frob.com'>} inside the
above-mentioned element is treated as if the tag name were
\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as
if it were \code{'http://www.w3.org/TR/REC-html40 href'}.
Guido van Rossum02505e41998-01-29 14:55:24 +0000243
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000244An older draft of the XML Namespaces proposal is also recognized, but
245triggers a warning.