\section{\module{xmllib} ---
         A parser for XML documents}

\declaremodule{standard}{xmllib}
\modulesynopsis{A parser for XML documents.}
\moduleauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}
\sectionauthor{Sjoerd Mullender}{Sjoerd.Mullender@cwi.nl}


\index{XML}
\index{Extensible Markup Language}

\versionchanged{1.5.2}
Guido van Rossuma10768a1997-11-18 15:11:22 +000014
Fred Drake3b5da761998-03-12 15:33:05 +000015This module defines a class \class{XMLParser} which serves as the basis
Fred Drake5cb48a41998-12-22 18:46:13 +000016for parsing text files formatted in XML (Extensible Markup Language).
Guido van Rossuma10768a1997-11-18 15:11:22 +000017
Fred Drake3b5da761998-03-12 15:33:05 +000018\begin{classdesc}{XMLParser}{}
The \class{XMLParser} class must be instantiated without
arguments.\footnote{Actually, a number of keyword arguments are
recognized which influence the parser to accept certain non-standard
constructs. The following keyword arguments are currently
recognized. The default for all of these is \code{0} (false) except
for the last one, for which the default is \code{1} (true):
\var{accept_unquoted_attributes} (accept certain attribute values
without requiring quotes), \var{accept_missing_endtag_name} (accept
end tags that look like \code{</>}), \var{map_case} (map upper case to
lower case in tags and attributes), \var{accept_utf8} (allow UTF-8
characters in input; this is required according to the XML standard,
but Python does not as yet deal properly with these characters, so
this is not the default), and \var{translate_attribute_references}
(attempt to translate character and entity references in attribute
values; set this to false to suppress the translation).}
Fred Drake3b5da761998-03-12 15:33:05 +000033\end{classdesc}
34
Guido van Rossumb083a9f1998-12-18 20:17:13 +000035This class provides the following interface methods and instance variables:
36
37\begin{memberdesc}{attributes}
A mapping of element names to mappings. The latter mapping maps
attribute names that are valid for the element to the default value of
the attribute, or, if there is no default, to \code{None}. The default
value is the empty dictionary. This variable is meant to be
overridden, not extended, since the default is shared by all instances
of \class{XMLParser}.
Guido van Rossumb083a9f1998-12-18 20:17:13 +000044\end{memberdesc}
45
46\begin{memberdesc}{elements}
A mapping of element names to tuples. Each tuple contains the
functions for handling the start tag and the end tag of the element,
respectively, or \code{None} if the method \method{unknown_starttag()} or
\method{unknown_endtag()} is to be called. The default value is the
empty dictionary. This variable is meant to be overridden, not
extended, since the default is shared by all instances of
\class{XMLParser}.
Guido van Rossumb083a9f1998-12-18 20:17:13 +000054\end{memberdesc}
55
56\begin{memberdesc}{entitydefs}
A mapping of entity names to their values. The default value contains
definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'},
and \code{'apos'}.
60\end{memberdesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000061
Fred Drakefc576191998-04-04 07:15:02 +000062\begin{methoddesc}{reset}{}
Reset the instance. Loses all unprocessed data. This is called
implicitly at instantiation time.
Fred Drakefc576191998-04-04 07:15:02 +000065\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000066
Fred Drakefc576191998-04-04 07:15:02 +000067\begin{methoddesc}{setnomoretags}{}
Guido van Rossuma10768a1997-11-18 15:11:22 +000068Stop processing tags. Treat all following input as literal input
69(CDATA).
Fred Drakefc576191998-04-04 07:15:02 +000070\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000071
Fred Drakefc576191998-04-04 07:15:02 +000072\begin{methoddesc}{setliteral}{}
Guido van Rossumf484a331998-12-07 21:59:56 +000073Enter literal mode (CDATA mode). This mode is automatically exited
74when the close tag matching the last unclosed open tag is encountered.
Fred Drakefc576191998-04-04 07:15:02 +000075\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000076
Fred Drakefc576191998-04-04 07:15:02 +000077\begin{methoddesc}{feed}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +000078Feed some text to the parser. It is processed insofar as it consists
Guido van Rossumb083a9f1998-12-18 20:17:13 +000079of complete tags; incomplete data is buffered until more data is
Fred Drake3b5da761998-03-12 15:33:05 +000080fed or \method{close()} is called.
Fred Drakefc576191998-04-04 07:15:02 +000081\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000082
Fred Drakefc576191998-04-04 07:15:02 +000083\begin{methoddesc}{close}{}
Force processing of all buffered data as if it were followed by an
end-of-file mark. This method may be redefined by a derived class to
define additional processing at the end of the input, but the
redefined version should always call the base class \method{close()}.
Fred Drakefc576191998-04-04 07:15:02 +000088\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +000089
Fred Drakefc576191998-04-04 07:15:02 +000090\begin{methoddesc}{translate_references}{data}
Fred Drake3b5da761998-03-12 15:33:05 +000091Translate all entity and character references in \var{data} and
Fred Draked8a41e61999-02-19 17:54:10 +000092return the translated string.
Fred Drakefc576191998-04-04 07:15:02 +000093\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +000094
Sjoerd Mullender1c8feae2000-08-31 10:27:00 +000095\begin{methoddesc}{getnamespace}{}
96Return a mapping of namespace abbreviations to namespace URIs that are
97currently in effect.
98\end{methoddesc}
99
Fred Drakefc576191998-04-04 07:15:02 +0000100\begin{methoddesc}{handle_xml}{encoding, standalone}
101This method is called when the \samp{<?xml ...?>} tag is processed.
Guido van Rossum02505e41998-01-29 14:55:24 +0000102The arguments are the values of the encoding and standalone attributes
103in the tag. Both encoding and standalone are optional. The values
Fred Drake3b5da761998-03-12 15:33:05 +0000104passed to \method{handle_xml()} default to \code{None} and the string
Guido van Rossum02505e41998-01-29 14:55:24 +0000105\code{'no'} respectively.
Fred Drakefc576191998-04-04 07:15:02 +0000106\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +0000107
Fred Drake38e5d272000-04-03 20:13:55 +0000108\begin{methoddesc}{handle_doctype}{tag, pubid, syslit, data}
Fred Drake46479d32000-08-11 20:34:27 +0000109This\index{DOCTYPE declaration} method is called when the
110\samp{<!DOCTYPE...>} declaration is processed. The arguments are the
111tag name of the root element, the Formal Public\index{Formal Public
112Identifier} Identifier (or \code{None} if not specified), the system
113identifier, and the uninterpreted contents of the internal DTD subset
114as a string (or \code{None} if not present).
Fred Drakefc576191998-04-04 07:15:02 +0000115\end{methoddesc}
Guido van Rossum02505e41998-01-29 14:55:24 +0000116
Fred Drakefc576191998-04-04 07:15:02 +0000117\begin{methoddesc}{handle_starttag}{tag, method, attributes}
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000118This method is called to handle start tags for which a start tag
119handler is defined in the instance variable \member{elements}. The
Fred Drake46479d32000-08-11 20:34:27 +0000120\var{tag} argument is the name of the tag, and the
121\var{method} argument is the function (method) which should be used to
122support semantic interpretation of the start tag. The
123\var{attributes} argument is a dictionary of attributes, the key being
124the \var{name} and the value being the \var{value} of the attribute
125found inside the tag's \code{<>} brackets. Character and entity
126references in the \var{value} have been interpreted. For instance,
127for the start tag \code{<A HREF="http://www.cwi.nl/">}, this method
128would be called as \code{handle_starttag('A', self.elements['A'][0],
129\{'HREF': 'http://www.cwi.nl/'\})}. The base implementation simply
130calls \var{method} with \var{attributes} as the only argument.
Fred Drakefc576191998-04-04 07:15:02 +0000131\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000132
Fred Drakefc576191998-04-04 07:15:02 +0000133\begin{methoddesc}{handle_endtag}{tag, method}
This method is called to handle end tags for which an end tag handler
is defined in the instance variable \member{elements}. The \var{tag}
argument is the name of the tag, and the \var{method} argument is the
function (method) which should be used to support semantic
interpretation of the end tag. For instance, for the end tag
\code{</A>}, this method would be called as \code{handle_endtag('A',
self.elements['A'][1])}. The base implementation simply calls
\var{method}.
Fred Drakefc576191998-04-04 07:15:02 +0000142\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000143
Fred Drakefc576191998-04-04 07:15:02 +0000144\begin{methoddesc}{handle_data}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000145This method is called to process arbitrary data. It is intended to be
146overridden by a derived class; the base class implementation does
147nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000148\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000149
Fred Drakefc576191998-04-04 07:15:02 +0000150\begin{methoddesc}{handle_charref}{ref}
This method is called to process a character reference of the form
\samp{\&\#\var{ref};}. \var{ref} can be either a decimal number
or, when preceded by an \character{x}, a hexadecimal number.
In the base implementation, \var{ref} must be a number in the
range 0--255. It translates the character to \ASCII{} and calls the
method \method{handle_data()} with the character as argument. If
\var{ref} is invalid or out of range, the method
\code{unknown_charref(\var{ref})} is called to handle the error. A
subclass must override this method to provide support for character
references outside of the \ASCII{} range.
Fred Drakefc576191998-04-04 07:15:02 +0000161\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000162
Fred Drakefc576191998-04-04 07:15:02 +0000163\begin{methoddesc}{handle_comment}{comment}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000164This method is called when a comment is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000165\var{comment} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000166\samp{<!--} and \samp{-->} delimiters, but not the delimiters
167themselves. For example, the comment \samp{<!--text-->} will
Guido van Rossuma10768a1997-11-18 15:11:22 +0000168cause this method to be called with the argument \code{'text'}. The
169default method does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000170\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000171
Fred Drakefc576191998-04-04 07:15:02 +0000172\begin{methoddesc}{handle_cdata}{data}
This method is called when a CDATA section is encountered. The
\var{data} argument is a string containing the text between the
\samp{<![CDATA[} and \samp{]]>} delimiters, but not the delimiters
themselves. For example, the section \samp{<![CDATA[text]]>} will
cause this method to be called with the argument \code{'text'}. The
default method does nothing, and is intended to be overridden.
Fred Drakefc576191998-04-04 07:15:02 +0000179\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000180
Fred Drakefc576191998-04-04 07:15:02 +0000181\begin{methoddesc}{handle_proc}{name, data}
This method is called when a processing instruction (PI) is
encountered. The \var{name} is the PI target, and the \var{data}
argument is a string containing the text between the PI target and the
closing delimiter, but not the delimiter itself. For example, the
instruction \samp{<?XML text?>} will cause this method to be called
with the arguments \code{'XML'} and \code{'text'}. The default method
does nothing. Note that if a document starts with
\samp{<?xml ...?>}, \method{handle_xml()} is called to handle it.
Fred Drakefc576191998-04-04 07:15:02 +0000190\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000191
Fred Drakefc576191998-04-04 07:15:02 +0000192\begin{methoddesc}{handle_special}{data}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000193This method is called when a declaration is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000194\var{data} argument is a string containing the text between the
Fred Drake7f6e2c41998-02-13 14:38:23 +0000195\samp{<!} and \samp{>} delimiters, but not the delimiters
Fred Drake46479d32000-08-11 20:34:27 +0000196themselves. For example, the \index{ENTITY declaration}entity
197declaration \samp{<!ENTITY text>} will cause this method to be called
198with the argument \code{'ENTITY text'}. The default method does
199nothing. Note that \samp{<!DOCTYPE ...>} is handled separately if it
200is located at the start of the document.
Fred Drakefc576191998-04-04 07:15:02 +0000201\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000202
Fred Drakefc576191998-04-04 07:15:02 +0000203\begin{methoddesc}{syntax_error}{message}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000204This method is called when a syntax error is encountered. The
Fred Drake3b5da761998-03-12 15:33:05 +0000205\var{message} is a description of what was wrong. The default method
206raises a \exception{RuntimeError} exception. If this method is
Thomas Woutersf8316632000-07-16 19:01:10 +0000207overridden, it is permissible for it to return. This method is only
Fred Drake3b5da761998-03-12 15:33:05 +0000208called when the error can be recovered from. Unrecoverable errors
209raise a \exception{RuntimeError} without first calling
210\method{syntax_error()}.
Fred Drakefc576191998-04-04 07:15:02 +0000211\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000212
Fred Drakefc576191998-04-04 07:15:02 +0000213\begin{methoddesc}{unknown_starttag}{tag, attributes}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000214This method is called to process an unknown start tag. It is intended
215to be overridden by a derived class; the base class implementation
216does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000217\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000218
Fred Drakefc576191998-04-04 07:15:02 +0000219\begin{methoddesc}{unknown_endtag}{tag}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000220This method is called to process an unknown end tag. It is intended
221to be overridden by a derived class; the base class implementation
222does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000223\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000224
Fred Drakefc576191998-04-04 07:15:02 +0000225\begin{methoddesc}{unknown_charref}{ref}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000226This method is called to process unresolvable numeric character
227references. It is intended to be overridden by a derived class; the
228base class implementation does nothing.
Fred Drakefc576191998-04-04 07:15:02 +0000229\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000230
Fred Drakefc576191998-04-04 07:15:02 +0000231\begin{methoddesc}{unknown_entityref}{ref}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000232This method is called to process an unknown entity reference. It is
233intended to be overridden by a derived class; the base class
Guido van Rossume7f19201999-08-26 15:57:44 +0000234implementation calls \method{syntax_error()} to signal an error.
Fred Drakefc576191998-04-04 07:15:02 +0000235\end{methoddesc}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000236
Fred Drake34250111999-02-19 23:45:06 +0000237
Fred Drakec8c40ff1999-04-22 20:16:02 +0000238\begin{seealso}
Fred Drakeae86d432000-09-12 17:53:48 +0000239 \seetitle[http://www.w3.org/TR/REC-xml]{Extensible Markup Language
240 (XML) 1.0}{The XML specification, published by the World
241 Wide Web Consortium (W3C), defines the syntax and
242 processor requirements for XML. References to additional
243 material on XML, including translations of the
244 specification, are available at
245 \url{http://www.w3.org/XML/}.}
Fred Drake38e5d272000-04-03 20:13:55 +0000246
Fred Drakeae86d432000-09-12 17:53:48 +0000247 \seetitle[http://www.python.org/topics/xml/]{Python and XML
248 Processing}{The Python XML Topic Guide provides a great
249 deal of information on using XML from Python and links to
250 other sources of information on XML.}
Fred Drakec8c40ff1999-04-22 20:16:02 +0000251
Fred Drakeae86d432000-09-12 17:53:48 +0000252 \seetitle[http://www.python.org/sigs/xml-sig/]{SIG for XML
253 Processing in Python}{The Python XML Special Interest
254 Group is developing substantial support for processing XML
255 from Python.}
Fred Drakec8c40ff1999-04-22 20:16:02 +0000256\end{seealso}
257
258
Fred Drake34250111999-02-19 23:45:06 +0000259\subsection{XML Namespaces \label{xml-namespace}}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000260
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000261This module has support for XML namespaces as defined in the XML
262Namespaces proposed recommendation.
Fred Drake34250111999-02-19 23:45:06 +0000263\indexii{XML}{namespaces}
Guido van Rossuma10768a1997-11-18 15:11:22 +0000264
Tag and attribute names that are defined in an XML namespace are
handled as if the name of the tag or attribute consisted of the
namespace (i.e.\ the URL that defines the namespace) followed by a
space and the name of the tag or attribute. For instance, the tag
\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if
the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and
the tag \code{<html:a href='http://frob.com'>} inside the
above-mentioned element is treated as if the tag name were
\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as
if it were \code{'http://www.w3.org/TR/REC-html40 href'}.
Guido van Rossum02505e41998-01-29 14:55:24 +0000275
Guido van Rossumb083a9f1998-12-18 20:17:13 +0000276An older draft of the XML Namespaces proposal is also recognized, but
277triggers a warning.
Fred Drakeae86d432000-09-12 17:53:48 +0000278
279\begin{seealso}
  \seetitle[http://www.w3.org/TR/REC-xml-names/]{Namespaces in XML}{
            This World Wide Web Consortium recommendation describes the
            proper syntax and processing requirements for namespaces in
            XML.}
284\end{seealso}