\section{\module{xml.parsers.expat} ---
         Fast XML parsing using Expat}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00003
Fred Drake7fbc85c2000-09-23 04:47:56 +00004\declaremodule{standard}{xml.parsers.expat}
5\modulesynopsis{An interface to the Expat non-validating XML parser.}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00006\moduleauthor{Paul Prescod}{paul@prescod.net}
7\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
8
Fred Drake7fbc85c2000-09-23 04:47:56 +00009\versionadded{2.0}
10
Fred Drakeefffe8e2000-10-29 05:10:30 +000011The \module{xml.parsers.expat} module is a Python interface to the
12Expat\index{Expat} non-validating XML parser.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000013The module provides a single extension type, \class{xmlparser}, that
14represents the current state of an XML parser. After an
15\class{xmlparser} object has been created, various attributes of the object
16can be set to handler functions. When an XML document is then fed to
17the parser, the handler functions are called for the character data
18and markup in the XML document.
Fred Drake7fbc85c2000-09-23 04:47:56 +000019
20This module uses the \module{pyexpat}\refbimodindex{pyexpat} module to
21provide access to the Expat parser. Direct use of the
22\module{pyexpat} module is deprecated.
Fred Drakeefffe8e2000-10-29 05:10:30 +000023
24This module provides one exception and one type object:
25
26\begin{excdesc}{error}
27 The exception raised when Expat reports an error.
28\end{excdesc}
29
30\begin{datadesc}{XMLParserType}
31 The type of the return values from the \function{ParserCreate()}
32 function.
33\end{datadesc}
34
35
Fred Drake7fbc85c2000-09-23 04:47:56 +000036The \module{xml.parsers.expat} module contains two functions:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000037
38\begin{funcdesc}{ErrorString}{errno}
39Returns an explanatory string for a given error number \var{errno}.
40\end{funcdesc}
41
Fred Drakeefffe8e2000-10-29 05:10:30 +000042\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{,
43 namespace_separator}}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000044Creates and returns a new \class{xmlparser} object.
45\var{encoding}, if specified, must be a string naming the encoding
46used by the XML data. Expat doesn't support as many encodings as
47Python does, and its repertoire of encodings can't be extended; it
48supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
49
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000050Expat can optionally do XML namespace processing for you, enabled by
Fred Drakeefffe8e2000-10-29 05:10:30 +000051providing a value for \var{namespace_separator}. The value must be a
52one-character string; a \exception{ValueError} will be raised if the
53string has an illegal length (\code{None} is considered the same as
54omission). When namespace processing is enabled, element type names
55and attribute names that belong to a namespace will be expanded. The
56element name passed to the element handlers
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000057\function{StartElementHandler()} and \function{EndElementHandler()}
58will be the concatenation of the namespace URI, the namespace
59separator character, and the local part of the name. If the namespace
Fred Drakeefffe8e2000-10-29 05:10:30 +000060separator is a zero byte (\code{chr(0)}) then the namespace URI and
61the local part will be concatenated without any separator.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000062
Fred Drake2fef3ab2000-11-28 06:38:22 +000063For example, if \var{namespace_separator} is set to a space character
64(\character{ }) and the following document is parsed:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000065
66\begin{verbatim}
67<?xml version="1.0"?>
68<root xmlns = "http://default-namespace.org/"
69 xmlns:py = "http://www.python.org/ns/">
70 <py:elem1 />
71 <elem2 xmlns="" />
72</root>
73\end{verbatim}
74
Fred Draked79c33a2000-09-25 14:14:30 +000075\function{StartElementHandler()} will receive the following strings
76for each element:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000077
78\begin{verbatim}
79http://default-namespace.org/ root
80http://www.python.org/ns/ elem1
81elem2
82\end{verbatim}
83
84\end{funcdesc}
85
86\class{xmlparser} objects have the following methods:
87
Fred Drake2fef3ab2000-11-28 06:38:22 +000088\begin{methoddesc}[xmlparser]{Parse}{data\optional{, isfinal}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000089Parses the contents of the string \var{data}, calling the appropriate
90handler functions to process the parsed data. \var{isfinal} must be
Fred Drakec05cbb02000-07-05 02:03:34 +000091true on the final call to this method. \var{data} can be the empty
92string at any time.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000093\end{methoddesc}
94
Fred Drakeefffe8e2000-10-29 05:10:30 +000095\begin{methoddesc}[xmlparser]{ParseFile}{file}
Parses XML data read from the object \var{file}. \var{file} only
needs to provide the \method{read(\var{nbytes})} method, returning the
empty string when there's no more data.
99\end{methoddesc}
100
Fred Drakeefffe8e2000-10-29 05:10:30 +0000101\begin{methoddesc}[xmlparser]{SetBase}{base}
Sets the base to be used for resolving relative URIs in system identifiers in
declarations. Resolving relative identifiers is left to the application:
this value will be passed through as the \var{base} argument to the
\function{ExternalEntityRefHandler()}, \function{NotationDeclHandler()},
and \function{UnparsedEntityDeclHandler()} functions.
107\end{methoddesc}
108
Fred Drakeefffe8e2000-10-29 05:10:30 +0000109\begin{methoddesc}[xmlparser]{GetBase}{}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000110Returns a string containing the base set by a previous call to
111\method{SetBase()}, or \code{None} if
112\method{SetBase()} hasn't been called.
113\end{methoddesc}
114
Fred Drakeefffe8e2000-10-29 05:10:30 +0000115
Fred Draked79c33a2000-09-25 14:14:30 +0000116\class{xmlparser} objects have the following attributes:
Andrew M. Kuchling0690c862000-08-17 23:15:21 +0000117
Fred Drakeefffe8e2000-10-29 05:10:30 +0000118\begin{memberdesc}[xmlparser]{returns_unicode}
Andrew M. Kuchling0690c862000-08-17 23:15:21 +0000119If this attribute is set to 1, the handler functions will be passed
120Unicode strings. If \member{returns_unicode} is 0, 8-bit strings
121containing UTF-8 encoded data will be passed to the handlers.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000122\end{memberdesc}
Andrew M. Kuchling0690c862000-08-17 23:15:21 +0000123
The following attributes contain values relating to the most recent
error encountered by an \class{xmlparser} object, and will only have
correct values once a call to \method{Parse()} or \method{ParseFile()}
has raised an \exception{xml.parsers.expat.error} exception.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000128
Fred Drakeefffe8e2000-10-29 05:10:30 +0000129\begin{memberdesc}[xmlparser]{ErrorByteIndex}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000130Byte index at which an error occurred.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000131\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000132
Fred Drakeefffe8e2000-10-29 05:10:30 +0000133\begin{memberdesc}[xmlparser]{ErrorCode}
Numeric code specifying the problem. This value can be passed to the
\function{ErrorString()} function, or compared to one of the constants
defined in the \code{errors} object.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000137\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000138
Fred Drakeefffe8e2000-10-29 05:10:30 +0000139\begin{memberdesc}[xmlparser]{ErrorColumnNumber}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000140Column number at which an error occurred.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000141\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000142
Fred Drakeefffe8e2000-10-29 05:10:30 +0000143\begin{memberdesc}[xmlparser]{ErrorLineNumber}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000144Line number at which an error occurred.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000145\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000146
147Here is the list of handlers that can be set. To set a handler on an
Fred Drakec05cbb02000-07-05 02:03:34 +0000148\class{xmlparser} object \var{o}, use
149\code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must
150be taken from the following list, and \var{func} must be a callable
151object accepting the correct number of arguments. The arguments are
152all strings, unless otherwise stated.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000153
Fred Drakeefffe8e2000-10-29 05:10:30 +0000154\begin{methoddesc}[xmlparser]{StartElementHandler}{name, attributes}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000155Called for the start of every element. \var{name} is a string
156containing the element name, and \var{attributes} is a dictionary
157mapping attribute names to their values.
158\end{methoddesc}
159
Fred Drakeefffe8e2000-10-29 05:10:30 +0000160\begin{methoddesc}[xmlparser]{EndElementHandler}{name}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000161Called for the end of every element.
162\end{methoddesc}
163
Fred Drakeefffe8e2000-10-29 05:10:30 +0000164\begin{methoddesc}[xmlparser]{ProcessingInstructionHandler}{target, data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000165Called for every processing instruction.
166\end{methoddesc}
167
Fred Drakeefffe8e2000-10-29 05:10:30 +0000168\begin{methoddesc}[xmlparser]{CharacterDataHandler}{data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000169Called for character data.
170\end{methoddesc}
171
Fred Drakeefffe8e2000-10-29 05:10:30 +0000172\begin{methoddesc}[xmlparser]{UnparsedEntityDeclHandler}{entityName, base,
173 systemId, publicId,
174 notationName}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000175Called for unparsed (NDATA) entity declarations.
176\end{methoddesc}
177
Fred Drakeefffe8e2000-10-29 05:10:30 +0000178\begin{methoddesc}[xmlparser]{NotationDeclHandler}{notationName, base,
179 systemId, publicId}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000180Called for notation declarations.
181\end{methoddesc}
182
Fred Drakeefffe8e2000-10-29 05:10:30 +0000183\begin{methoddesc}[xmlparser]{StartNamespaceDeclHandler}{prefix, uri}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000184Called when an element contains a namespace declaration.
185\end{methoddesc}
186
Fred Drakeefffe8e2000-10-29 05:10:30 +0000187\begin{methoddesc}[xmlparser]{EndNamespaceDeclHandler}{prefix}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000188Called when the closing tag is reached for an element
189that contained a namespace declaration.
190\end{methoddesc}
191
Fred Drakeefffe8e2000-10-29 05:10:30 +0000192\begin{methoddesc}[xmlparser]{CommentHandler}{data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000193Called for comments.
194\end{methoddesc}
195
Fred Drakeefffe8e2000-10-29 05:10:30 +0000196\begin{methoddesc}[xmlparser]{StartCdataSectionHandler}{}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000197Called at the start of a CDATA section.
198\end{methoddesc}
199
Fred Drakeefffe8e2000-10-29 05:10:30 +0000200\begin{methoddesc}[xmlparser]{EndCdataSectionHandler}{}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000201Called at the end of a CDATA section.
202\end{methoddesc}
203
Fred Drakeefffe8e2000-10-29 05:10:30 +0000204\begin{methoddesc}[xmlparser]{DefaultHandler}{data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000205Called for any characters in the XML document for
206which no applicable handler has been specified. This means
207characters that are part of a construct which could be reported, but
208for which no handler has been supplied.
209\end{methoddesc}
210
Fred Drakeefffe8e2000-10-29 05:10:30 +0000211\begin{methoddesc}[xmlparser]{DefaultHandlerExpand}{data}
This is the same as the \function{DefaultHandler()} handler,
but doesn't inhibit expansion of internal entities.
The entity reference will not be passed to the default handler.
215\end{methoddesc}
216
Fred Drakeefffe8e2000-10-29 05:10:30 +0000217\begin{methoddesc}[xmlparser]{NotStandaloneHandler}{}
Fred Draked79c33a2000-09-25 14:14:30 +0000218Called if the XML document hasn't been declared as being a standalone
219document.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000220\end{methoddesc}
221
Fred Drakeefffe8e2000-10-29 05:10:30 +0000222\begin{methoddesc}[xmlparser]{ExternalEntityRefHandler}{context, base,
223 systemId, publicId}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000224Called for references to external entities.
225\end{methoddesc}
226
227
Fred Drake7fbc85c2000-09-23 04:47:56 +0000228\subsection{Example \label{expat-example}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000229
Fred Drakec05cbb02000-07-05 02:03:34 +0000230The following program defines three handlers that just print out their
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000231arguments.
232
233\begin{verbatim}
Fred Drake7fbc85c2000-09-23 04:47:56 +0000234import xml.parsers.expat
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000235
236# 3 handler functions
237def start_element(name, attrs):
238 print 'Start element:', name, attrs
239def end_element(name):
240 print 'End element:', name
241def char_data(data):
242 print 'Character data:', repr(data)
243
Fred Drake7fbc85c2000-09-23 04:47:56 +0000244p = xml.parsers.expat.ParserCreate()
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000245
246p.StartElementHandler = start_element
Fred Drake7fbc85c2000-09-23 04:47:56 +0000247p.EndElementHandler = end_element
248p.CharacterDataHandler = char_data
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000249
250p.Parse("""<?xml version="1.0"?>
251<parent id="top"><child1 name="paul">Text goes here</child1>
252<child2 name="fred">More text</child2>
253</parent>""")
254\end{verbatim}
255
256The output from this program is:
257
258\begin{verbatim}
259Start element: parent {'id': 'top'}
260Start element: child1 {'name': 'paul'}
261Character data: 'Text goes here'
262End element: child1
263Character data: '\012'
264Start element: child2 {'name': 'fred'}
265Character data: 'More text'
266End element: child2
267Character data: '\012'
268End element: parent
269\end{verbatim}
Fred Drakec05cbb02000-07-05 02:03:34 +0000270
271
Fred Drake7fbc85c2000-09-23 04:47:56 +0000272\subsection{Expat error constants \label{expat-errors}}
Fred Drakec05cbb02000-07-05 02:03:34 +0000273\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
274
This section lists the error constants in the
\code{errors} object of the \module{xml.parsers.expat} module. These
constants are useful in interpreting some of the attributes of the
parser object after an error has occurred.
Fred Drakec05cbb02000-07-05 02:03:34 +0000279
Fred Drake7fbc85c2000-09-23 04:47:56 +0000280The \code{errors} object has the following attributes:
Fred Drakec05cbb02000-07-05 02:03:34 +0000281
Fred Drakeacab3d62000-07-11 16:30:30 +0000282\begin{datadesc}{XML_ERROR_ASYNC_ENTITY}
283\end{datadesc}
284
285\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
286\end{datadesc}
287
288\begin{datadesc}{XML_ERROR_BAD_CHAR_REF}
289\end{datadesc}
290
291\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF}
292\end{datadesc}
293
294\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE}
295An attribute was used more than once in a start tag.
296\end{datadesc}
297
298\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING}
299\end{datadesc}
300
301\begin{datadesc}{XML_ERROR_INVALID_TOKEN}
302\end{datadesc}
303
304\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
305Something other than whitespace occurred after the document element.
306\end{datadesc}
307
308\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI}
309\end{datadesc}
310
311\begin{datadesc}{XML_ERROR_NO_ELEMENTS}
312\end{datadesc}
313
314\begin{datadesc}{XML_ERROR_NO_MEMORY}
315Expat was not able to allocate memory internally.
316\end{datadesc}
317
318\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF}
319\end{datadesc}
320
321\begin{datadesc}{XML_ERROR_PARTIAL_CHAR}
322\end{datadesc}
323
324\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF}
325\end{datadesc}
326
327\begin{datadesc}{XML_ERROR_SYNTAX}
328Some unspecified syntax error was encountered.
329\end{datadesc}
330
331\begin{datadesc}{XML_ERROR_TAG_MISMATCH}
332An end tag did not match the innermost open start tag.
333\end{datadesc}
334
335\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN}
336\end{datadesc}
337
338\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY}
A reference was made to an entity which was not defined.
340\end{datadesc}
341
342\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING}
343The document encoding is not supported by Expat.
344\end{datadesc}