blob: 038e555d38bbf03a7bf4f7abcaeaa31d9be60762 [file] [log] [blame]
Fred Drake7fbc85c2000-09-23 04:47:56 +00001\section{\module{xml.parsers.expat} ---
Fred Drakeefffe8e2000-10-29 05:10:30 +00002 Fast XML parsing using Expat}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00003
Fred Drake7fbc85c2000-09-23 04:47:56 +00004\declaremodule{standard}{xml.parsers.expat}
5\modulesynopsis{An interface to the Expat non-validating XML parser.}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00006\moduleauthor{Paul Prescod}{paul@prescod.net}
7\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
8
Fred Drake7fbc85c2000-09-23 04:47:56 +00009\versionadded{2.0}
10
Fred Drakeefffe8e2000-10-29 05:10:30 +000011The \module{xml.parsers.expat} module is a Python interface to the
12Expat\index{Expat} non-validating XML parser.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000013The module provides a single extension type, \class{xmlparser}, that
14represents the current state of an XML parser. After an
15\class{xmlparser} object has been created, various attributes of the object
16can be set to handler functions. When an XML document is then fed to
17the parser, the handler functions are called for the character data
18and markup in the XML document.
Fred Drake7fbc85c2000-09-23 04:47:56 +000019
20This module uses the \module{pyexpat}\refbimodindex{pyexpat} module to
21provide access to the Expat parser. Direct use of the
22\module{pyexpat} module is deprecated.
Fred Drakeefffe8e2000-10-29 05:10:30 +000023
24This module provides one exception and one type object:
25
26\begin{excdesc}{error}
27 The exception raised when Expat reports an error.
28\end{excdesc}
29
30\begin{datadesc}{XMLParserType}
31 The type of the return values from the \function{ParserCreate()}
32 function.
33\end{datadesc}
34
35
Fred Drake7fbc85c2000-09-23 04:47:56 +000036The \module{xml.parsers.expat} module contains two functions:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000037
38\begin{funcdesc}{ErrorString}{errno}
39Returns an explanatory string for a given error number \var{errno}.
40\end{funcdesc}
41
Fred Drakeefffe8e2000-10-29 05:10:30 +000042\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{,
43 namespace_separator}}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000044Creates and returns a new \class{xmlparser} object.
45\var{encoding}, if specified, must be a string naming the encoding
46used by the XML data. Expat doesn't support as many encodings as
47Python does, and its repertoire of encodings can't be extended; it
48supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
49
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000050Expat can optionally do XML namespace processing for you, enabled by
Fred Drakeefffe8e2000-10-29 05:10:30 +000051providing a value for \var{namespace_separator}. The value must be a
52one-character string; a \exception{ValueError} will be raised if the
53string has an illegal length (\code{None} is considered the same as
54omission). When namespace processing is enabled, element type names
55and attribute names that belong to a namespace will be expanded. The
56element name passed to the element handlers
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000057\function{StartElementHandler()} and \function{EndElementHandler()}
58will be the concatenation of the namespace URI, the namespace
59separator character, and the local part of the name. If the namespace
Fred Drakeefffe8e2000-10-29 05:10:30 +000060separator is a zero byte (\code{chr(0)}) then the namespace URI and
61the local part will be concatenated without any separator.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000062
Fred Drake2fef3ab2000-11-28 06:38:22 +000063For example, if \var{namespace_separator} is set to a space character
64(\character{ }) and the following document is parsed:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000065
66\begin{verbatim}
67<?xml version="1.0"?>
68<root xmlns = "http://default-namespace.org/"
69 xmlns:py = "http://www.python.org/ns/">
70 <py:elem1 />
71 <elem2 xmlns="" />
72</root>
73\end{verbatim}
74
Fred Draked79c33a2000-09-25 14:14:30 +000075\function{StartElementHandler()} will receive the following strings
76for each element:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000077
78\begin{verbatim}
79http://default-namespace.org/ root
80http://www.python.org/ns/ elem1
81elem2
82\end{verbatim}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000083\end{funcdesc}
84
Fred Drakef08cbb12000-12-23 22:19:05 +000085
86\subsection{XMLParser Objects \label{xmlparser-objects}}
87
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000088\class{xmlparser} objects have the following methods:
89
Fred Drake2fef3ab2000-11-28 06:38:22 +000090\begin{methoddesc}[xmlparser]{Parse}{data\optional{, isfinal}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000091Parses the contents of the string \var{data}, calling the appropriate
92handler functions to process the parsed data. \var{isfinal} must be
Fred Drakef08cbb12000-12-23 22:19:05 +000093true on the final call to this method. \var{data} can be the empty
Fred Drakec05cbb02000-07-05 02:03:34 +000094string at any time.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000095\end{methoddesc}
96
Fred Drakeefffe8e2000-10-29 05:10:30 +000097\begin{methoddesc}[xmlparser]{ParseFile}{file}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000098Parse XML data reading from the object \var{file}. \var{file} only
99needs to provide the \method{read(\var{nbytes})} method, returning the
100empty string when there's no more data.
101\end{methoddesc}
102
Fred Drakeefffe8e2000-10-29 05:10:30 +0000103\begin{methoddesc}[xmlparser]{SetBase}{base}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000104Sets the base to be used for resolving relative URIs in system identifiers in
105declarations. Resolving relative identifiers is left to the application:
this value will be passed through as the \var{base} argument to the
\function{ExternalEntityRefHandler()}, \function{NotationDeclHandler()},
and \function{UnparsedEntityDeclHandler()} functions.
109\end{methoddesc}
110
Fred Drakeefffe8e2000-10-29 05:10:30 +0000111\begin{methoddesc}[xmlparser]{GetBase}{}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000112Returns a string containing the base set by a previous call to
113\method{SetBase()}, or \code{None} if
114\method{SetBase()} hasn't been called.
115\end{methoddesc}
116
Fred Drakef08cbb12000-12-23 22:19:05 +0000117\begin{methoddesc}[xmlparser]{ExternalEntityParserCreate}{context\optional{,
118 encoding}}
119Create a ``child'' parser which can be used to parse an external
120parsed entity referred to by content parsed by the parent parser. The
\var{context} parameter should be the string passed to the
122\method{ExternalEntityRefHandler()} handler function, described below.
123\end{methoddesc}
124
Fred Drakeefffe8e2000-10-29 05:10:30 +0000125
Fred Draked79c33a2000-09-25 14:14:30 +0000126\class{xmlparser} objects have the following attributes:
Andrew M. Kuchling0690c862000-08-17 23:15:21 +0000127
Fred Drakeefffe8e2000-10-29 05:10:30 +0000128\begin{memberdesc}[xmlparser]{returns_unicode}
Andrew M. Kuchling0690c862000-08-17 23:15:21 +0000129If this attribute is set to 1, the handler functions will be passed
130Unicode strings. If \member{returns_unicode} is 0, 8-bit strings
131containing UTF-8 encoded data will be passed to the handlers.
Fred Drakeb62966c2000-12-07 00:00:21 +0000132\versionchanged[Can be changed at any time to affect the result
133 type.]{1.6}
Fred Drakeefffe8e2000-10-29 05:10:30 +0000134\end{memberdesc}
Andrew M. Kuchling0690c862000-08-17 23:15:21 +0000135
136The following attributes contain values relating to the most recent
137error encountered by an \class{xmlparser} object, and will only have
138correct values once a call to \method{Parse()} or \method{ParseFile()}
Fred Drake7fbc85c2000-09-23 04:47:56 +0000139has raised a \exception{xml.parsers.expat.error} exception.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000140
Fred Drakeefffe8e2000-10-29 05:10:30 +0000141\begin{memberdesc}[xmlparser]{ErrorByteIndex}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000142Byte index at which an error occurred.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000143\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000144
Fred Drakeefffe8e2000-10-29 05:10:30 +0000145\begin{memberdesc}[xmlparser]{ErrorCode}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000146Numeric code specifying the problem. This value can be passed to the
147\function{ErrorString()} function, or compared to one of the constants
Fred Drake7fbc85c2000-09-23 04:47:56 +0000148defined in the \module{errors} object.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000149\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000150
Fred Drakeefffe8e2000-10-29 05:10:30 +0000151\begin{memberdesc}[xmlparser]{ErrorColumnNumber}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000152Column number at which an error occurred.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000153\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000154
Fred Drakeefffe8e2000-10-29 05:10:30 +0000155\begin{memberdesc}[xmlparser]{ErrorLineNumber}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000156Line number at which an error occurred.
Fred Drakeefffe8e2000-10-29 05:10:30 +0000157\end{memberdesc}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000158
159Here is the list of handlers that can be set. To set a handler on an
Fred Drakec05cbb02000-07-05 02:03:34 +0000160\class{xmlparser} object \var{o}, use
161\code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must
162be taken from the following list, and \var{func} must be a callable
163object accepting the correct number of arguments. The arguments are
164all strings, unless otherwise stated.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000165
Fred Drakeefffe8e2000-10-29 05:10:30 +0000166\begin{methoddesc}[xmlparser]{StartElementHandler}{name, attributes}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000167Called for the start of every element. \var{name} is a string
168containing the element name, and \var{attributes} is a dictionary
169mapping attribute names to their values.
170\end{methoddesc}
171
Fred Drakeefffe8e2000-10-29 05:10:30 +0000172\begin{methoddesc}[xmlparser]{EndElementHandler}{name}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000173Called for the end of every element.
174\end{methoddesc}
175
Fred Drakeefffe8e2000-10-29 05:10:30 +0000176\begin{methoddesc}[xmlparser]{ProcessingInstructionHandler}{target, data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000177Called for every processing instruction.
178\end{methoddesc}
179
Fred Drakeefffe8e2000-10-29 05:10:30 +0000180\begin{methoddesc}[xmlparser]{CharacterDataHandler}{data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000181Called for character data.
182\end{methoddesc}
183
Fred Drakeefffe8e2000-10-29 05:10:30 +0000184\begin{methoddesc}[xmlparser]{UnparsedEntityDeclHandler}{entityName, base,
185 systemId, publicId,
186 notationName}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000187Called for unparsed (NDATA) entity declarations.
188\end{methoddesc}
189
Fred Drakeefffe8e2000-10-29 05:10:30 +0000190\begin{methoddesc}[xmlparser]{NotationDeclHandler}{notationName, base,
191 systemId, publicId}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000192Called for notation declarations.
193\end{methoddesc}
194
Fred Drakeefffe8e2000-10-29 05:10:30 +0000195\begin{methoddesc}[xmlparser]{StartNamespaceDeclHandler}{prefix, uri}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000196Called when an element contains a namespace declaration.
197\end{methoddesc}
198
Fred Drakeefffe8e2000-10-29 05:10:30 +0000199\begin{methoddesc}[xmlparser]{EndNamespaceDeclHandler}{prefix}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000200Called when the closing tag is reached for an element
201that contained a namespace declaration.
202\end{methoddesc}
203
Fred Drakeefffe8e2000-10-29 05:10:30 +0000204\begin{methoddesc}[xmlparser]{CommentHandler}{data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000205Called for comments.
206\end{methoddesc}
207
Fred Drakeefffe8e2000-10-29 05:10:30 +0000208\begin{methoddesc}[xmlparser]{StartCdataSectionHandler}{}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000209Called at the start of a CDATA section.
210\end{methoddesc}
211
Fred Drakeefffe8e2000-10-29 05:10:30 +0000212\begin{methoddesc}[xmlparser]{EndCdataSectionHandler}{}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000213Called at the end of a CDATA section.
214\end{methoddesc}
215
Fred Drakeefffe8e2000-10-29 05:10:30 +0000216\begin{methoddesc}[xmlparser]{DefaultHandler}{data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000217Called for any characters in the XML document for
218which no applicable handler has been specified. This means
219characters that are part of a construct which could be reported, but
220for which no handler has been supplied.
221\end{methoddesc}
222
Fred Drakeefffe8e2000-10-29 05:10:30 +0000223\begin{methoddesc}[xmlparser]{DefaultHandlerExpand}{data}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000224This is the same as the \function{DefaultHandler},
225but doesn't inhibit expansion of internal entities.
226The entity reference will not be passed to the default handler.
227\end{methoddesc}
228
Fred Drakeefffe8e2000-10-29 05:10:30 +0000229\begin{methoddesc}[xmlparser]{NotStandaloneHandler}{}
Fred Draked79c33a2000-09-25 14:14:30 +0000230Called if the XML document hasn't been declared as being a standalone
231document.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000232\end{methoddesc}
233
Fred Drakeefffe8e2000-10-29 05:10:30 +0000234\begin{methoddesc}[xmlparser]{ExternalEntityRefHandler}{context, base,
235 systemId, publicId}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000236Called for references to external entities.
237\end{methoddesc}
238
239
Fred Drake7fbc85c2000-09-23 04:47:56 +0000240\subsection{Example \label{expat-example}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000241
Fred Drakec05cbb02000-07-05 02:03:34 +0000242The following program defines three handlers that just print out their
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000243arguments.
244
245\begin{verbatim}
Fred Drake7fbc85c2000-09-23 04:47:56 +0000246import xml.parsers.expat
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000247
248# 3 handler functions
249def start_element(name, attrs):
250 print 'Start element:', name, attrs
251def end_element(name):
252 print 'End element:', name
253def char_data(data):
254 print 'Character data:', repr(data)
255
Fred Drake7fbc85c2000-09-23 04:47:56 +0000256p = xml.parsers.expat.ParserCreate()
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000257
258p.StartElementHandler = start_element
Fred Drake7fbc85c2000-09-23 04:47:56 +0000259p.EndElementHandler = end_element
260p.CharacterDataHandler = char_data
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000261
262p.Parse("""<?xml version="1.0"?>
263<parent id="top"><child1 name="paul">Text goes here</child1>
264<child2 name="fred">More text</child2>
265</parent>""")
266\end{verbatim}
267
268The output from this program is:
269
270\begin{verbatim}
271Start element: parent {'id': 'top'}
272Start element: child1 {'name': 'paul'}
273Character data: 'Text goes here'
274End element: child1
275Character data: '\012'
276Start element: child2 {'name': 'fred'}
277Character data: 'More text'
278End element: child2
279Character data: '\012'
280End element: parent
281\end{verbatim}
Fred Drakec05cbb02000-07-05 02:03:34 +0000282
283
Fred Drake7fbc85c2000-09-23 04:47:56 +0000284\subsection{Expat error constants \label{expat-errors}}
Fred Drakec05cbb02000-07-05 02:03:34 +0000285\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
286
This section lists the error constants in the
Fred Drake7fbc85c2000-09-23 04:47:56 +0000288\code{errors} object of the \module{xml.parsers.expat} module. These
289constants are useful in interpreting some of the attributes of the
290parser object after an error has occurred.
Fred Drakec05cbb02000-07-05 02:03:34 +0000291
Fred Drake7fbc85c2000-09-23 04:47:56 +0000292The \code{errors} object has the following attributes:
Fred Drakec05cbb02000-07-05 02:03:34 +0000293
Fred Drakeacab3d62000-07-11 16:30:30 +0000294\begin{datadesc}{XML_ERROR_ASYNC_ENTITY}
295\end{datadesc}
296
297\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
298\end{datadesc}
299
300\begin{datadesc}{XML_ERROR_BAD_CHAR_REF}
301\end{datadesc}
302
303\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF}
304\end{datadesc}
305
306\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE}
307An attribute was used more than once in a start tag.
308\end{datadesc}
309
310\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING}
311\end{datadesc}
312
313\begin{datadesc}{XML_ERROR_INVALID_TOKEN}
314\end{datadesc}
315
316\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
317Something other than whitespace occurred after the document element.
318\end{datadesc}
319
320\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI}
321\end{datadesc}
322
323\begin{datadesc}{XML_ERROR_NO_ELEMENTS}
324\end{datadesc}
325
326\begin{datadesc}{XML_ERROR_NO_MEMORY}
327Expat was not able to allocate memory internally.
328\end{datadesc}
329
330\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF}
331\end{datadesc}
332
333\begin{datadesc}{XML_ERROR_PARTIAL_CHAR}
334\end{datadesc}
335
336\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF}
337\end{datadesc}
338
339\begin{datadesc}{XML_ERROR_SYNTAX}
340Some unspecified syntax error was encountered.
341\end{datadesc}
342
343\begin{datadesc}{XML_ERROR_TAG_MISMATCH}
344An end tag did not match the innermost open start tag.
345\end{datadesc}
346
347\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN}
348\end{datadesc}
349
350\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY}
A reference was made to an entity which was not defined.
352\end{datadesc}
353
354\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING}
355The document encoding is not supported by Expat.
356\end{datadesc}