blob: fe2086d194f6204776aaf7d2a213b5a866dc5a95 [file] [log] [blame]
\section{\module{xml.parsers.expat} ---
         Fast XML parsing using the Expat library}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00003
Fred Drake7fbc85c2000-09-23 04:47:56 +00004\declaremodule{standard}{xml.parsers.expat}
5\modulesynopsis{An interface to the Expat non-validating XML parser.}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00006\moduleauthor{Paul Prescod}{paul@prescod.net}
7\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
8
Fred Drake7fbc85c2000-09-23 04:47:56 +00009\versionadded{2.0}
10
11The \module{xml.parsers.expat} module is a Python interface to the Expat
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000012non-validating XML parser.
13The module provides a single extension type, \class{xmlparser}, that
14represents the current state of an XML parser. After an
15\class{xmlparser} object has been created, various attributes of the object
16can be set to handler functions. When an XML document is then fed to
17the parser, the handler functions are called for the character data
18and markup in the XML document.
Fred Drake7fbc85c2000-09-23 04:47:56 +000019
20This module uses the \module{pyexpat}\refbimodindex{pyexpat} module to
21provide access to the Expat parser. Direct use of the
22\module{pyexpat} module is deprecated.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000023
Fred Drake7fbc85c2000-09-23 04:47:56 +000024The \module{xml.parsers.expat} module contains two functions:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000025
26\begin{funcdesc}{ErrorString}{errno}
27Returns an explanatory string for a given error number \var{errno}.
28\end{funcdesc}
29
\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{, namespace_separator}}}
31Creates and returns a new \class{xmlparser} object.
32\var{encoding}, if specified, must be a string naming the encoding
33used by the XML data. Expat doesn't support as many encodings as
34Python does, and its repertoire of encodings can't be extended; it
35supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
36
37% XXX pyexpat.c should only allow a 1-char string for this parameter
38Expat can optionally do XML namespace processing for you, enabled by
39providing a value for \var{namespace_separator}. When namespace
40processing is enabled, element type names and attribute names that
41belong to a namespace will be expanded. The element name
42passed to the element handlers
43\function{StartElementHandler()} and \function{EndElementHandler()}
44will be the concatenation of the namespace URI, the namespace
45separator character, and the local part of the name. If the namespace
46separator is a zero byte (\code{chr(0)})
47then the namespace URI and the local part will be
48concatenated without any separator.
49
50For example, if \var{namespace_separator} is set to
51\samp{ }, and the following document is parsed:
52
53\begin{verbatim}
54<?xml version="1.0"?>
55<root xmlns = "http://default-namespace.org/"
56 xmlns:py = "http://www.python.org/ns/">
57 <py:elem1 />
58 <elem2 xmlns="" />
59</root>
60\end{verbatim}
61
62\function{StartElementHandler()} will receive the following strings for each element:
63
64\begin{verbatim}
65http://default-namespace.org/ root
66http://www.python.org/ns/ elem1
67elem2
68\end{verbatim}
69
70\end{funcdesc}
71
72\class{xmlparser} objects have the following methods:
73
74\begin{methoddesc}{Parse}{data \optional{, isfinal}}
75Parses the contents of the string \var{data}, calling the appropriate
76handler functions to process the parsed data. \var{isfinal} must be
Fred Drakec05cbb02000-07-05 02:03:34 +000077true on the final call to this method. \var{data} can be the empty
78string at any time.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000079\end{methoddesc}
80
81\begin{methoddesc}{ParseFile}{file}
Parses XML data read from the object \var{file}.  \var{file} only
needs to provide the \method{read(\var{nbytes})} method, which must
return the empty string when there is no more data.
85\end{methoddesc}
86
87\begin{methoddesc}{SetBase}{base}
Sets the base to be used for resolving relative URIs in system
identifiers in declarations.  Resolving relative identifiers is left to
the application; this value will be passed through as the \var{base}
argument to the \function{ExternalEntityRefHandler()},
\function{NotationDeclHandler()}, and
\function{UnparsedEntityDeclHandler()} functions.
93\end{methoddesc}
94
95\begin{methoddesc}{GetBase}{}
96Returns a string containing the base set by a previous call to
97\method{SetBase()}, or \code{None} if
98\method{SetBase()} hasn't been called.
99\end{methoddesc}
100
\class{xmlparser} objects have the following attributes:
102
103\begin{datadesc}{returns_unicode}
104If this attribute is set to 1, the handler functions will be passed
105Unicode strings. If \member{returns_unicode} is 0, 8-bit strings
106containing UTF-8 encoded data will be passed to the handlers.
107\end{datadesc}
108
The following attributes contain values relating to the most recent
error encountered by an \class{xmlparser} object, and will only have
correct values once a call to \method{Parse()} or \method{ParseFile()}
has raised an \exception{xml.parsers.expat.error} exception.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000113
114\begin{datadesc}{ErrorByteIndex}
115Byte index at which an error occurred.
116\end{datadesc}
117
118\begin{datadesc}{ErrorCode}
119Numeric code specifying the problem. This value can be passed to the
120\function{ErrorString()} function, or compared to one of the constants
Fred Drake7fbc85c2000-09-23 04:47:56 +0000121defined in the \module{errors} object.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000122\end{datadesc}
123
124\begin{datadesc}{ErrorColumnNumber}
125Column number at which an error occurred.
126\end{datadesc}
127
128\begin{datadesc}{ErrorLineNumber}
129Line number at which an error occurred.
130\end{datadesc}
131
132Here is the list of handlers that can be set. To set a handler on an
Fred Drakec05cbb02000-07-05 02:03:34 +0000133\class{xmlparser} object \var{o}, use
134\code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must
135be taken from the following list, and \var{func} must be a callable
136object accepting the correct number of arguments. The arguments are
137all strings, unless otherwise stated.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000138
139\begin{methoddesc}{StartElementHandler}{name, attributes}
140Called for the start of every element. \var{name} is a string
141containing the element name, and \var{attributes} is a dictionary
142mapping attribute names to their values.
143\end{methoddesc}
144
145\begin{methoddesc}{EndElementHandler}{name}
146Called for the end of every element.
147\end{methoddesc}
148
149\begin{methoddesc}{ProcessingInstructionHandler}{target, data}
150Called for every processing instruction.
151\end{methoddesc}
152
\begin{methoddesc}{CharacterDataHandler}{data}
154Called for character data.
155\end{methoddesc}
156
157\begin{methoddesc}{UnparsedEntityDeclHandler}{entityName, base, systemId, publicId, notationName}
158Called for unparsed (NDATA) entity declarations.
159\end{methoddesc}
160
161\begin{methoddesc}{NotationDeclHandler}{notationName, base, systemId, publicId}
162Called for notation declarations.
163\end{methoddesc}
164
165\begin{methoddesc}{StartNamespaceDeclHandler}{prefix, uri}
166Called when an element contains a namespace declaration.
167\end{methoddesc}
168
169\begin{methoddesc}{EndNamespaceDeclHandler}{prefix}
170Called when the closing tag is reached for an element
171that contained a namespace declaration.
172\end{methoddesc}
173
174\begin{methoddesc}{CommentHandler}{data}
175Called for comments.
176\end{methoddesc}
177
178\begin{methoddesc}{StartCdataSectionHandler}{}
179Called at the start of a CDATA section.
180\end{methoddesc}
181
182\begin{methoddesc}{EndCdataSectionHandler}{}
183Called at the end of a CDATA section.
184\end{methoddesc}
185
186\begin{methoddesc}{DefaultHandler}{data}
187Called for any characters in the XML document for
188which no applicable handler has been specified. This means
189characters that are part of a construct which could be reported, but
190for which no handler has been supplied.
191\end{methoddesc}
192
193\begin{methoddesc}{DefaultHandlerExpand}{data}
This is the same as \function{DefaultHandler()},
but doesn't inhibit expansion of internal entities.
The entity reference will not be passed to the default handler.
197\end{methoddesc}
198
199\begin{methoddesc}{NotStandaloneHandler}{}
200Called if the XML document hasn't been declared as being a standalone document.
201\end{methoddesc}
202
203\begin{methoddesc}{ExternalEntityRefHandler}{context, base, systemId, publicId}
204Called for references to external entities.
205\end{methoddesc}
206
207
Fred Drake7fbc85c2000-09-23 04:47:56 +0000208\subsection{Example \label{expat-example}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000209
The following program defines three handlers that just print out their
arguments.
212
213\begin{verbatim}
Fred Drake7fbc85c2000-09-23 04:47:56 +0000214import xml.parsers.expat
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000215
216# 3 handler functions
217def start_element(name, attrs):
218 print 'Start element:', name, attrs
219def end_element(name):
220 print 'End element:', name
221def char_data(data):
222 print 'Character data:', repr(data)
223
Fred Drake7fbc85c2000-09-23 04:47:56 +0000224p = xml.parsers.expat.ParserCreate()
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000225
226p.StartElementHandler = start_element
Fred Drake7fbc85c2000-09-23 04:47:56 +0000227p.EndElementHandler = end_element
228p.CharacterDataHandler = char_data
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000229
230p.Parse("""<?xml version="1.0"?>
231<parent id="top"><child1 name="paul">Text goes here</child1>
232<child2 name="fred">More text</child2>
233</parent>""")
234\end{verbatim}
235
236The output from this program is:
237
238\begin{verbatim}
239Start element: parent {'id': 'top'}
240Start element: child1 {'name': 'paul'}
241Character data: 'Text goes here'
242End element: child1
243Character data: '\012'
244Start element: child2 {'name': 'fred'}
245Character data: 'More text'
246End element: child2
247Character data: '\012'
248End element: parent
249\end{verbatim}
Fred Drakec05cbb02000-07-05 02:03:34 +0000250
251
Fred Drake7fbc85c2000-09-23 04:47:56 +0000252\subsection{Expat error constants \label{expat-errors}}
Fred Drakec05cbb02000-07-05 02:03:34 +0000253\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
254
This section lists the error constants defined in the
\code{errors} object of the \module{xml.parsers.expat} module.  These
constants are useful in interpreting some of the attributes of the
parser object after an error has occurred.
Fred Drakec05cbb02000-07-05 02:03:34 +0000259
Fred Drake7fbc85c2000-09-23 04:47:56 +0000260The \code{errors} object has the following attributes:
Fred Drakec05cbb02000-07-05 02:03:34 +0000261
Fred Drakeacab3d62000-07-11 16:30:30 +0000262\begin{datadesc}{XML_ERROR_ASYNC_ENTITY}
263\end{datadesc}
264
265\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
266\end{datadesc}
267
268\begin{datadesc}{XML_ERROR_BAD_CHAR_REF}
269\end{datadesc}
270
271\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF}
272\end{datadesc}
273
274\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE}
275An attribute was used more than once in a start tag.
276\end{datadesc}
277
278\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING}
279\end{datadesc}
280
281\begin{datadesc}{XML_ERROR_INVALID_TOKEN}
282\end{datadesc}
283
284\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
285Something other than whitespace occurred after the document element.
286\end{datadesc}
287
288\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI}
289\end{datadesc}
290
291\begin{datadesc}{XML_ERROR_NO_ELEMENTS}
292\end{datadesc}
293
294\begin{datadesc}{XML_ERROR_NO_MEMORY}
295Expat was not able to allocate memory internally.
296\end{datadesc}
297
298\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF}
299\end{datadesc}
300
301\begin{datadesc}{XML_ERROR_PARTIAL_CHAR}
302\end{datadesc}
303
304\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF}
305\end{datadesc}
306
307\begin{datadesc}{XML_ERROR_SYNTAX}
308Some unspecified syntax error was encountered.
309\end{datadesc}
310
311\begin{datadesc}{XML_ERROR_TAG_MISMATCH}
312An end tag did not match the innermost open start tag.
313\end{datadesc}
314
315\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN}
316\end{datadesc}
317
318\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY}
A reference was made to an entity which was not defined.
320\end{datadesc}
321
322\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING}
323The document encoding is not supported by Expat.
324\end{datadesc}