blob: 78434b4fc93a692abac3a8908ea03df2a7552b8c [file] [log] [blame]
Fred Drake7fbc85c2000-09-23 04:47:56 +00001\section{\module{xml.parsers.expat} ---
2 Fast XML parsing using the Expat library}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00003
Fred Drake7fbc85c2000-09-23 04:47:56 +00004\declaremodule{standard}{xml.parsers.expat}
5\modulesynopsis{An interface to the Expat non-validating XML parser.}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00006\moduleauthor{Paul Prescod}{paul@prescod.net}
7\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
8
Fred Drake7fbc85c2000-09-23 04:47:56 +00009\versionadded{2.0}
10
11The \module{xml.parsers.expat} module is a Python interface to the Expat
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000012non-validating XML parser.
13The module provides a single extension type, \class{xmlparser}, that
14represents the current state of an XML parser. After an
15\class{xmlparser} object has been created, various attributes of the object
16can be set to handler functions. When an XML document is then fed to
17the parser, the handler functions are called for the character data
18and markup in the XML document.
Fred Drake7fbc85c2000-09-23 04:47:56 +000019
20This module uses the \module{pyexpat}\refbimodindex{pyexpat} module to
21provide access to the Expat parser. Direct use of the
22\module{pyexpat} module is deprecated.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000023
Fred Drake7fbc85c2000-09-23 04:47:56 +000024The \module{xml.parsers.expat} module contains two functions:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000025
26\begin{funcdesc}{ErrorString}{errno}
27Returns an explanatory string for a given error number \var{errno}.
28\end{funcdesc}
29
\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{, namespace_separator}}}
31Creates and returns a new \class{xmlparser} object.
32\var{encoding}, if specified, must be a string naming the encoding
33used by the XML data. Expat doesn't support as many encodings as
34Python does, and its repertoire of encodings can't be extended; it
35supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
36
37% XXX pyexpat.c should only allow a 1-char string for this parameter
38Expat can optionally do XML namespace processing for you, enabled by
39providing a value for \var{namespace_separator}. When namespace
40processing is enabled, element type names and attribute names that
41belong to a namespace will be expanded. The element name
42passed to the element handlers
43\function{StartElementHandler()} and \function{EndElementHandler()}
44will be the concatenation of the namespace URI, the namespace
45separator character, and the local part of the name. If the namespace
46separator is a zero byte (\code{chr(0)})
47then the namespace URI and the local part will be
48concatenated without any separator.
49
For example, if \var{namespace_separator} is set to a space character
(\samp{ }), and the following document is parsed:
52
53\begin{verbatim}
54<?xml version="1.0"?>
55<root xmlns = "http://default-namespace.org/"
56 xmlns:py = "http://www.python.org/ns/">
57 <py:elem1 />
58 <elem2 xmlns="" />
59</root>
60\end{verbatim}
61
Fred Draked79c33a2000-09-25 14:14:30 +000062\function{StartElementHandler()} will receive the following strings
63for each element:
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000064
65\begin{verbatim}
66http://default-namespace.org/ root
67http://www.python.org/ns/ elem1
68elem2
69\end{verbatim}
70
71\end{funcdesc}
72
73\class{xmlparser} objects have the following methods:
74
75\begin{methoddesc}{Parse}{data \optional{, isfinal}}
76Parses the contents of the string \var{data}, calling the appropriate
77handler functions to process the parsed data. \var{isfinal} must be
Fred Drakec05cbb02000-07-05 02:03:34 +000078true on the final call to this method. \var{data} can be the empty
79string at any time.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000080\end{methoddesc}
81
82\begin{methoddesc}{ParseFile}{file}
Parses XML data read from the object \var{file}. \var{file} only
needs to provide the \method{read(\var{nbytes})} method, returning the
empty string when there's no more data.
86\end{methoddesc}
87
88\begin{methoddesc}{SetBase}{base}
89Sets the base to be used for resolving relative URIs in system identifiers in
90declarations. Resolving relative identifiers is left to the application:
this value will be passed through as the \var{base} argument to the
\function{ExternalEntityRefHandler()}, \function{NotationDeclHandler()},
and \function{UnparsedEntityDeclHandler()} functions.
94\end{methoddesc}
95
96\begin{methoddesc}{GetBase}{}
97Returns a string containing the base set by a previous call to
98\method{SetBase()}, or \code{None} if
99\method{SetBase()} hasn't been called.
100\end{methoddesc}
101
Fred Draked79c33a2000-09-25 14:14:30 +0000102\class{xmlparser} objects have the following attributes:
Andrew M. Kuchling0690c862000-08-17 23:15:21 +0000103
104\begin{datadesc}{returns_unicode}
105If this attribute is set to 1, the handler functions will be passed
106Unicode strings. If \member{returns_unicode} is 0, 8-bit strings
107containing UTF-8 encoded data will be passed to the handlers.
108\end{datadesc}
109
The following attributes contain values relating to the most recent
error encountered by an \class{xmlparser} object, and will only have
correct values once a call to \method{Parse()} or \method{ParseFile()}
has raised an \exception{xml.parsers.expat.error} exception.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000114
115\begin{datadesc}{ErrorByteIndex}
116Byte index at which an error occurred.
117\end{datadesc}
118
119\begin{datadesc}{ErrorCode}
Numeric code specifying the problem. This value can be passed to the
\function{ErrorString()} function, or compared to one of the constants
defined in the \code{errors} object (see section~\ref{expat-errors}).
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000123\end{datadesc}
124
125\begin{datadesc}{ErrorColumnNumber}
126Column number at which an error occurred.
127\end{datadesc}
128
129\begin{datadesc}{ErrorLineNumber}
130Line number at which an error occurred.
131\end{datadesc}
132
133Here is the list of handlers that can be set. To set a handler on an
Fred Drakec05cbb02000-07-05 02:03:34 +0000134\class{xmlparser} object \var{o}, use
135\code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must
136be taken from the following list, and \var{func} must be a callable
137object accepting the correct number of arguments. The arguments are
138all strings, unless otherwise stated.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000139
140\begin{methoddesc}{StartElementHandler}{name, attributes}
141Called for the start of every element. \var{name} is a string
142containing the element name, and \var{attributes} is a dictionary
143mapping attribute names to their values.
144\end{methoddesc}
145
146\begin{methoddesc}{EndElementHandler}{name}
147Called for the end of every element.
148\end{methoddesc}
149
150\begin{methoddesc}{ProcessingInstructionHandler}{target, data}
151Called for every processing instruction.
152\end{methoddesc}
153
\begin{methoddesc}{CharacterDataHandler}{data}
155Called for character data.
156\end{methoddesc}
157
Fred Draked79c33a2000-09-25 14:14:30 +0000158\begin{methoddesc}{UnparsedEntityDeclHandler}{entityName, base,
159 systemId, publicId,
160 notationName}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000161Called for unparsed (NDATA) entity declarations.
162\end{methoddesc}
163
Fred Draked79c33a2000-09-25 14:14:30 +0000164\begin{methoddesc}{NotationDeclHandler}{notationName, base, systemId,
165 publicId}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000166Called for notation declarations.
167\end{methoddesc}
168
169\begin{methoddesc}{StartNamespaceDeclHandler}{prefix, uri}
170Called when an element contains a namespace declaration.
171\end{methoddesc}
172
173\begin{methoddesc}{EndNamespaceDeclHandler}{prefix}
174Called when the closing tag is reached for an element
175that contained a namespace declaration.
176\end{methoddesc}
177
178\begin{methoddesc}{CommentHandler}{data}
179Called for comments.
180\end{methoddesc}
181
182\begin{methoddesc}{StartCdataSectionHandler}{}
183Called at the start of a CDATA section.
184\end{methoddesc}
185
186\begin{methoddesc}{EndCdataSectionHandler}{}
187Called at the end of a CDATA section.
188\end{methoddesc}
189
190\begin{methoddesc}{DefaultHandler}{data}
191Called for any characters in the XML document for
192which no applicable handler has been specified. This means
193characters that are part of a construct which could be reported, but
194for which no handler has been supplied.
195\end{methoddesc}
196
197\begin{methoddesc}{DefaultHandlerExpand}{data}
This is the same as the \function{DefaultHandler()},
but doesn't inhibit expansion of internal entities.
The entity reference will not be passed to the default handler.
201\end{methoddesc}
202
203\begin{methoddesc}{NotStandaloneHandler}{}
Fred Draked79c33a2000-09-25 14:14:30 +0000204Called if the XML document hasn't been declared as being a standalone
205document.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000206\end{methoddesc}
207
Fred Draked79c33a2000-09-25 14:14:30 +0000208\begin{methoddesc}{ExternalEntityRefHandler}{context, base, systemId,
209 publicId}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000210Called for references to external entities.
211\end{methoddesc}
212
213
Fred Drake7fbc85c2000-09-23 04:47:56 +0000214\subsection{Example \label{expat-example}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000215
Fred Drakec05cbb02000-07-05 02:03:34 +0000216The following program defines three handlers that just print out their
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000217arguments.
218
219\begin{verbatim}
Fred Drake7fbc85c2000-09-23 04:47:56 +0000220import xml.parsers.expat
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000221
222# 3 handler functions
223def start_element(name, attrs):
224 print 'Start element:', name, attrs
225def end_element(name):
226 print 'End element:', name
227def char_data(data):
228 print 'Character data:', repr(data)
229
Fred Drake7fbc85c2000-09-23 04:47:56 +0000230p = xml.parsers.expat.ParserCreate()
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000231
232p.StartElementHandler = start_element
Fred Drake7fbc85c2000-09-23 04:47:56 +0000233p.EndElementHandler = end_element
234p.CharacterDataHandler = char_data
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000235
236p.Parse("""<?xml version="1.0"?>
237<parent id="top"><child1 name="paul">Text goes here</child1>
238<child2 name="fred">More text</child2>
239</parent>""")
240\end{verbatim}
241
242The output from this program is:
243
244\begin{verbatim}
245Start element: parent {'id': 'top'}
246Start element: child1 {'name': 'paul'}
247Character data: 'Text goes here'
248End element: child1
249Character data: '\012'
250Start element: child2 {'name': 'fred'}
251Character data: 'More text'
252End element: child2
253Character data: '\012'
254End element: parent
255\end{verbatim}
Fred Drakec05cbb02000-07-05 02:03:34 +0000256
257
Fred Drake7fbc85c2000-09-23 04:47:56 +0000258\subsection{Expat error constants \label{expat-errors}}
Fred Drakec05cbb02000-07-05 02:03:34 +0000259\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
260
The following list describes the error constants available in the
\code{errors} object of the \module{xml.parsers.expat} module. These
constants are useful in interpreting some of the attributes of the
parser object after an error has occurred.
Fred Drakec05cbb02000-07-05 02:03:34 +0000265
Fred Drake7fbc85c2000-09-23 04:47:56 +0000266The \code{errors} object has the following attributes:
Fred Drakec05cbb02000-07-05 02:03:34 +0000267
Fred Drakeacab3d62000-07-11 16:30:30 +0000268\begin{datadesc}{XML_ERROR_ASYNC_ENTITY}
269\end{datadesc}
270
271\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
272\end{datadesc}
273
274\begin{datadesc}{XML_ERROR_BAD_CHAR_REF}
275\end{datadesc}
276
277\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF}
278\end{datadesc}
279
280\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE}
281An attribute was used more than once in a start tag.
282\end{datadesc}
283
284\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING}
285\end{datadesc}
286
287\begin{datadesc}{XML_ERROR_INVALID_TOKEN}
288\end{datadesc}
289
290\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
291Something other than whitespace occurred after the document element.
292\end{datadesc}
293
294\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI}
295\end{datadesc}
296
297\begin{datadesc}{XML_ERROR_NO_ELEMENTS}
298\end{datadesc}
299
300\begin{datadesc}{XML_ERROR_NO_MEMORY}
301Expat was not able to allocate memory internally.
302\end{datadesc}
303
304\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF}
305\end{datadesc}
306
307\begin{datadesc}{XML_ERROR_PARTIAL_CHAR}
308\end{datadesc}
309
310\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF}
311\end{datadesc}
312
313\begin{datadesc}{XML_ERROR_SYNTAX}
314Some unspecified syntax error was encountered.
315\end{datadesc}
316
317\begin{datadesc}{XML_ERROR_TAG_MISMATCH}
318An end tag did not match the innermost open start tag.
319\end{datadesc}
320
321\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN}
322\end{datadesc}
323
324\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY}
A reference was made to an entity that was not defined.
326\end{datadesc}
327
328\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING}
329The document encoding is not supported by Expat.
330\end{datadesc}