blob: d663c63cf1d920c23f0dce4a86bff72f326941f5 [file] [log] [blame]
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00001\section{\module{pyexpat} ---
2 Fast XML parsing using the Expat C library}
3
4\declaremodule{builtin}{pyexpat}
5\modulesynopsis{An interface to the Expat XML parser.}
6\moduleauthor{Paul Prescod}{paul@prescod.net}
7\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
8
9The \module{pyexpat} module is a Python interface to the Expat
10non-validating XML parser.
11The module provides a single extension type, \class{xmlparser}, that
12represents the current state of an XML parser. After an
13\class{xmlparser} object has been created, various attributes of the object
14can be set to handler functions. When an XML document is then fed to
15the parser, the handler functions are called for the character data
16and markup in the XML document.
17
18The \module{pyexpat} module contains two functions:
19
20\begin{funcdesc}{ErrorString}{errno}
21Returns an explanatory string for a given error number \var{errno}.
22\end{funcdesc}
23
24\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{, namespace_separator}}}
25Creates and returns a new \class{xmlparser} object.
26\var{encoding}, if specified, must be a string naming the encoding
27used by the XML data. Expat doesn't support as many encodings as
28Python does, and its repertoire of encodings can't be extended; it
29supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
30
31% XXX pyexpat.c should only allow a 1-char string for this parameter
32Expat can optionally do XML namespace processing for you, enabled by
33providing a value for \var{namespace_separator}. When namespace
34processing is enabled, element type names and attribute names that
35belong to a namespace will be expanded. The element name
36passed to the element handlers
37\function{StartElementHandler()} and \function{EndElementHandler()}
38will be the concatenation of the namespace URI, the namespace
39separator character, and the local part of the name. If the namespace
40separator is a zero byte (\code{chr(0)})
41then the namespace URI and the local part will be
42concatenated without any separator.
43
44For example, if \var{namespace_separator} is set to a space character
45(\samp{ }), and the following document is parsed:
46
47\begin{verbatim}
48<?xml version="1.0"?>
49<root xmlns = "http://default-namespace.org/"
50 xmlns:py = "http://www.python.org/ns/">
51 <py:elem1 />
52 <elem2 xmlns="" />
53</root>
54\end{verbatim}
55
56\function{StartElementHandler()} will receive the following strings for each element:
57
58\begin{verbatim}
59http://default-namespace.org/ root
60http://www.python.org/ns/ elem1
61elem2
62\end{verbatim}
63
64\end{funcdesc}
65
66\class{xmlparser} objects have the following methods:
67
68\begin{methoddesc}{Parse}{data \optional{, isfinal}}
69Parses the contents of the string \var{data}, calling the appropriate
70handler functions to process the parsed data. \var{isfinal} must be
Fred Drakec05cbb02000-07-05 02:03:34 +000071true on the final call to this method. \var{data} can be the empty
72string at any time.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +000073\end{methoddesc}
74
75\begin{methoddesc}{ParseFile}{file}
76Parses XML data read from the object \var{file}. \var{file} only
77needs to provide the \method{read(\var{nbytes})} method, returning the
78empty string when there's no more data.
79\end{methoddesc}
80
81\begin{methoddesc}{SetBase}{base}
82Sets the base to be used for resolving relative URIs in system identifiers in
83declarations. Resolving relative identifiers is left to the application:
84this value will be passed through as the \var{base} argument to the
85\function{ExternalEntityRefHandler()}, \function{NotationDeclHandler()},
86and \function{UnparsedEntityDeclHandler()} functions.
87\end{methoddesc}
88
89\begin{methoddesc}{GetBase}{}
90Returns a string containing the base set by a previous call to
91\method{SetBase()}, or \code{None} if
92\method{SetBase()} hasn't been called.
93\end{methoddesc}
94
Andrew M. Kuchling0690c862000-08-17 23:15:21 +000095\class{xmlparser} objects have the following attributes.
96
97\begin{datadesc}{returns_unicode}
98If this attribute is set to 1, the handler functions will be passed
99Unicode strings. If \member{returns_unicode} is 0, 8-bit strings
100containing UTF-8 encoded data will be passed to the handlers.
101\end{datadesc}
102
103The following attributes contain values relating to the most recent
104error encountered by an \class{xmlparser} object, and will only have
105correct values once a call to \method{Parse()} or \method{ParseFile()}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000106has raised a \exception{pyexpat.error} exception.
107
108\begin{datadesc}{ErrorByteIndex}
109Byte index at which an error occurred.
110\end{datadesc}
111
112\begin{datadesc}{ErrorCode}
113Numeric code specifying the problem. This value can be passed to the
114\function{ErrorString()} function, or compared to one of the constants
115defined in the \module{pyexpat.errors} submodule.
116\end{datadesc}
117
118\begin{datadesc}{ErrorColumnNumber}
119Column number at which an error occurred.
120\end{datadesc}
121
122\begin{datadesc}{ErrorLineNumber}
123Line number at which an error occurred.
124\end{datadesc}
125
126Here is the list of handlers that can be set. To set a handler on an
Fred Drakec05cbb02000-07-05 02:03:34 +0000127\class{xmlparser} object \var{o}, use
128\code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must
129be taken from the following list, and \var{func} must be a callable
130object accepting the correct number of arguments. The arguments are
131all strings, unless otherwise stated.
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000132
133\begin{methoddesc}{StartElementHandler}{name, attributes}
134Called for the start of every element. \var{name} is a string
135containing the element name, and \var{attributes} is a dictionary
136mapping attribute names to their values.
137\end{methoddesc}
138
139\begin{methoddesc}{EndElementHandler}{name}
140Called for the end of every element.
141\end{methoddesc}
142
143\begin{methoddesc}{ProcessingInstructionHandler}{target, data}
144Called for every processing instruction.
145\end{methoddesc}
146
147\begin{methoddesc}{CharacterDataHandler}{data}
148Called for character data.
149\end{methoddesc}
150
151\begin{methoddesc}{UnparsedEntityDeclHandler}{entityName, base, systemId, publicId, notationName}
152Called for unparsed (NDATA) entity declarations.
153\end{methoddesc}
154
155\begin{methoddesc}{NotationDeclHandler}{notationName, base, systemId, publicId}
156Called for notation declarations.
157\end{methoddesc}
158
159\begin{methoddesc}{StartNamespaceDeclHandler}{prefix, uri}
160Called when an element contains a namespace declaration.
161\end{methoddesc}
162
163\begin{methoddesc}{EndNamespaceDeclHandler}{prefix}
164Called when the closing tag is reached for an element
165that contained a namespace declaration.
166\end{methoddesc}
167
168\begin{methoddesc}{CommentHandler}{data}
169Called for comments.
170\end{methoddesc}
171
172\begin{methoddesc}{StartCdataSectionHandler}{}
173Called at the start of a CDATA section.
174\end{methoddesc}
175
176\begin{methoddesc}{EndCdataSectionHandler}{}
177Called at the end of a CDATA section.
178\end{methoddesc}
179
180\begin{methoddesc}{DefaultHandler}{data}
181Called for any characters in the XML document for
182which no applicable handler has been specified. This means
183characters that are part of a construct which could be reported, but
184for which no handler has been supplied.
185\end{methoddesc}
186
187\begin{methoddesc}{DefaultHandlerExpand}{data}
188This is the same as the \function{DefaultHandler()},
189but doesn't inhibit expansion of internal entities.
190The entity reference will not be passed to the default handler.
191\end{methoddesc}
192
193\begin{methoddesc}{NotStandaloneHandler}{}
194Called if the XML document hasn't been declared as being a standalone document.
195\end{methoddesc}
196
197\begin{methoddesc}{ExternalEntityRefHandler}{context, base, systemId, publicId}
198Called for references to external entities.
199\end{methoddesc}
200
201
Fred Drakec05cbb02000-07-05 02:03:34 +0000202\subsection{Example \label{pyexpat-example}}
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000203
Fred Drakec05cbb02000-07-05 02:03:34 +0000204The following program defines three handlers that just print out their
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +0000205arguments.
206
207\begin{verbatim}
208
209import pyexpat
210
211# 3 handler functions
212def start_element(name, attrs):
213 print 'Start element:', name, attrs
214def end_element(name):
215 print 'End element:', name
216def char_data(data):
217 print 'Character data:', repr(data)
218
219p=pyexpat.ParserCreate()
220
221p.StartElementHandler = start_element
222p.EndElementHandler = end_element
223p.CharacterDataHandler= char_data
224
225p.Parse("""<?xml version="1.0"?>
226<parent id="top"><child1 name="paul">Text goes here</child1>
227<child2 name="fred">More text</child2>
228</parent>""")
229\end{verbatim}
230
231The output from this program is:
232
233\begin{verbatim}
234Start element: parent {'id': 'top'}
235Start element: child1 {'name': 'paul'}
236Character data: 'Text goes here'
237End element: child1
238Character data: '\012'
239Start element: child2 {'name': 'fred'}
240Character data: 'More text'
241End element: child2
242Character data: '\012'
243End element: parent
244\end{verbatim}
Fred Drakec05cbb02000-07-05 02:03:34 +0000245
246
247\section{\module{pyexpat.errors} --- Error constants}
248
249\declaremodule{builtin}{pyexpat.errors}
250\modulesynopsis{Error constants defined for the Expat parser}
251\moduleauthor{Paul Prescod}{paul@prescod.net}
252\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
253
254This section lists the error constants in the
255\module{pyexpat.errors} submodule, available once the
256\refmodule{pyexpat} module has been imported.
257
258Note that this module cannot be imported directly until
259\refmodule{pyexpat} has been imported.
260
261The following constants are defined:
262
Fred Drakeacab3d62000-07-11 16:30:30 +0000263\begin{datadesc}{XML_ERROR_ASYNC_ENTITY}
264\end{datadesc}
265
266\begin{datadesc}{XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
267\end{datadesc}
268
269\begin{datadesc}{XML_ERROR_BAD_CHAR_REF}
270\end{datadesc}
271
272\begin{datadesc}{XML_ERROR_BINARY_ENTITY_REF}
273\end{datadesc}
274
275\begin{datadesc}{XML_ERROR_DUPLICATE_ATTRIBUTE}
276An attribute was used more than once in a start tag.
277\end{datadesc}
278
279\begin{datadesc}{XML_ERROR_INCORRECT_ENCODING}
280\end{datadesc}
281
282\begin{datadesc}{XML_ERROR_INVALID_TOKEN}
283\end{datadesc}
284
285\begin{datadesc}{XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
286Something other than whitespace occurred after the document element.
287\end{datadesc}
288
289\begin{datadesc}{XML_ERROR_MISPLACED_XML_PI}
290\end{datadesc}
291
292\begin{datadesc}{XML_ERROR_NO_ELEMENTS}
293\end{datadesc}
294
295\begin{datadesc}{XML_ERROR_NO_MEMORY}
296Expat was not able to allocate memory internally.
297\end{datadesc}
298
299\begin{datadesc}{XML_ERROR_PARAM_ENTITY_REF}
300\end{datadesc}
301
302\begin{datadesc}{XML_ERROR_PARTIAL_CHAR}
303\end{datadesc}
304
305\begin{datadesc}{XML_ERROR_RECURSIVE_ENTITY_REF}
306\end{datadesc}
307
308\begin{datadesc}{XML_ERROR_SYNTAX}
309Some unspecified syntax error was encountered.
310\end{datadesc}
311
312\begin{datadesc}{XML_ERROR_TAG_MISMATCH}
313An end tag did not match the innermost open start tag.
314\end{datadesc}
315
316\begin{datadesc}{XML_ERROR_UNCLOSED_TOKEN}
317\end{datadesc}
318
319\begin{datadesc}{XML_ERROR_UNDEFINED_ENTITY}
320A reference was made to an entity that was not defined.
321\end{datadesc}
322
323\begin{datadesc}{XML_ERROR_UNKNOWN_ENCODING}
324The document encoding is not supported by Expat.
325\end{datadesc}
326