blob: c1fe4d1e9b3d46878745204f48c1f86c5ddd923c [file] [log] [blame]
Andrew M. Kuchling6b14eeb2000-06-11 02:42:07 +00001\section{\module{pyexpat} ---
2 Fast XML parsing using the Expat C library}
3
4\declaremodule{builtin}{pyexpat}
5\modulesynopsis{An interface to the Expat XML parser.}
6\moduleauthor{Paul Prescod}{paul@prescod.net}
7\sectionauthor{A.M. Kuchling}{amk1@bigfoot.com}
8
9The \module{pyexpat} module is a Python interface to the Expat
10non-validating XML parser.
11The module provides a single extension type, \class{xmlparser}, that
12represents the current state of an XML parser. After an
13\class{xmlparser} object has been created, various attributes of the object
14can be set to handler functions. When an XML document is then fed to
15the parser, the handler functions are called for the character data
16and markup in the XML document.
17
18The \module{pyexpat} module contains two functions:
19
20\begin{funcdesc}{ErrorString}{errno}
21Returns an explanatory string for a given error number \var{errno}.
22\end{funcdesc}
23
24\begin{funcdesc}{ParserCreate}{\optional{encoding\optional{, namespace_separator}}}
25Creates and returns a new \class{xmlparser} object.
26\var{encoding}, if specified, must be a string naming the encoding
27used by the XML data. Expat doesn't support as many encodings as
28Python does, and its repertoire of encodings can't be extended; it
29supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.
30
31% XXX pyexpat.c should only allow a 1-char string for this parameter
32Expat can optionally do XML namespace processing for you, enabled by
33providing a value for \var{namespace_separator}. When namespace
34processing is enabled, element type names and attribute names that
35belong to a namespace will be expanded. The element name
36passed to the element handlers
37\function{StartElementHandler()} and \function{EndElementHandler()}
38will be the concatenation of the namespace URI, the namespace
39separator character, and the local part of the name. If the namespace
40separator is a zero byte (\code{chr(0)})
41then the namespace URI and the local part will be
42concatenated without any separator.
43
44For example, if \var{namespace_separator} is set to
45\samp{ }, and the following document is parsed:
46
47\begin{verbatim}
48<?xml version="1.0"?>
49<root xmlns = "http://default-namespace.org/"
50 xmlns:py = "http://www.python.org/ns/">
51 <py:elem1 />
52 <elem2 xmlns="" />
53</root>
54\end{verbatim}
55
56\function{StartElementHandler()} will receive the following strings for each element:
57
58\begin{verbatim}
59http://default-namespace.org/ root
60http://www.python.org/ns/ elem1
61elem2
62\end{verbatim}
63
64\end{funcdesc}
65
66\class{xmlparser} objects have the following methods:
67
68\begin{methoddesc}{Parse}{data\optional{, isfinal}}
69Parses the contents of the string \var{data}, calling the appropriate
70handler functions to process the parsed data. \var{isfinal} must be
71true on the final call to this method. \var{data} can be the empty string at any time.
72\end{methoddesc}
73
74\begin{methoddesc}{ParseFile}{file}
75Parses XML data reading from the object \var{file}. \var{file} only
76needs to provide the \method{read(\var{nbytes})} method, returning the
77empty string when there's no more data.
78\end{methoddesc}
79
80\begin{methoddesc}{SetBase}{base}
81Sets the base to be used for resolving relative URIs in system identifiers in
82declarations. Resolving relative identifiers is left to the application:
83this value will be passed through as the base argument to the
84\function{ExternalEntityRefHandler()}, \function{NotationDeclHandler()},
85and \function{UnparsedEntityDeclHandler()} functions.
86\end{methoddesc}
87
88\begin{methoddesc}{GetBase}{}
89Returns a string containing the base set by a previous call to
90\method{SetBase()}, or \code{None} if
91\method{SetBase()} hasn't been called.
92\end{methoddesc}
93
94\class{xmlparser} objects have the following attributes, containing
95values relating to the most recent error encountered by an
96\class{xmlparser} object. These attributes will only have correct
97values once a call to \method{Parse()} or \method{ParseFile()}
98has raised a \exception{pyexpat.error} exception.
99
100\begin{datadesc}{ErrorByteIndex}
101Byte index at which an error occurred.
102\end{datadesc}
103
104\begin{datadesc}{ErrorCode}
105Numeric code specifying the problem. This value can be passed to the
106\function{ErrorString()} function, or compared to one of the constants
107defined in the \module{pyexpat.errors} submodule.
108\end{datadesc}
109
110\begin{datadesc}{ErrorColumnNumber}
111Column number at which an error occurred.
112\end{datadesc}
113
114\begin{datadesc}{ErrorLineNumber}
115Line number at which an error occurred.
116\end{datadesc}
117
118Here is the list of handlers that can be set. To set a handler on an
119\class{xmlparser} object \var{o}, use \code{\var{o}.\var{handlername} = \var{func}}. \var{handlername} must be taken from the following list, and \var{func} must be a callable object accepting the correct number of arguments. The arguments are all strings, unless otherwise stated.
120
121\begin{methoddesc}{StartElementHandler}{name, attributes}
122Called for the start of every element. \var{name} is a string
123containing the element name, and \var{attributes} is a dictionary
124mapping attribute names to their values.
125\end{methoddesc}
126
127\begin{methoddesc}{EndElementHandler}{name}
128Called for the end of every element.
129\end{methoddesc}
130
131\begin{methoddesc}{ProcessingInstructionHandler}{target, data}
132Called for every processing instruction.
133\end{methoddesc}
134
135\begin{methoddesc}{CharacterDataHandler}{data}
136Called for character data.
137\end{methoddesc}
138
139\begin{methoddesc}{UnparsedEntityDeclHandler}{entityName, base, systemId, publicId, notationName}
140Called for unparsed (NDATA) entity declarations.
141\end{methoddesc}
142
143\begin{methoddesc}{NotationDeclHandler}{notationName, base, systemId, publicId}
144Called for notation declarations.
145\end{methoddesc}
146
147\begin{methoddesc}{StartNamespaceDeclHandler}{prefix, uri}
148Called when an element contains a namespace declaration.
149\end{methoddesc}
150
151\begin{methoddesc}{EndNamespaceDeclHandler}{prefix}
152Called when the closing tag is reached for an element
153that contained a namespace declaration.
154\end{methoddesc}
155
156\begin{methoddesc}{CommentHandler}{data}
157Called for comments.
158\end{methoddesc}
159
160\begin{methoddesc}{StartCdataSectionHandler}{}
161Called at the start of a CDATA section.
162\end{methoddesc}
163
164\begin{methoddesc}{EndCdataSectionHandler}{}
165Called at the end of a CDATA section.
166\end{methoddesc}
167
168\begin{methoddesc}{DefaultHandler}{data}
169Called for any characters in the XML document for
170which no applicable handler has been specified. This means
171characters that are part of a construct which could be reported, but
172for which no handler has been supplied.
173\end{methoddesc}
174
175\begin{methoddesc}{DefaultHandlerExpand}{data}
176This is the same as \function{DefaultHandler()},
177but doesn't inhibit expansion of internal entities.
178The entity reference will not be passed to the default handler.
179\end{methoddesc}
180
181\begin{methoddesc}{NotStandaloneHandler}{}
182Called if the XML document hasn't been declared as being a standalone document.
183\end{methoddesc}
184
185\begin{methoddesc}{ExternalEntityRefHandler}{context, base, systemId, publicId}
186Called for references to external entities.
187\end{methoddesc}
188
189
190
191
192
193\subsection{\module{pyexpat.errors} --- Error constants}
194
195The following table lists the error constants in the
196\module{pyexpat.errors} submodule, available once the \module{pyexpat} module has been imported.
197
198\begin{tableii}{l|l}{code}{Constants}{}
199 \lineii {XML_ERROR_ASYNC_ENTITY}
200 {XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF}
201 \lineii {XML_ERROR_BAD_CHAR_REF}
202 {XML_ERROR_BINARY_ENTITY_REF}
203 \lineii {XML_ERROR_DUPLICATE_ATTRIBUTE}
204 {XML_ERROR_INCORRECT_ENCODING}
205 \lineii {XML_ERROR_INVALID_TOKEN}
206 {XML_ERROR_JUNK_AFTER_DOC_ELEMENT}
207 \lineii {XML_ERROR_MISPLACED_XML_PI}
208 {XML_ERROR_NO_ELEMENTS}
209 \lineii {XML_ERROR_NO_MEMORY}
210 {XML_ERROR_PARAM_ENTITY_REF}
211 \lineii {XML_ERROR_PARTIAL_CHAR}
212 {XML_ERROR_RECURSIVE_ENTITY_REF}
213 \lineii {XML_ERROR_SYNTAX}
214 {XML_ERROR_TAG_MISMATCH}
215 \lineii {XML_ERROR_UNCLOSED_TOKEN}
216 {XML_ERROR_UNDEFINED_ENTITY}
217 \lineii {XML_ERROR_UNKNOWN_ENCODING}{}
218\end{tableii}
219
220\subsection{Example}
221
222The following program defines three handlers that just print out their
223arguments.
224
225\begin{verbatim}
226
227import pyexpat
228
229# 3 handler functions
230def start_element(name, attrs):
231 print 'Start element:', name, attrs
232def end_element(name):
233 print 'End element:', name
234def char_data(data):
235 print 'Character data:', repr(data)
236
237p=pyexpat.ParserCreate()
238
239p.StartElementHandler = start_element
240p.EndElementHandler = end_element
241p.CharacterDataHandler= char_data
242
243p.Parse("""<?xml version="1.0"?>
244<parent id="top"><child1 name="paul">Text goes here</child1>
245<child2 name="fred">More text</child2>
246</parent>""")
247\end{verbatim}
248
249The output from this program is:
250
251\begin{verbatim}
252Start element: parent {'id': 'top'}
253Start element: child1 {'name': 'paul'}
254Character data: 'Text goes here'
255End element: child1
256Character data: '\012'
257Start element: child2 {'name': 'fred'}
258Character data: 'More text'
259End element: child2
260Character data: '\012'
261End element: parent
262\end{verbatim}