blob: 5e99f27f612095ec6678b737ea23860b975303af [file] [log] [blame]
Fred Drake3b755d42001-05-30 04:59:00 +00001\section{\module{HTMLParser} ---
2 Simple HTML and XHTML parser}
3
4\declaremodule{standard}{HTMLParser}
5\modulesynopsis{A simple parser that can handle HTML and XHTML.}
6
Fred Drake961c2882004-09-10 01:20:21 +00007\versionadded{2.2}
8
Fred Drake3b755d42001-05-30 04:59:00 +00009This module defines a class \class{HTMLParser} which serves as the
10basis for parsing text files formatted in HTML\index{HTML} (HyperText
Fred Drake25211f52001-07-05 16:34:36 +000011Mark-up Language) and XHTML.\index{XHTML} Unlike the parser in
12\refmodule{htmllib}, this parser is not based on the SGML parser in
13\refmodule{sgmllib}.
Fred Drake3b755d42001-05-30 04:59:00 +000014
15
16\begin{classdesc}{HTMLParser}{}
17The \class{HTMLParser} class is instantiated without arguments.
18
19An \class{HTMLParser} instance is fed HTML data and calls handler functions
20when tags begin and end. The \class{HTMLParser} class is meant to be
21overridden by the user to provide a desired behavior.
Fred Drake25211f52001-07-05 16:34:36 +000022
23Unlike the parser in \refmodule{htmllib}, this parser does not check
24that end tags match start tags or call the end-tag handler for
25elements which are closed implicitly by closing an outer element.
Fred Drake3b755d42001-05-30 04:59:00 +000026\end{classdesc}
27
Fred Drake961c2882004-09-10 01:20:21 +000028An exception is defined as well:
29
30\begin{excdesc}{HTMLParseError}
31Exception raised by the \class{HTMLParser} class when it encounters an
32error while parsing. This exception provides three attributes:
33\member{msg} is a brief message explaining the error, \member{lineno}
34is the number of the line on which the broken construct was detected,
35and \member{offset} is the number of characters into the line at which
36the construct starts.
37\end{excdesc}
38
Fred Drake3b755d42001-05-30 04:59:00 +000039
40\class{HTMLParser} instances have the following methods:
41
42\begin{methoddesc}{reset}{}
43Reset the instance. Loses all unprocessed data. This is called
44implicitly at instantiation time.
45\end{methoddesc}
46
47\begin{methoddesc}{feed}{data}
48Feed some text to the parser. It is processed insofar as it consists
49of complete elements; incomplete data is buffered until more data is
50fed or \method{close()} is called.
51\end{methoddesc}
52
53\begin{methoddesc}{close}{}
54Force processing of all buffered data as if it were followed by an
55end-of-file mark. This method may be redefined by a derived class to
56define additional processing at the end of the input, but the
57redefined version should always call the \class{HTMLParser} base class
58method \method{close()}.
59\end{methoddesc}
60
61\begin{methoddesc}{getpos}{}
62Return current line number and offset.
63\end{methoddesc}
64
65\begin{methoddesc}{get_starttag_text}{}
66Return the text of the most recently opened start tag. This should
67not normally be needed for structured processing, but may be useful in
68dealing with HTML ``as deployed'' or for re-generating input with
69minimal changes (whitespace between attributes can be preserved,
70etc.).
71\end{methoddesc}
72
73\begin{methoddesc}{handle_starttag}{tag, attrs}
74This method is called to handle the start of a tag. It is intended to
75be overridden by a derived class; the base class implementation does
76nothing.
77
Guido van Rossumd8faa362007-04-27 19:54:29 +000078The \var{tag} argument is the name of the tag converted to lower case.
79The \var{attrs} argument is a list of \code{(\var{name}, \var{value})}
80pairs containing the attributes found inside the tag's \code{<>}
81brackets. The \var{name} will have been translated to lower case,
82quotes in the \var{value} will have been removed, and character and entity
83references will have been replaced. For instance, for the tag \code{<A
84 HREF="http://www.cwi.nl/">}, this method would be called as
Fred Drake3b755d42001-05-30 04:59:00 +000085\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
Guido van Rossumd8faa362007-04-27 19:54:29 +000086
87\versionchanged[All entity references from htmlentitydefs are now
88replaced in the attribute values]{2.6}
89
Fred Drake3b755d42001-05-30 04:59:00 +000090\end{methoddesc}
91
92\begin{methoddesc}{handle_startendtag}{tag, attrs}
93Similar to \method{handle_starttag()}, but called when the parser
94encounters an XHTML-style empty tag (\code{<a .../>}). This method
95may be overridden by subclasses which require this particular lexical
96information; the default implementation simply calls
97\method{handle_starttag()} and \method{handle_endtag()}.
98\end{methoddesc}
99
100\begin{methoddesc}{handle_endtag}{tag}
101This method is called to handle the end tag of an element. It is
102intended to be overridden by a derived class; the base class
103implementation does nothing. The \var{tag} argument is the name of
104the tag converted to lower case.
105\end{methoddesc}
106
107\begin{methoddesc}{handle_data}{data}
108This method is called to process arbitrary data. It is intended to be
109overridden by a derived class; the base class implementation does
110nothing.
111\end{methoddesc}
112
113\begin{methoddesc}{handle_charref}{name} This method is called to
114process a character reference of the form \samp{\&\#\var{name};}. It
115is intended to be overridden by a derived class; the base class
116implementation does nothing.
117\end{methoddesc}
118
119\begin{methoddesc}{handle_entityref}{name}
120This method is called to process a general entity reference of the
121form \samp{\&\var{name};} where \var{name} is a general entity
122reference. It is intended to be overridden by a derived class; the
123base class implementation does nothing.
124\end{methoddesc}
125
126\begin{methoddesc}{handle_comment}{data}
127This method is called when a comment is encountered. The
128\var{data} argument is a string containing the text between the
Fred Drake4922cae2003-12-30 16:18:23 +0000129\samp{--} and \samp{--} delimiters, but not the delimiters
130themselves. For example, the comment \samp{<!--text-->} will
Raymond Hettinger29553052003-12-07 12:46:16 +0000131cause this method to be called with the argument \code{'text'}. It is
Fred Drake3b755d42001-05-30 04:59:00 +0000132intended to be overridden by a derived class; the base class
133implementation does nothing.
134\end{methoddesc}
135
136\begin{methoddesc}{handle_decl}{decl}
137Method called when an SGML declaration is read by the parser. The
138\var{decl} parameter will be the entire contents of the declaration
Thomas Wouters477c8d52006-05-27 19:21:47 +0000139inside the \code{<!}...\code{>} markup. It is intended to be overridden
Fred Drake3b755d42001-05-30 04:59:00 +0000140by a derived class; the base class implementation does nothing.
141\end{methoddesc}
142
Fred Drake30b6e822003-04-17 22:36:52 +0000143\begin{methoddesc}{handle_pi}{data}
144Method called when a processing instruction is encountered. The
145\var{data} parameter will contain the entire processing instruction.
146For example, for the processing instruction \code{<?proc color='red'>},
147this method would be called as \code{handle_pi("proc color='red'")}. It
148is intended to be overridden by a derived class; the base class
149implementation does nothing.
Fred Drake3b755d42001-05-30 04:59:00 +0000150
Fred Drake30b6e822003-04-17 22:36:52 +0000151\note{The \class{HTMLParser} class uses the SGML syntactic rules for
Fred Drake4922cae2003-12-30 16:18:23 +0000152processing instructions. An XHTML processing instruction using the
Fred Drake30b6e822003-04-17 22:36:52 +0000153trailing \character{?} will cause the \character{?} to be included in
154\var{data}.}
155\end{methoddesc}
156
157
158\subsection{Example HTML Parser Application \label{htmlparser-example}}
Fred Drake3b755d42001-05-30 04:59:00 +0000159
160As a basic example, below is a simple HTML parser that uses the
161\class{HTMLParser} class to print out tags as they are encountered:
162
163\begin{verbatim}
164from HTMLParser import HTMLParser
165
166class MyHTMLParser(HTMLParser):
167
168 def handle_starttag(self, tag, attrs):
169 print "Encountered the beginning of a %s tag" % tag
170
171 def handle_endtag(self, tag):
172 print "Encountered the end of a %s tag" % tag
173\end{verbatim}