blob: e8b4dd92c0e1bf6c93438ca60a0228253cc84433 [file] [log] [blame]
Fred Drake3b755d42001-05-30 04:59:00 +00001\section{\module{HTMLParser} ---
2 Simple HTML and XHTML parser}
3
4\declaremodule{standard}{HTMLParser}
5\modulesynopsis{A simple parser that can handle HTML and XHTML.}
6
7This module defines a class \class{HTMLParser} which serves as the
8basis for parsing text files formatted in HTML\index{HTML} (HyperText
9Mark-up Language) and XHTML.\index{XHTML}
10
11
12\begin{classdesc}{HTMLParser}{}
13The \class{HTMLParser} class is instantiated without arguments.
14
15An \class{HTMLParser} instance is fed HTML data and calls handler functions
16when tags begin and end. The \class{HTMLParser} class is meant to be
17overridden by the user to provide a desired behavior.
18\end{classdesc}
19
20
21\class{HTMLParser} instances have the following methods:
22
23\begin{methoddesc}{reset}{}
24Reset the instance. Loses all unprocessed data. This is called
25implicitly at instantiation time.
26\end{methoddesc}
27
28\begin{methoddesc}{feed}{data}
29Feed some text to the parser. It is processed insofar as it consists
30of complete elements; incomplete data is buffered until more data is
31fed or \method{close()} is called.
32\end{methoddesc}
33
34\begin{methoddesc}{close}{}
35Force processing of all buffered data as if it were followed by an
36end-of-file mark. This method may be redefined by a derived class to
37define additional processing at the end of the input, but the
38redefined version should always call the \class{HTMLParser} base class
39method \method{close()}.
40\end{methoddesc}
41
42\begin{methoddesc}{getpos}{}
43Return current line number and offset.
44\end{methoddesc}
45
46\begin{methoddesc}{get_starttag_text}{}
47Return the text of the most recently opened start tag. This should
48not normally be needed for structured processing, but may be useful in
49dealing with HTML ``as deployed'' or for re-generating input with
50minimal changes (whitespace between attributes can be preserved,
51etc.).
52\end{methoddesc}
53
54\begin{methoddesc}{handle_starttag}{tag, attrs}
55This method is called to handle the start of a tag. It is intended to
56be overridden by a derived class; the base class implementation does
57nothing.
58
59The \var{tag} argument is the name of the tag converted to
60lower case. The \var{attrs} argument is a list of \code{(\var{name},
61\var{value})} pairs containing the attributes found inside the tag's
62\code{<>} brackets. The \var{name} will be translated to lower case,
63and double quotes and backslashes in the \var{value} will have been
64interpreted. For instance, for the tag \code{<A
65HREF="http://www.cwi.nl/">}, this method would be called as
66\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
67\end{methoddesc}
68
69\begin{methoddesc}{handle_startendtag}{tag, attrs}
70Similar to \method{handle_starttag()}, but called when the parser
71encounters an XHTML-style empty tag (\code{<a .../>}). This method
72may be overridden by subclasses which require this particular lexical
73information; the default implementation simply calls
74\method{handle_starttag()} and \method{handle_endtag()}.
75\end{methoddesc}
76
77\begin{methoddesc}{handle_endtag}{tag}
78This method is called to handle the end tag of an element. It is
79intended to be overridden by a derived class; the base class
80implementation does nothing. The \var{tag} argument is the name of
81the tag converted to lower case.
82\end{methoddesc}
83
84\begin{methoddesc}{handle_data}{data}
85This method is called to process arbitrary data. It is intended to be
86overridden by a derived class; the base class implementation does
87nothing.
88\end{methoddesc}
89
90\begin{methoddesc}{handle_charref}{name} This method is called to
91process a character reference of the form \samp{\&\#\var{name};}. It
92is intended to be overridden by a derived class; the base class
93implementation does nothing.
94\end{methoddesc}
95
96\begin{methoddesc}{handle_entityref}{name}
97This method is called to process a general entity reference of the
98form \samp{\&\var{name};} where \var{name} is a general entity
99reference. It is intended to be overridden by a derived class; the
100base class implementation does nothing.
101\end{methoddesc}
102
103\begin{methoddesc}{handle_comment}{data}
104This method is called when a comment is encountered. The
105\var{data} argument is a string containing the text between the
106\samp{<!--} and \samp{-->} delimiters, but not the delimiters
107themselves. For example, the comment \samp{<!--text-->} will cause
108this method to be called with the argument \code{'text'}. It is
109intended to be overridden by a derived class; the base class
110implementation does nothing.
111\end{methoddesc}
112
113\begin{methoddesc}{handle_decl}{decl}
114Method called when an SGML declaration is read by the parser. The
115\var{decl} parameter will be the entire contents of the declaration
116inside the \code{<!}...\code{>} markup. It is intended to be overridden
117by a derived class; the base class implementation does nothing.
118\end{methoddesc}
119
120
121\subsection{Example HTML Parser \label{htmlparser-example}}
122
123As a basic example, below is a simple HTML parser that uses the
124\class{HTMLParser} class to print out tags as they are encountered:
125
126\begin{verbatim}
127from HTMLParser import HTMLParser
128
129class MyHTMLParser(HTMLParser):
130
131    def handle_starttag(self, tag, attrs):
132        print "Encountered the beginning of a %s tag" % tag
133
134    def handle_endtag(self, tag):
135        print "Encountered the end of a %s tag" % tag
136\end{verbatim}