\section{\module{HTMLParser} ---
         Simple HTML and XHTML parser}

\declaremodule{standard}{HTMLParser}
\modulesynopsis{A simple parser that can handle HTML and XHTML.}
6
This module defines a class \class{HTMLParser} which serves as the
basis for parsing text files formatted in HTML\index{HTML} (HyperText
Mark-up Language) and XHTML.\index{XHTML} Unlike the parser in
\refmodule{htmllib}, this parser is not based on the SGML parser in
\refmodule{sgmllib}.


\begin{classdesc}{HTMLParser}{}
The \class{HTMLParser} class is instantiated without arguments.

An \class{HTMLParser} instance is fed HTML data and calls handler
functions when tags begin and end. The \class{HTMLParser} class is
meant to be subclassed by the user, who overrides the handler methods
to provide the desired behavior.

Unlike the parser in \refmodule{htmllib}, this parser does not check
that end tags match start tags or call the end-tag handler for
elements which are closed implicitly by closing an outer element.
\end{classdesc}
25
26
\class{HTMLParser} instances have the following methods:
28
\begin{methoddesc}{reset}{}
Reset the instance. Loses all unprocessed data. This is called
implicitly at instantiation time.
\end{methoddesc}
33
\begin{methoddesc}{feed}{data}
Feed some text to the parser. It is processed insofar as it consists
of complete elements; incomplete data is buffered until more data is
fed or \method{close()} is called.
\end{methoddesc}
39
\begin{methoddesc}{close}{}
Force processing of all buffered data as if it were followed by an
end-of-file mark. This method may be redefined by a derived class to
define additional processing at the end of the input, but the
redefined version should always call the \class{HTMLParser} base class
method \method{close()}.
\end{methoddesc}
47
\begin{methoddesc}{getpos}{}
Return the current line number and offset.
\end{methoddesc}
51
\begin{methoddesc}{get_starttag_text}{}
Return the text of the most recently opened start tag. This should
not normally be needed for structured processing, but may be useful in
dealing with HTML ``as deployed'' or for re-generating input with
minimal changes (whitespace between attributes can be preserved,
etc.).
\end{methoddesc}
59
\begin{methoddesc}{handle_starttag}{tag, attrs}
This method is called to handle the start of a tag. It is intended to
be overridden by a derived class; the base class implementation does
nothing.

The \var{tag} argument is the name of the tag converted to
lower case. The \var{attrs} argument is a list of \code{(\var{name},
\var{value})} pairs containing the attributes found inside the tag's
\code{<>} brackets. The \var{name} will be translated to lower case,
and double quotes and backslashes in the \var{value} will have been
interpreted. For instance, for the tag \code{<A
HREF="http://www.cwi.nl/">}, this method would be called as
\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
\end{methoddesc}
74
\begin{methoddesc}{handle_startendtag}{tag, attrs}
Similar to \method{handle_starttag()}, but called when the parser
encounters an XHTML-style empty tag (\code{<a .../>}). This method
may be overridden by subclasses which require this particular lexical
information; the default implementation simply calls
\method{handle_starttag()} and \method{handle_endtag()}.
\end{methoddesc}
82
\begin{methoddesc}{handle_endtag}{tag}
This method is called to handle the end tag of an element. It is
intended to be overridden by a derived class; the base class
implementation does nothing. The \var{tag} argument is the name of
the tag converted to lower case.
\end{methoddesc}
89
\begin{methoddesc}{handle_data}{data}
This method is called to process arbitrary data. It is intended to be
overridden by a derived class; the base class implementation does
nothing.
\end{methoddesc}
95
\begin{methoddesc}{handle_charref}{name}
This method is called to process a character reference of the form
\samp{\&\#\var{name};}. It is intended to be overridden by a derived
class; the base class implementation does nothing.
\end{methoddesc}
101
\begin{methoddesc}{handle_entityref}{name}
This method is called to process a general entity reference of the
form \samp{\&\var{name};} where \var{name} is the name of a general
entity. It is intended to be overridden by a derived class; the
base class implementation does nothing.
\end{methoddesc}
108
\begin{methoddesc}{handle_comment}{data}
This method is called when a comment is encountered. The
\var{data} argument is a string containing the text between the
\samp{<!--} and \samp{-->} delimiters, but not the delimiters
themselves. For example, the comment \samp{<!--text-->} will cause
this method to be called with the argument \code{'text'}. It is
intended to be overridden by a derived class; the base class
implementation does nothing.
\end{methoddesc}
118
\begin{methoddesc}{handle_decl}{decl}
Method called when an SGML declaration is read by the parser. The
\var{decl} parameter will be the entire contents of the declaration
inside the \code{<!}\ldots\code{>} markup. It is intended to be
overridden by a derived class; the base class implementation does
nothing.
\end{methoddesc}
125
126
\subsection{Example HTML Parser \label{htmlparser-example}}

As a basic example, below is a simple HTML parser that uses the
\class{HTMLParser} class to print out tags as they are encountered:
131
\begin{verbatim}
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):

    def handle_starttag(self, tag, attrs):
        print "Encountered the beginning of a %s tag" % tag

    def handle_endtag(self, tag):
        print "Encountered the end of a %s tag" % tag
\end{verbatim}