blob: 8ff74e47ef469b658f0501760ce281d8ffe4aa87 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000043#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000044
45#define HTML_MAX_NAMELEN 1000
46#define HTML_PARSER_BIG_BUFFER_SIZE 1000
47#define HTML_PARSER_BUFFER_SIZE 100
48
49/* #define DEBUG */
50/* #define DEBUG_PUSH */
51
Daniel Veillard22090732001-07-16 00:06:07 +000052static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000053
Daniel Veillard56a4cb82001-03-24 17:00:36 +000054xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
55 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000056static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000057
58/************************************************************************
59 * *
Owen Taylor3473f882001-02-23 17:55:21 +000060 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
68#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
70 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
72 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
73 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
75 xmlGenericError(xmlGenericErrorContext, \
76 "realloc failed !\n"); \
77 return(0); \
78 } \
79 } \
80 ctxt->name##Tab[ctxt->name##Nr] = value; \
81 ctxt->name = value; \
82 return(ctxt->name##Nr++); \
83} \
84scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
85 type ret; \
86 if (ctxt->name##Nr < 0) return(0); \
87 ctxt->name##Nr--; \
88 if (ctxt->name##Nr < 0) return(0); \
89 if (ctxt->name##Nr > 0) \
90 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
91 else \
92 ctxt->name = NULL; \
93 ret = ctxt->name##Tab[ctxt->name##Nr]; \
94 ctxt->name##Tab[ctxt->name##Nr] = 0; \
95 return(ret); \
96} \
97
Daniel Veillard56a4cb82001-03-24 17:00:36 +000098/* PUSH_AND_POP(static, xmlNodePtr, node) */
99PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000100
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a variable named `ctxt' (the parser
 * context) to be in scope at the point of use.
 *
 * Byte-level macros, to be used only when comparing against or skipping
 * over ASCII characters:
 *
 *   CUR_PTR    the current input pointer (ctxt->input->cur).
 *   CUR        the byte at the current input pointer, as an int.
 *   CURRENT    same as CUR.
 *   RAW        -1 when ctxt->token is set, otherwise the current byte.
 *   UPPER      the current byte passed through toupper().
 *   NXT(n)     the byte n positions after the current one.
 *   UPP(n)     NXT(n) passed through toupper().
 *   SKIP(n)    advance the input pointer and ctxt->nbChars by n.
 *
 * Input buffer management:
 *
 *   SHRINK     calls xmlParserInputShrink() on the current input.
 *   GROW       calls xmlParserInputGrow() with INPUT_CHUNK.
 *
 * Character-level macros:
 *
 *   NEXT         calls xmlNextChar() and counts one character.
 *   NEXTL(l)     updates the line/column counters from the current byte,
 *                clears ctxt->token, advances the input pointer by l bytes
 *                and counts one character.
 *   SKIP_BLANKS  calls htmlSkipBlankChars().
 *
 * Every expansion and every macro parameter is parenthesized so the macros
 * behave as a single expression whatever the argument or the surrounding
 * operators are.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)
164
165/************
166 \
167 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
168 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
169 ************/
170
/*
 * CUR_CHAR(l)      decodes the character at the current input position
 *                  through htmlCurrentChar(); its length is stored in l.
 * CUR_SCHAR(s, l)  decodes the character at string position s through
 *                  xmlStringCurrentChar(); its length is stored in l.
 * COPY_BUF(l,b,i,v) appends the character v, whose encoded length is l,
 *                  to buffer b at index i and advances i: a plain byte
 *                  store when l is 1, xmlCopyChar() otherwise.
 *
 * Arguments are parenthesized so that expressions can be passed safely,
 * and COPY_BUF is wrapped in do { } while (0) so that it is a single
 * statement which cannot capture a following `else'.  Note that COPY_BUF
 * still evaluates l and i more than once.
 */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
177
178/**
179 * htmlCurrentChar:
180 * @ctxt: the HTML parser context
181 * @len: pointer to the length of the char read
182 *
183 * The current char value, if using UTF-8 this may actaully span multiple
184 * bytes in the input buffer. Implement the end of line normalization:
185 * 2.11 End-of-Line Handling
186 * If the encoding is unspecified, in the case we find an ISO-Latin-1
187 * char, then the encoding converter is plugged in automatically.
188 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000189 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000190 */
191
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000192static int
Owen Taylor3473f882001-02-23 17:55:21 +0000193htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
194 if (ctxt->instate == XML_PARSER_EOF)
195 return(0);
196
197 if (ctxt->token != 0) {
198 *len = 0;
199 return(ctxt->token);
200 }
201 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
202 /*
203 * We are supposed to handle UTF8, check it's valid
204 * From rfc2044: encoding of the Unicode values on UTF-8:
205 *
206 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
207 * 0000 0000-0000 007F 0xxxxxxx
208 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
209 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
210 *
211 * Check for the 0x110000 limit too
212 */
213 const unsigned char *cur = ctxt->input->cur;
214 unsigned char c;
215 unsigned int val;
216
217 c = *cur;
218 if (c & 0x80) {
219 if (cur[1] == 0)
220 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
221 if ((cur[1] & 0xc0) != 0x80)
222 goto encoding_error;
223 if ((c & 0xe0) == 0xe0) {
224
225 if (cur[2] == 0)
226 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
227 if ((cur[2] & 0xc0) != 0x80)
228 goto encoding_error;
229 if ((c & 0xf0) == 0xf0) {
230 if (cur[3] == 0)
231 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
232 if (((c & 0xf8) != 0xf0) ||
233 ((cur[3] & 0xc0) != 0x80))
234 goto encoding_error;
235 /* 4-byte code */
236 *len = 4;
237 val = (cur[0] & 0x7) << 18;
238 val |= (cur[1] & 0x3f) << 12;
239 val |= (cur[2] & 0x3f) << 6;
240 val |= cur[3] & 0x3f;
241 } else {
242 /* 3-byte code */
243 *len = 3;
244 val = (cur[0] & 0xf) << 12;
245 val |= (cur[1] & 0x3f) << 6;
246 val |= cur[2] & 0x3f;
247 }
248 } else {
249 /* 2-byte code */
250 *len = 2;
251 val = (cur[0] & 0x1f) << 6;
252 val |= cur[1] & 0x3f;
253 }
254 if (!IS_CHAR(val)) {
255 ctxt->errNo = XML_ERR_INVALID_ENCODING;
256 if ((ctxt->sax != NULL) &&
257 (ctxt->sax->error != NULL))
258 ctxt->sax->error(ctxt->userData,
259 "Char 0x%X out of allowed range\n", val);
260 ctxt->wellFormed = 0;
261 ctxt->disableSAX = 1;
262 }
263 return(val);
264 } else {
265 /* 1-byte code */
266 *len = 1;
267 return((int) *ctxt->input->cur);
268 }
269 }
270 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000271 * Assume it's a fixed length encoding (1) with
Owen Taylor3473f882001-02-23 17:55:21 +0000272 * a compatibke encoding for the ASCII set, since
273 * XML constructs only use < 128 chars
274 */
275 *len = 1;
276 if ((int) *ctxt->input->cur < 0x80)
277 return((int) *ctxt->input->cur);
278
279 /*
280 * Humm this is bad, do an automatic flow conversion
281 */
282 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
283 ctxt->charset = XML_CHAR_ENCODING_UTF8;
284 return(xmlCurrentChar(ctxt, len));
285
286encoding_error:
287 /*
288 * If we detect an UTF8 error that probably mean that the
289 * input encoding didn't get properly advertized in the
290 * declaration header. Report the error and switch the encoding
291 * to ISO-Latin-1 (if you don't like this policy, just declare the
292 * encoding !)
293 */
294 ctxt->errNo = XML_ERR_INVALID_ENCODING;
295 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
296 ctxt->sax->error(ctxt->userData,
297 "Input is not proper UTF-8, indicate encoding !\n");
298 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
299 ctxt->input->cur[0], ctxt->input->cur[1],
300 ctxt->input->cur[2], ctxt->input->cur[3]);
301 }
302
303 ctxt->charset = XML_CHAR_ENCODING_8859_1;
304 *len = 1;
305 return((int) *ctxt->input->cur);
306}
307
308/**
Owen Taylor3473f882001-02-23 17:55:21 +0000309 * htmlSkipBlankChars:
310 * @ctxt: the HTML parser context
311 *
312 * skip all blanks character found at that point in the input streams.
313 *
314 * Returns the number of space chars skipped
315 */
316
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000317static int
Owen Taylor3473f882001-02-23 17:55:21 +0000318htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
319 int res = 0;
320
321 while (IS_BLANK(*(ctxt->input->cur))) {
322 if ((*ctxt->input->cur == 0) &&
323 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
324 xmlPopInput(ctxt);
325 } else {
326 if (*(ctxt->input->cur) == '\n') {
327 ctxt->input->line++; ctxt->input->col = 1;
328 } else ctxt->input->col++;
329 ctxt->input->cur++;
330 ctxt->nbChars++;
331 if (*ctxt->input->cur == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 }
334 res++;
335 }
336 return(res);
337}
338
339
340
341/************************************************************************
342 * *
343 * The list of HTML elements and their properties *
344 * *
345 ************************************************************************/
346
347/*
348 * Start Tag: 1 means the start tag can be ommited
349 * End Tag: 1 means the end tag can be ommited
350 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000351 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000352 * Depr: this element is deprecated
353 * DTD: 1 means that this element is valid only in the Loose DTD
354 * 2 means that this element is valid only in the Frameset DTD
355 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000356 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000357 */
Daniel Veillard22090732001-07-16 00:06:07 +0000358static const htmlElemDesc
359html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000360{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
361{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
362{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
363{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
364{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
365{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
366{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
367{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
368{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
369{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
370{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
371{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
372{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
373{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
374{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
375{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
376{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
377{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
378{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
379{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
380{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
381{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
382{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
383{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
384{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
385{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
386{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
387{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
388{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
389{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
390{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
391{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
392{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
393{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
394{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
400{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
401{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
402{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
403{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
404{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
405{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
406{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
407{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
408{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
409{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
410{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
411{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
412{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
413{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
414{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
415{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
416{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
417{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
418{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
419{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
420{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
421{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
422{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
423{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
424{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
425{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
426{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
427{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
428{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
429{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
430{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
431{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
432{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
433{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
434{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
435{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
436{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
437{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
438{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
439{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
440{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
441{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
442{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
443{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
444{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
445{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
446{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
447{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
448{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
449{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
450{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000451};
452
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups followed by a final
 * NULL.  In each group the first name is the start tag being opened and
 * the remaining names are the open elements which that start tag closes.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL
};
512
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
526
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The form reset event attribute is spelled "onreset"; it used to be
 * listed here as "onrest", which can never match a real attribute name.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
552
/*
 * End tag priorities, used by the HTML parser to decide what to do with
 * broken pages containing extra end tags: an end tag is only allowed to
 * close elements whose priority is lower than or equal to its own.
 *
 * The table is terminated by an entry whose name is NULL; the priority
 * stored in that entry is the default for every element not listed.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000580
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * name of one group of that array, unused slots are NULL.  It is filled
 * by htmlInitAutoClose(), which also raises the flag below.
 */
static const char **htmlStartCloseIndex[100];

/* non-zero once htmlStartCloseIndex has been filled */
static int htmlStartCloseIndexinitialized = 0;
583
584/************************************************************************
585 * *
586 * functions to handle HTML specific data *
587 * *
588 ************************************************************************/
589
590/**
591 * htmlInitAutoClose:
592 *
593 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
594 * This is not reentrant. Call xmlInitParser() once before processing in
595 * case of use in multithreaded programs.
596 */
597void
598htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000599 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000600
601 if (htmlStartCloseIndexinitialized) return;
602
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000603 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
604 indx = 0;
605 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
606 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000607 while (htmlStartClose[i] != NULL) i++;
608 i++;
609 }
610 htmlStartCloseIndexinitialized = 1;
611}
612
613/**
614 * htmlTagLookup:
615 * @tag: The tag name in lowercase
616 *
617 * Lookup the HTML tag in the ElementTable
618 *
619 * Returns the related htmlElemDescPtr or NULL if not found.
620 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000621const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000622htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000623 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000624
625 for (i = 0; i < (sizeof(html40ElementTable) /
626 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000627 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000628 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000629 }
630 return(NULL);
631}
632
633/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000634 * htmlGetEndPriority:
635 * @name: The name of the element to look up the priority for.
636 *
637 * Return value: The "endtag" priority.
638 **/
639static int
640htmlGetEndPriority (const xmlChar *name) {
641 int i = 0;
642
643 while ((htmlEndPriority[i].name != NULL) &&
644 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
645 i++;
646
647 return(htmlEndPriority[i].priority);
648}
649
650/**
Owen Taylor3473f882001-02-23 17:55:21 +0000651 * htmlCheckAutoClose:
652 * @newtag: The new tag name
653 * @oldtag: The old tag name
654 *
655 * Checks wether the new tag is one of the registered valid tags for closing old.
656 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
657 *
658 * Returns 0 if no, 1 if yes.
659 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000660static int
Owen Taylor3473f882001-02-23 17:55:21 +0000661htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000662 int i, indx;
663 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000664
665 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
666
667 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000668 for (indx = 0; indx < 100;indx++) {
669 closed = htmlStartCloseIndex[indx];
670 if (closed == NULL) return(0);
671 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000672 }
673
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000674 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000675 i++;
676 while (htmlStartClose[i] != NULL) {
677 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
678 return(1);
679 }
680 i++;
681 }
682 return(0);
683}
684
685/**
686 * htmlAutoCloseOnClose:
687 * @ctxt: an HTML parser context
688 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000689 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000690 *
691 * The HTmL DtD allows an ending tag to implicitely close other tags.
692 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000693static void
Owen Taylor3473f882001-02-23 17:55:21 +0000694htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000695 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000696 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000697 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000698
699#ifdef DEBUG
700 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
701 for (i = 0;i < ctxt->nameNr;i++)
702 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
703#endif
704
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000705 priority = htmlGetEndPriority (newtag);
706
Owen Taylor3473f882001-02-23 17:55:21 +0000707 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000708
Owen Taylor3473f882001-02-23 17:55:21 +0000709 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000710 /*
711 * A missplaced endtagad can only close elements with lower
712 * or equal priority, so if we find an element with higher
713 * priority before we find an element with
714 * matching name, we just ignore this endtag
715 */
716 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000717 }
718 if (i < 0) return;
719
720 while (!xmlStrEqual(newtag, ctxt->name)) {
721 info = htmlTagLookup(ctxt->name);
722 if ((info == NULL) || (info->endTag == 1)) {
723#ifdef DEBUG
724 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
725#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000726 } else if (info->endTag == 3) {
727#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000728 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000729
Daniel Veillard56098d42001-04-24 12:51:09 +0000730#endif
731 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
732 ctxt->sax->error(ctxt->userData,
733 "Opening and ending tag mismatch: %s and %s\n",
734 newtag, ctxt->name);
735 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000736 }
737 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
738 ctxt->sax->endElement(ctxt->userData, ctxt->name);
739 oldname = htmlnamePop(ctxt);
740 if (oldname != NULL) {
741#ifdef DEBUG
742 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
743#endif
744 xmlFree(oldname);
745 }
746 }
747}
748
749/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000750 * htmlAutoCloseOnEnd:
751 * @ctxt: an HTML parser context
752 *
753 * Close all remaining tags at the end of the stream
754 */
755static void
756htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
757 xmlChar *oldname;
758 int i;
759
760 if (ctxt->nameNr == 0)
761 return;
762#ifdef DEBUG
763 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
764#endif
765
766 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
767#ifdef DEBUG
768 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
769#endif
770 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
771 ctxt->sax->endElement(ctxt->userData, ctxt->name);
772 oldname = htmlnamePop(ctxt);
773 if (oldname != NULL) {
774#ifdef DEBUG
775 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
776#endif
777 xmlFree(oldname);
778 }
779 }
780}
781
782/**
Owen Taylor3473f882001-02-23 17:55:21 +0000783 * htmlAutoClose:
784 * @ctxt: an HTML parser context
785 * @newtag: The new tag name or NULL
786 *
787 * The HTmL DtD allows a tag to implicitely close other tags.
788 * The list is kept in htmlStartClose array. This function is
789 * called when a new tag has been detected and generates the
790 * appropriates closes if possible/needed.
791 * If newtag is NULL this mean we are at the end of the resource
792 * and we should check
793 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000794static void
Owen Taylor3473f882001-02-23 17:55:21 +0000795htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
796 xmlChar *oldname;
797 while ((newtag != NULL) && (ctxt->name != NULL) &&
798 (htmlCheckAutoClose(newtag, ctxt->name))) {
799#ifdef DEBUG
800 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
801#endif
802 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
803 ctxt->sax->endElement(ctxt->userData, ctxt->name);
804 oldname = htmlnamePop(ctxt);
805 if (oldname != NULL) {
806#ifdef DEBUG
807 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
808#endif
809 xmlFree(oldname);
810 }
811 }
812 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000813 htmlAutoCloseOnEnd(ctxt);
814 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000815 }
816 while ((newtag == NULL) && (ctxt->name != NULL) &&
817 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
818 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
819 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
820#ifdef DEBUG
821 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
822#endif
823 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
824 ctxt->sax->endElement(ctxt->userData, ctxt->name);
825 oldname = htmlnamePop(ctxt);
826 if (oldname != NULL) {
827#ifdef DEBUG
828 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
829#endif
830 xmlFree(oldname);
831 }
832 }
833
834}
835
836/**
837 * htmlAutoCloseTag:
838 * @doc: the HTML document
839 * @name: The tag name
840 * @elem: the HTML element
841 *
842 * The HTmL DtD allows a tag to implicitely close other tags.
843 * The list is kept in htmlStartClose array. This function checks
844 * if the element or one of it's children would autoclose the
845 * given tag.
846 *
847 * Returns 1 if autoclose, 0 otherwise
848 */
849int
850htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
851 htmlNodePtr child;
852
853 if (elem == NULL) return(1);
854 if (xmlStrEqual(name, elem->name)) return(0);
855 if (htmlCheckAutoClose(elem->name, name)) return(1);
856 child = elem->children;
857 while (child != NULL) {
858 if (htmlAutoCloseTag(doc, name, child)) return(1);
859 child = child->next;
860 }
861 return(0);
862}
863
864/**
865 * htmlIsAutoClosed:
866 * @doc: the HTML document
867 * @elem: the HTML element
868 *
869 * The HTmL DtD allows a tag to implicitely close other tags.
870 * The list is kept in htmlStartClose array. This function checks
871 * if a tag is autoclosed by one of it's child
872 *
873 * Returns 1 if autoclosed, 0 otherwise
874 */
875int
876htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
877 htmlNodePtr child;
878
879 if (elem == NULL) return(1);
880 child = elem->children;
881 while (child != NULL) {
882 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
883 child = child->next;
884 }
885 return(0);
886}
887
888/**
889 * htmlCheckImplied:
890 * @ctxt: an HTML parser context
891 * @newtag: The new tag name
892 *
893 * The HTML DtD allows a tag to exists only implicitely
894 * called when a new tag has been detected and generates the
895 * appropriates implicit tags if missing
896 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000897static void
Owen Taylor3473f882001-02-23 17:55:21 +0000898htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
899 if (!htmlOmittedDefaultValue)
900 return;
901 if (xmlStrEqual(newtag, BAD_CAST"html"))
902 return;
903 if (ctxt->nameNr <= 0) {
904#ifdef DEBUG
905 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
906#endif
907 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
908 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
909 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
910 }
911 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
912 return;
913 if ((ctxt->nameNr <= 1) &&
914 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
915 (xmlStrEqual(newtag, BAD_CAST"style")) ||
916 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
917 (xmlStrEqual(newtag, BAD_CAST"link")) ||
918 (xmlStrEqual(newtag, BAD_CAST"title")) ||
919 (xmlStrEqual(newtag, BAD_CAST"base")))) {
920 /*
921 * dropped OBJECT ... i you put it first BODY will be
922 * assumed !
923 */
924#ifdef DEBUG
925 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
926#endif
927 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
928 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
929 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
930 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
931 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
932 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
933 int i;
934 for (i = 0;i < ctxt->nameNr;i++) {
935 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
936 return;
937 }
938 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
939 return;
940 }
941 }
942
943#ifdef DEBUG
944 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
945#endif
946 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
947 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
948 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
949 }
950}
951
952/**
953 * htmlCheckParagraph
954 * @ctxt: an HTML parser context
955 *
956 * Check whether a p element need to be implied before inserting
957 * characters in the current element.
958 *
959 * Returns 1 if a paragraph has been inserted, 0 if not and -1
960 * in case of error.
961 */
962
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000963static int
Owen Taylor3473f882001-02-23 17:55:21 +0000964htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
965 const xmlChar *tag;
966 int i;
967
968 if (ctxt == NULL)
969 return(-1);
970 tag = ctxt->name;
971 if (tag == NULL) {
972 htmlAutoClose(ctxt, BAD_CAST"p");
973 htmlCheckImplied(ctxt, BAD_CAST"p");
974 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
975 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
976 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
977 return(1);
978 }
979 if (!htmlOmittedDefaultValue)
980 return(0);
981 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
982 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
983#ifdef DEBUG
984 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
985#endif
986 htmlAutoClose(ctxt, BAD_CAST"p");
987 htmlCheckImplied(ctxt, BAD_CAST"p");
988 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
989 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
990 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
991 return(1);
992 }
993 }
994 return(0);
995}
996
997/**
998 * htmlIsScriptAttribute:
999 * @name: an attribute name
1000 *
1001 * Check if an attribute is of content type Script
1002 *
1003 * Returns 1 is the attribute is a script 0 otherwise
1004 */
1005int
1006htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001007 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001008
1009 if (name == NULL)
1010 return(0);
1011 /*
1012 * all script attributes start with 'on'
1013 */
1014 if ((name[0] != 'o') || (name[1] != 'n'))
1015 return(0);
1016 for (i = 0;
1017 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1018 i++) {
1019 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1020 return(1);
1021 }
1022 return(0);
1023}
1024
1025/************************************************************************
1026 * *
1027 * The list of HTML predefined entities *
1028 * *
1029 ************************************************************************/
1030
1031
Daniel Veillard22090732001-07-16 00:06:07 +00001032static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001033/*
1034 * the 4 absolute ones, plus apostrophe.
1035 */
1036{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1037{ 38, "amp", "ampersand, U+0026 ISOnum" },
1038{ 39, "apos", "single quote" },
1039{ 60, "lt", "less-than sign, U+003C ISOnum" },
1040{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1041
1042/*
1043 * A bunch still in the 128-255 range
1044 * Replacing them depend really on the charset used.
1045 */
1046{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1047{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1048{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1049{ 163, "pound","pound sign, U+00A3 ISOnum" },
1050{ 164, "curren","currency sign, U+00A4 ISOnum" },
1051{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1052{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1053{ 167, "sect", "section sign, U+00A7 ISOnum" },
1054{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1055{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1056{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1057{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1058{ 172, "not", "not sign, U+00AC ISOnum" },
1059{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1060{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1061{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1062{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1063{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1064{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1065{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1066{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1067{ 181, "micro","micro sign, U+00B5 ISOnum" },
1068{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1069{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1070{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1071{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1072{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1073{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1074{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1075{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1076{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1077{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1078{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1079{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1080{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1081{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1082{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1083{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1084{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1085{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1086{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1087{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1088{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1089{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1090{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1091{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1092{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1093{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1094{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1095{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1096{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1097{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1098{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1099{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1100{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1101{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1102{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1103{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1104{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1105{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1106{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1107{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1108{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1109{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1110{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1111{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1112{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1113{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1114{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1115{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1116{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1117{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1118{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1119{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1120{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1121{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1122{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1123{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1124{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1125{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1126{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1127{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1128{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1129{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1130{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1131{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1132{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1133{ 247, "divide","division sign, U+00F7 ISOnum" },
1134{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1135{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1136{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1137{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1138{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1139{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1140{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1141{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1142
1143{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1144{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1145{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1146{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1147{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1148
1149/*
1150 * Anything below should really be kept as entities references
1151 */
1152{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1153
1154{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1155{ 732, "tilde","small tilde, U+02DC ISOdia" },
1156
1157{ 913, "Alpha","greek capital letter alpha, U+0391" },
1158{ 914, "Beta", "greek capital letter beta, U+0392" },
1159{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1160{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1161{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1162{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1163{ 919, "Eta", "greek capital letter eta, U+0397" },
1164{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1165{ 921, "Iota", "greek capital letter iota, U+0399" },
1166{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001167{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001168{ 924, "Mu", "greek capital letter mu, U+039C" },
1169{ 925, "Nu", "greek capital letter nu, U+039D" },
1170{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1171{ 927, "Omicron","greek capital letter omicron, U+039F" },
1172{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1173{ 929, "Rho", "greek capital letter rho, U+03A1" },
1174{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1175{ 932, "Tau", "greek capital letter tau, U+03A4" },
1176{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1177{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1178{ 935, "Chi", "greek capital letter chi, U+03A7" },
1179{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1180{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1181
1182{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1183{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1184{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1185{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1186{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1187{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1188{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1189{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1190{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1191{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1192{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1193{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1194{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1195{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1196{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1197{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1198{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1199{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1200{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1201{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1202{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1203{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1204{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1205{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1206{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1207{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1208{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1209{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1210
1211{ 8194, "ensp", "en space, U+2002 ISOpub" },
1212{ 8195, "emsp", "em space, U+2003 ISOpub" },
1213{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1214{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1215{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1216{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1217{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1218{ 8211, "ndash","en dash, U+2013 ISOpub" },
1219{ 8212, "mdash","em dash, U+2014 ISOpub" },
1220{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1221{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1222{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1223{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1224{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1225{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1226{ 8224, "dagger","dagger, U+2020 ISOpub" },
1227{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1228
1229{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1230{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1231
1232{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1233
1234{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1235{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1236
1237{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1238{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1239
1240{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1241{ 8260, "frasl","fraction slash, U+2044 NEW" },
1242
1243{ 8364, "euro", "euro sign, U+20AC NEW" },
1244
1245{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1246{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1247{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1248{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1249{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1250{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1251{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1252{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1253{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1254{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1255{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1256{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1257{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1258{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1259{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1260{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1261
1262{ 8704, "forall","for all, U+2200 ISOtech" },
1263{ 8706, "part", "partial differential, U+2202 ISOtech" },
1264{ 8707, "exist","there exists, U+2203 ISOtech" },
1265{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1266{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1267{ 8712, "isin", "element of, U+2208 ISOtech" },
1268{ 8713, "notin","not an element of, U+2209 ISOtech" },
1269{ 8715, "ni", "contains as member, U+220B ISOtech" },
1270{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1271{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1272{ 8722, "minus","minus sign, U+2212 ISOtech" },
1273{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1274{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1275{ 8733, "prop", "proportional to, U+221D ISOtech" },
1276{ 8734, "infin","infinity, U+221E ISOtech" },
1277{ 8736, "ang", "angle, U+2220 ISOamso" },
1278{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1279{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1280{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1281{ 8746, "cup", "union = cup, U+222A ISOtech" },
1282{ 8747, "int", "integral, U+222B ISOtech" },
1283{ 8756, "there4","therefore, U+2234 ISOtech" },
1284{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1285{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1286{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1287{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1288{ 8801, "equiv","identical to, U+2261 ISOtech" },
1289{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1290{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1291{ 8834, "sub", "subset of, U+2282 ISOtech" },
1292{ 8835, "sup", "superset of, U+2283 ISOtech" },
1293{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1294{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1295{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1296{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1297{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1298{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1299{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1300{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1301{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1302{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1303{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1304{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1305{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1306{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1307
1308{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1309{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1310{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1311{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1312
1313};
1314
1315/************************************************************************
1316 * *
1317 * Commodity functions to handle entities *
1318 * *
1319 ************************************************************************/
1320
/*
 * Macro used to grow the current buffer.
 *
 * It expects two variables in scope: the xmlChar pointer @buffer and
 * its size counter named <buffer>_size, which is doubled. If the
 * reallocation fails the previous block is released and the enclosing
 * function returns NULL, so this can only be used in functions
 * returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *buffer##_tmp;                                              \
    buffer##_size *= 2;                                                 \
    buffer##_tmp = (xmlChar *)                                          \
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));            \
    if (buffer##_tmp == NULL) {                                         \
        perror("realloc failed");                                       \
        /* the old block is still allocated, do not leak it */          \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = buffer##_tmp;                                              \
}
1332
1333/**
1334 * htmlEntityLookup:
1335 * @name: the entity name
1336 *
1337 * Lookup the given entity in EntitiesTable
1338 *
1339 * TODO: the linear scan is really ugly, an hash table is really needed.
1340 *
1341 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1342 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001343const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001344htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001345 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001346
1347 for (i = 0;i < (sizeof(html40EntitiesTable)/
1348 sizeof(html40EntitiesTable[0]));i++) {
1349 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1350#ifdef DEBUG
1351 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1352#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001353 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001354 }
1355 }
1356 return(NULL);
1357}
1358
1359/**
1360 * htmlEntityValueLookup:
1361 * @value: the entity's unicode value
1362 *
1363 * Lookup the given entity in EntitiesTable
1364 *
1365 * TODO: the linear scan is really ugly, an hash table is really needed.
1366 *
1367 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1368 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001369const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001370htmlEntityValueLookup(unsigned int value) {
1371 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001372#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001373 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001374#endif
1375
1376 for (i = 0;i < (sizeof(html40EntitiesTable)/
1377 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001378 if (html40EntitiesTable[i].value >= value) {
1379 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001380 break;
1381#ifdef DEBUG
1382 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1383#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001384 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001385 }
1386#ifdef DEBUG
1387 if (lv > html40EntitiesTable[i].value) {
1388 xmlGenericError(xmlGenericErrorContext,
1389 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1390 lv, html40EntitiesTable[i].value);
1391 }
1392 lv = html40EntitiesTable[i].value;
1393#endif
1394 }
1395 return(NULL);
1396}
1397
1398/**
1399 * UTF8ToHtml:
1400 * @out: a pointer to an array of bytes to store the result
1401 * @outlen: the length of @out
1402 * @in: a pointer to an array of UTF-8 chars
1403 * @inlen: the length of @in
1404 *
1405 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1406 * plus HTML entities block of chars out.
1407 *
1408 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1409 * The value of @inlen after return is the number of octets consumed
1410 * as the return value is positive, else unpredictiable.
1411 * The value of @outlen after return is the number of octets consumed.
1412 */
1413int
1414UTF8ToHtml(unsigned char* out, int *outlen,
1415 const unsigned char* in, int *inlen) {
1416 const unsigned char* processed = in;
1417 const unsigned char* outend;
1418 const unsigned char* outstart = out;
1419 const unsigned char* instart = in;
1420 const unsigned char* inend;
1421 unsigned int c, d;
1422 int trailing;
1423
1424 if (in == NULL) {
1425 /*
1426 * initialization nothing to do
1427 */
1428 *outlen = 0;
1429 *inlen = 0;
1430 return(0);
1431 }
1432 inend = in + (*inlen);
1433 outend = out + (*outlen);
1434 while (in < inend) {
1435 d = *in++;
1436 if (d < 0x80) { c= d; trailing= 0; }
1437 else if (d < 0xC0) {
1438 /* trailing byte in leading position */
1439 *outlen = out - outstart;
1440 *inlen = processed - instart;
1441 return(-2);
1442 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1443 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1444 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1445 else {
1446 /* no chance for this in Ascii */
1447 *outlen = out - outstart;
1448 *inlen = processed - instart;
1449 return(-2);
1450 }
1451
1452 if (inend - in < trailing) {
1453 break;
1454 }
1455
1456 for ( ; trailing; trailing--) {
1457 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1458 break;
1459 c <<= 6;
1460 c |= d & 0x3F;
1461 }
1462
1463 /* assertion: c is a single UTF-4 value */
1464 if (c < 0x80) {
1465 if (out + 1 >= outend)
1466 break;
1467 *out++ = c;
1468 } else {
1469 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001470 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001471
1472 /*
1473 * Try to lookup a predefined HTML entity for it
1474 */
1475
1476 ent = htmlEntityValueLookup(c);
1477 if (ent == NULL) {
1478 /* no chance for this in Ascii */
1479 *outlen = out - outstart;
1480 *inlen = processed - instart;
1481 return(-2);
1482 }
1483 len = strlen(ent->name);
1484 if (out + 2 + len >= outend)
1485 break;
1486 *out++ = '&';
1487 memcpy(out, ent->name, len);
1488 out += len;
1489 *out++ = ';';
1490 }
1491 processed = in;
1492 }
1493 *outlen = out - outstart;
1494 *inlen = processed - instart;
1495 return(0);
1496}
1497
1498/**
1499 * htmlEncodeEntities:
1500 * @out: a pointer to an array of bytes to store the result
1501 * @outlen: the length of @out
1502 * @in: a pointer to an array of UTF-8 chars
1503 * @inlen: the length of @in
1504 * @quoteChar: the quote character to escape (' or ") or zero.
1505 *
1506 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1507 * plus HTML entities block of chars out.
1508 *
1509 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1510 * The value of @inlen after return is the number of octets consumed
1511 * as the return value is positive, else unpredictiable.
1512 * The value of @outlen after return is the number of octets consumed.
1513 */
1514int
1515htmlEncodeEntities(unsigned char* out, int *outlen,
1516 const unsigned char* in, int *inlen, int quoteChar) {
1517 const unsigned char* processed = in;
1518 const unsigned char* outend = out + (*outlen);
1519 const unsigned char* outstart = out;
1520 const unsigned char* instart = in;
1521 const unsigned char* inend = in + (*inlen);
1522 unsigned int c, d;
1523 int trailing;
1524
1525 while (in < inend) {
1526 d = *in++;
1527 if (d < 0x80) { c= d; trailing= 0; }
1528 else if (d < 0xC0) {
1529 /* trailing byte in leading position */
1530 *outlen = out - outstart;
1531 *inlen = processed - instart;
1532 return(-2);
1533 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1534 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1535 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1536 else {
1537 /* no chance for this in Ascii */
1538 *outlen = out - outstart;
1539 *inlen = processed - instart;
1540 return(-2);
1541 }
1542
1543 if (inend - in < trailing)
1544 break;
1545
1546 while (trailing--) {
1547 if (((d= *in++) & 0xC0) != 0x80) {
1548 *outlen = out - outstart;
1549 *inlen = processed - instart;
1550 return(-2);
1551 }
1552 c <<= 6;
1553 c |= d & 0x3F;
1554 }
1555
1556 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001557 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1558 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001559 if (out >= outend)
1560 break;
1561 *out++ = c;
1562 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001563 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001564 const char *cp;
1565 char nbuf[16];
1566 int len;
1567
1568 /*
1569 * Try to lookup a predefined HTML entity for it
1570 */
1571 ent = htmlEntityValueLookup(c);
1572 if (ent == NULL) {
1573 sprintf(nbuf, "#%u", c);
1574 cp = nbuf;
1575 }
1576 else
1577 cp = ent->name;
1578 len = strlen(cp);
1579 if (out + 2 + len > outend)
1580 break;
1581 *out++ = '&';
1582 memcpy(out, cp, len);
1583 out += len;
1584 *out++ = ';';
1585 }
1586 processed = in;
1587 }
1588 *outlen = out - outstart;
1589 *inlen = processed - instart;
1590 return(0);
1591}
1592
1593/**
1594 * htmlDecodeEntities:
1595 * @ctxt: the parser context
1596 * @len: the len to decode (in bytes !), -1 for no size limit
1597 * @end: an end marker xmlChar, 0 if none
1598 * @end2: an end marker xmlChar, 0 if none
1599 * @end3: an end marker xmlChar, 0 if none
1600 *
1601 * Subtitute the HTML entities by their value
1602 *
1603 * DEPRECATED !!!!
1604 *
1605 * Returns A newly allocated string with the substitution done. The caller
1606 * must deallocate it !
1607 */
1608xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001609htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1610 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001611 static int deprecated = 0;
1612 if (!deprecated) {
1613 xmlGenericError(xmlGenericErrorContext,
1614 "htmlDecodeEntities() deprecated function reached\n");
1615 deprecated = 1;
1616 }
1617 return(NULL);
1618#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001619 xmlChar *name = NULL;
1620 xmlChar *buffer = NULL;
1621 unsigned int buffer_size = 0;
1622 unsigned int nbchars = 0;
1623 htmlEntityDescPtr ent;
1624 unsigned int max = (unsigned int) len;
1625 int c,l;
1626
1627 if (ctxt->depth > 40) {
1628 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1629 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1630 ctxt->sax->error(ctxt->userData,
1631 "Detected entity reference loop\n");
1632 ctxt->wellFormed = 0;
1633 ctxt->disableSAX = 1;
1634 return(NULL);
1635 }
1636
1637 /*
1638 * allocate a translation buffer.
1639 */
1640 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1641 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1642 if (buffer == NULL) {
1643 perror("xmlDecodeEntities: malloc failed");
1644 return(NULL);
1645 }
1646
1647 /*
1648 * Ok loop until we reach one of the ending char or a size limit.
1649 */
1650 c = CUR_CHAR(l);
1651 while ((nbchars < max) && (c != end) &&
1652 (c != end2) && (c != end3)) {
1653
1654 if (c == 0) break;
1655 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1656 int val = htmlParseCharRef(ctxt);
1657 COPY_BUF(0,buffer,nbchars,val);
1658 NEXTL(l);
1659 } else if ((c == '&') && (ctxt->token != '&')) {
1660 ent = htmlParseEntityRef(ctxt, &name);
1661 if (name != NULL) {
1662 if (ent != NULL) {
1663 int val = ent->value;
1664 COPY_BUF(0,buffer,nbchars,val);
1665 NEXTL(l);
1666 } else {
1667 const xmlChar *cur = name;
1668
1669 buffer[nbchars++] = '&';
1670 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1671 growBuffer(buffer);
1672 }
1673 while (*cur != 0) {
1674 buffer[nbchars++] = *cur++;
1675 }
1676 buffer[nbchars++] = ';';
1677 }
1678 }
1679 } else {
1680 COPY_BUF(l,buffer,nbchars,c);
1681 NEXTL(l);
1682 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1683 growBuffer(buffer);
1684 }
1685 }
1686 c = CUR_CHAR(l);
1687 }
1688 buffer[nbchars++] = 0;
1689 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001690#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001691}
1692
1693/************************************************************************
1694 * *
1695 * Commodity functions to handle streams *
1696 * *
1697 ************************************************************************/
1698
1699/**
Owen Taylor3473f882001-02-23 17:55:21 +00001700 * htmlNewInputStream:
1701 * @ctxt: an HTML parser context
1702 *
1703 * Create a new input stream structure
1704 * Returns the new input stream or NULL
1705 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001706static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001707htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1708 htmlParserInputPtr input;
1709
1710 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1711 if (input == NULL) {
1712 ctxt->errNo = XML_ERR_NO_MEMORY;
1713 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1714 ctxt->sax->error(ctxt->userData,
1715 "malloc: couldn't allocate a new input stream\n");
1716 return(NULL);
1717 }
1718 memset(input, 0, sizeof(htmlParserInput));
1719 input->filename = NULL;
1720 input->directory = NULL;
1721 input->base = NULL;
1722 input->cur = NULL;
1723 input->buf = NULL;
1724 input->line = 1;
1725 input->col = 1;
1726 input->buf = NULL;
1727 input->free = NULL;
1728 input->version = NULL;
1729 input->consumed = 0;
1730 input->length = 0;
1731 return(input);
1732}
1733
1734
1735/************************************************************************
1736 * *
1737 * Commodity functions, cleanup needed ? *
1738 * *
1739 ************************************************************************/
1740
1741/**
1742 * areBlanks:
1743 * @ctxt: an HTML parser context
1744 * @str: a xmlChar *
1745 * @len: the size of @str
1746 *
1747 * Is this a sequence of blank chars that one can ignore ?
1748 *
1749 * Returns 1 if ignorable 0 otherwise.
1750 */
1751
1752static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1753 int i;
1754 xmlNodePtr lastChild;
1755
1756 for (i = 0;i < len;i++)
1757 if (!(IS_BLANK(str[i]))) return(0);
1758
1759 if (CUR == 0) return(1);
1760 if (CUR != '<') return(0);
1761 if (ctxt->name == NULL)
1762 return(1);
1763 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1764 return(1);
1765 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1766 return(1);
1767 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1768 return(1);
1769 if (ctxt->node == NULL) return(0);
1770 lastChild = xmlGetLastChild(ctxt->node);
1771 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001772 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1773 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001774 } else if (xmlNodeIsText(lastChild)) {
1775 return(0);
1776 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1777 return(0);
1778 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1779 return(0);
1780 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1781 return(0);
1782 }
1783 return(1);
1784}
1785
1786/**
Owen Taylor3473f882001-02-23 17:55:21 +00001787 * htmlNewDocNoDtD:
1788 * @URI: URI for the dtd, or NULL
1789 * @ExternalID: the external ID of the DTD, or NULL
1790 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001791 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1792 * are NULL
1793 *
Owen Taylor3473f882001-02-23 17:55:21 +00001794 * Returns a new document, do not intialize the DTD if not provided
1795 */
1796htmlDocPtr
1797htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1798 xmlDocPtr cur;
1799
1800 /*
1801 * Allocate a new document and fill the fields.
1802 */
1803 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1804 if (cur == NULL) {
1805 xmlGenericError(xmlGenericErrorContext,
1806 "xmlNewDoc : malloc failed\n");
1807 return(NULL);
1808 }
1809 memset(cur, 0, sizeof(xmlDoc));
1810
1811 cur->type = XML_HTML_DOCUMENT_NODE;
1812 cur->version = NULL;
1813 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001814 cur->doc = cur;
1815 cur->name = NULL;
1816 cur->children = NULL;
1817 cur->extSubset = NULL;
1818 cur->oldNs = NULL;
1819 cur->encoding = NULL;
1820 cur->standalone = 1;
1821 cur->compression = 0;
1822 cur->ids = NULL;
1823 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001824 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001825 if ((ExternalID != NULL) ||
1826 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001827 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001828 return(cur);
1829}
1830
1831/**
1832 * htmlNewDoc:
1833 * @URI: URI for the dtd, or NULL
1834 * @ExternalID: the external ID of the DTD, or NULL
1835 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001836 * Creates a new HTML document
1837 *
Owen Taylor3473f882001-02-23 17:55:21 +00001838 * Returns a new document
1839 */
1840htmlDocPtr
1841htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1842 if ((URI == NULL) && (ExternalID == NULL))
1843 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001844 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1845 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001846
1847 return(htmlNewDocNoDtD(URI, ExternalID));
1848}
1849
1850
1851/************************************************************************
1852 * *
1853 * The parser itself *
1854 * Relates to http://www.w3.org/TR/html40 *
1855 * *
1856 ************************************************************************/
1857
1858/************************************************************************
1859 * *
1860 * The parser itself *
1861 * *
1862 ************************************************************************/
1863
1864/**
1865 * htmlParseHTMLName:
1866 * @ctxt: an HTML parser context
1867 *
1868 * parse an HTML tag or attribute name, note that we convert it to lowercase
1869 * since HTML names are not case-sensitive.
1870 *
1871 * Returns the Tag Name parsed or NULL
1872 */
1873
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001874static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001875htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1876 xmlChar *ret = NULL;
1877 int i = 0;
1878 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1879
1880 if (!IS_LETTER(CUR) && (CUR != '_') &&
1881 (CUR != ':')) return(NULL);
1882
1883 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1884 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1885 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1886 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1887 else loc[i] = CUR;
1888 i++;
1889
1890 NEXT;
1891 }
1892
1893 ret = xmlStrndup(loc, i);
1894
1895 return(ret);
1896}
1897
1898/**
1899 * htmlParseName:
1900 * @ctxt: an HTML parser context
1901 *
1902 * parse an HTML name, this routine is case sensistive.
1903 *
1904 * Returns the Name parsed or NULL
1905 */
1906
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001907static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001908htmlParseName(htmlParserCtxtPtr ctxt) {
1909 xmlChar buf[HTML_MAX_NAMELEN];
1910 int len = 0;
1911
1912 GROW;
1913 if (!IS_LETTER(CUR) && (CUR != '_')) {
1914 return(NULL);
1915 }
1916
1917 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1918 (CUR == '.') || (CUR == '-') ||
1919 (CUR == '_') || (CUR == ':') ||
1920 (IS_COMBINING(CUR)) ||
1921 (IS_EXTENDER(CUR))) {
1922 buf[len++] = CUR;
1923 NEXT;
1924 if (len >= HTML_MAX_NAMELEN) {
1925 xmlGenericError(xmlGenericErrorContext,
1926 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1927 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1928 (CUR == '.') || (CUR == '-') ||
1929 (CUR == '_') || (CUR == ':') ||
1930 (IS_COMBINING(CUR)) ||
1931 (IS_EXTENDER(CUR)))
1932 NEXT;
1933 break;
1934 }
1935 }
1936 return(xmlStrndup(buf, len));
1937}
1938
1939/**
1940 * htmlParseHTMLAttribute:
1941 * @ctxt: an HTML parser context
1942 * @stop: a char stop value
1943 *
1944 * parse an HTML attribute value till the stop (quote), if
1945 * stop is 0 then it stops at the first space
1946 *
1947 * Returns the attribute parsed or NULL
1948 */
1949
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001950static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001951htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1952 xmlChar *buffer = NULL;
1953 int buffer_size = 0;
1954 xmlChar *out = NULL;
1955 xmlChar *name = NULL;
1956
1957 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001958 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001959
1960 /*
1961 * allocate a translation buffer.
1962 */
1963 buffer_size = HTML_PARSER_BUFFER_SIZE;
1964 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1965 if (buffer == NULL) {
1966 perror("htmlParseHTMLAttribute: malloc failed");
1967 return(NULL);
1968 }
1969 out = buffer;
1970
1971 /*
1972 * Ok loop until we reach one of the ending chars
1973 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00001974 while ((CUR != 0) && (CUR != stop)) {
1975 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001976 if ((stop == 0) && (IS_BLANK(CUR))) break;
1977 if (CUR == '&') {
1978 if (NXT(1) == '#') {
1979 unsigned int c;
1980 int bits;
1981
1982 c = htmlParseCharRef(ctxt);
1983 if (c < 0x80)
1984 { *out++ = c; bits= -6; }
1985 else if (c < 0x800)
1986 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1987 else if (c < 0x10000)
1988 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1989 else
1990 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1991
1992 for ( ; bits >= 0; bits-= 6) {
1993 *out++ = ((c >> bits) & 0x3F) | 0x80;
1994 }
1995 } else {
1996 ent = htmlParseEntityRef(ctxt, &name);
1997 if (name == NULL) {
1998 *out++ = '&';
1999 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002000 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002001
2002 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002003 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002004 }
2005 } else if (ent == NULL) {
2006 *out++ = '&';
2007 cur = name;
2008 while (*cur != 0) {
2009 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002010 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002011
2012 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002013 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002014 }
2015 *out++ = *cur++;
2016 }
2017 xmlFree(name);
2018 } else {
2019 unsigned int c;
2020 int bits;
2021
2022 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002023 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002024
2025 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002026 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002027 }
2028 c = (xmlChar)ent->value;
2029 if (c < 0x80)
2030 { *out++ = c; bits= -6; }
2031 else if (c < 0x800)
2032 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2033 else if (c < 0x10000)
2034 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2035 else
2036 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2037
2038 for ( ; bits >= 0; bits-= 6) {
2039 *out++ = ((c >> bits) & 0x3F) | 0x80;
2040 }
2041 xmlFree(name);
2042 }
2043 }
2044 } else {
2045 unsigned int c;
2046 int bits, l;
2047
2048 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002049 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002050
2051 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002052 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002053 }
2054 c = CUR_CHAR(l);
2055 if (c < 0x80)
2056 { *out++ = c; bits= -6; }
2057 else if (c < 0x800)
2058 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2059 else if (c < 0x10000)
2060 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2061 else
2062 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2063
2064 for ( ; bits >= 0; bits-= 6) {
2065 *out++ = ((c >> bits) & 0x3F) | 0x80;
2066 }
2067 NEXT;
2068 }
2069 }
2070 *out++ = 0;
2071 return(buffer);
2072}
2073
2074/**
Owen Taylor3473f882001-02-23 17:55:21 +00002075 * htmlParseEntityRef:
2076 * @ctxt: an HTML parser context
2077 * @str: location to store the entity name
2078 *
2079 * parse an HTML ENTITY references
2080 *
2081 * [68] EntityRef ::= '&' Name ';'
2082 *
2083 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2084 * if non-NULL *str will have to be freed by the caller.
2085 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002086const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002087htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2088 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002089 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002090 *str = NULL;
2091
2092 if (CUR == '&') {
2093 NEXT;
2094 name = htmlParseName(ctxt);
2095 if (name == NULL) {
2096 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2097 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2098 ctxt->wellFormed = 0;
2099 } else {
2100 GROW;
2101 if (CUR == ';') {
2102 *str = name;
2103
2104 /*
2105 * Lookup the entity in the table.
2106 */
2107 ent = htmlEntityLookup(name);
2108 if (ent != NULL) /* OK that's ugly !!! */
2109 NEXT;
2110 } else {
2111 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2112 ctxt->sax->error(ctxt->userData,
2113 "htmlParseEntityRef: expecting ';'\n");
2114 *str = name;
2115 }
2116 }
2117 }
2118 return(ent);
2119}
2120
2121/**
2122 * htmlParseAttValue:
2123 * @ctxt: an HTML parser context
2124 *
2125 * parse a value for an attribute
2126 * Note: the parser won't do substitution of entities here, this
2127 * will be handled later in xmlStringGetNodeList, unless it was
2128 * asked for ctxt->replaceEntities != 0
2129 *
2130 * Returns the AttValue parsed or NULL.
2131 */
2132
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002133static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002134htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2135 xmlChar *ret = NULL;
2136
2137 if (CUR == '"') {
2138 NEXT;
2139 ret = htmlParseHTMLAttribute(ctxt, '"');
2140 if (CUR != '"') {
2141 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2142 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2143 ctxt->wellFormed = 0;
2144 } else
2145 NEXT;
2146 } else if (CUR == '\'') {
2147 NEXT;
2148 ret = htmlParseHTMLAttribute(ctxt, '\'');
2149 if (CUR != '\'') {
2150 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2151 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2152 ctxt->wellFormed = 0;
2153 } else
2154 NEXT;
2155 } else {
2156 /*
2157 * That's an HTMLism, the attribute value may not be quoted
2158 */
2159 ret = htmlParseHTMLAttribute(ctxt, 0);
2160 if (ret == NULL) {
2161 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2162 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2163 ctxt->wellFormed = 0;
2164 }
2165 }
2166 return(ret);
2167}
2168
2169/**
2170 * htmlParseSystemLiteral:
2171 * @ctxt: an HTML parser context
2172 *
2173 * parse an HTML Literal
2174 *
2175 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2176 *
2177 * Returns the SystemLiteral parsed or NULL
2178 */
2179
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002180static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002181htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2182 const xmlChar *q;
2183 xmlChar *ret = NULL;
2184
2185 if (CUR == '"') {
2186 NEXT;
2187 q = CUR_PTR;
2188 while ((IS_CHAR(CUR)) && (CUR != '"'))
2189 NEXT;
2190 if (!IS_CHAR(CUR)) {
2191 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2192 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2193 ctxt->wellFormed = 0;
2194 } else {
2195 ret = xmlStrndup(q, CUR_PTR - q);
2196 NEXT;
2197 }
2198 } else if (CUR == '\'') {
2199 NEXT;
2200 q = CUR_PTR;
2201 while ((IS_CHAR(CUR)) && (CUR != '\''))
2202 NEXT;
2203 if (!IS_CHAR(CUR)) {
2204 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2205 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2206 ctxt->wellFormed = 0;
2207 } else {
2208 ret = xmlStrndup(q, CUR_PTR - q);
2209 NEXT;
2210 }
2211 } else {
2212 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2213 ctxt->sax->error(ctxt->userData,
2214 "SystemLiteral \" or ' expected\n");
2215 ctxt->wellFormed = 0;
2216 }
2217
2218 return(ret);
2219}
2220
2221/**
2222 * htmlParsePubidLiteral:
2223 * @ctxt: an HTML parser context
2224 *
2225 * parse an HTML public literal
2226 *
2227 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2228 *
2229 * Returns the PubidLiteral parsed or NULL.
2230 */
2231
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002232static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002233htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2234 const xmlChar *q;
2235 xmlChar *ret = NULL;
2236 /*
2237 * Name ::= (Letter | '_') (NameChar)*
2238 */
2239 if (CUR == '"') {
2240 NEXT;
2241 q = CUR_PTR;
2242 while (IS_PUBIDCHAR(CUR)) NEXT;
2243 if (CUR != '"') {
2244 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2245 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2246 ctxt->wellFormed = 0;
2247 } else {
2248 ret = xmlStrndup(q, CUR_PTR - q);
2249 NEXT;
2250 }
2251 } else if (CUR == '\'') {
2252 NEXT;
2253 q = CUR_PTR;
2254 while ((IS_LETTER(CUR)) && (CUR != '\''))
2255 NEXT;
2256 if (!IS_LETTER(CUR)) {
2257 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2258 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2259 ctxt->wellFormed = 0;
2260 } else {
2261 ret = xmlStrndup(q, CUR_PTR - q);
2262 NEXT;
2263 }
2264 } else {
2265 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2266 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2267 ctxt->wellFormed = 0;
2268 }
2269
2270 return(ret);
2271}
2272
2273/**
2274 * htmlParseScript:
2275 * @ctxt: an HTML parser context
2276 *
2277 * parse the content of an HTML SCRIPT or STYLE element
2278 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2279 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2280 * http://www.w3.org/TR/html4/types.html#type-script
2281 * http://www.w3.org/TR/html4/types.html#h-6.15
2282 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2283 *
2284 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2285 * element and the value of intrinsic event attributes. User agents must
2286 * not evaluate script data as HTML markup but instead must pass it on as
2287 * data to a script engine.
2288 * NOTES:
2289 * - The content is passed like CDATA
2290 * - the attributes for style and scripting "onXXX" are also described
2291 * as CDATA but SGML allows entities references in attributes so their
2292 * processing is identical as other attributes
2293 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002294static void
Owen Taylor3473f882001-02-23 17:55:21 +00002295htmlParseScript(htmlParserCtxtPtr ctxt) {
2296 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2297 int nbchar = 0;
2298 xmlChar cur;
2299
2300 SHRINK;
2301 cur = CUR;
2302 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002303 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2304 (NXT(3) == '-')) {
2305 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2306 if (ctxt->sax->cdataBlock!= NULL) {
2307 /*
2308 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2309 */
2310 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2311 }
2312 }
2313 nbchar = 0;
2314 htmlParseComment(ctxt);
2315 cur = CUR;
2316 continue;
2317 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002318 /*
2319 * One should break here, the specification is clear:
2320 * Authors should therefore escape "</" within the content.
2321 * Escape mechanisms are specific to each scripting or
2322 * style sheet language.
2323 */
2324 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2325 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2326 break; /* while */
2327 }
2328 buf[nbchar++] = cur;
2329 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2330 if (ctxt->sax->cdataBlock!= NULL) {
2331 /*
2332 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2333 */
2334 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2335 }
2336 nbchar = 0;
2337 }
2338 NEXT;
2339 cur = CUR;
2340 }
2341 if (!(IS_CHAR(cur))) {
2342 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2343 ctxt->sax->error(ctxt->userData,
2344 "Invalid char in CDATA 0x%X\n", cur);
2345 ctxt->wellFormed = 0;
2346 NEXT;
2347 }
2348
2349 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2350 if (ctxt->sax->cdataBlock!= NULL) {
2351 /*
2352 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2353 */
2354 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2355 }
2356 }
2357}
2358
2359
2360/**
2361 * htmlParseCharData:
2362 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002363 *
2364 * parse a CharData section.
2365 * if we are within a CDATA section ']]>' marks an end of section.
2366 *
2367 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2368 */
2369
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002370static void
2371htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002372 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2373 int nbchar = 0;
2374 int cur, l;
2375
2376 SHRINK;
2377 cur = CUR_CHAR(l);
2378 while (((cur != '<') || (ctxt->token == '<')) &&
2379 ((cur != '&') || (ctxt->token == '&')) &&
2380 (IS_CHAR(cur))) {
2381 COPY_BUF(l,buf,nbchar,cur);
2382 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2383 /*
2384 * Ok the segment is to be consumed as chars.
2385 */
2386 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2387 if (areBlanks(ctxt, buf, nbchar)) {
2388 if (ctxt->sax->ignorableWhitespace != NULL)
2389 ctxt->sax->ignorableWhitespace(ctxt->userData,
2390 buf, nbchar);
2391 } else {
2392 htmlCheckParagraph(ctxt);
2393 if (ctxt->sax->characters != NULL)
2394 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2395 }
2396 }
2397 nbchar = 0;
2398 }
2399 NEXTL(l);
2400 cur = CUR_CHAR(l);
2401 }
2402 if (nbchar != 0) {
2403 /*
2404 * Ok the segment is to be consumed as chars.
2405 */
2406 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2407 if (areBlanks(ctxt, buf, nbchar)) {
2408 if (ctxt->sax->ignorableWhitespace != NULL)
2409 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2410 } else {
2411 htmlCheckParagraph(ctxt);
2412 if (ctxt->sax->characters != NULL)
2413 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2414 }
2415 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002416 } else {
2417 /*
2418 * Loop detection
2419 */
2420 if (cur == 0)
2421 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002422 }
2423}
2424
2425/**
2426 * htmlParseExternalID:
2427 * @ctxt: an HTML parser context
2428 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002429 *
2430 * Parse an External ID or a Public ID
2431 *
Owen Taylor3473f882001-02-23 17:55:21 +00002432 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2433 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2434 *
2435 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2436 *
2437 * Returns the function returns SystemLiteral and in the second
2438 * case publicID receives PubidLiteral, is strict is off
2439 * it is possible to return NULL and have publicID set.
2440 */
2441
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002442static xmlChar *
2443htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002444 xmlChar *URI = NULL;
2445
2446 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2447 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2448 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2449 SKIP(6);
2450 if (!IS_BLANK(CUR)) {
2451 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2452 ctxt->sax->error(ctxt->userData,
2453 "Space required after 'SYSTEM'\n");
2454 ctxt->wellFormed = 0;
2455 }
2456 SKIP_BLANKS;
2457 URI = htmlParseSystemLiteral(ctxt);
2458 if (URI == NULL) {
2459 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2460 ctxt->sax->error(ctxt->userData,
2461 "htmlParseExternalID: SYSTEM, no URI\n");
2462 ctxt->wellFormed = 0;
2463 }
2464 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2465 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2466 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2467 SKIP(6);
2468 if (!IS_BLANK(CUR)) {
2469 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2470 ctxt->sax->error(ctxt->userData,
2471 "Space required after 'PUBLIC'\n");
2472 ctxt->wellFormed = 0;
2473 }
2474 SKIP_BLANKS;
2475 *publicID = htmlParsePubidLiteral(ctxt);
2476 if (*publicID == NULL) {
2477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2478 ctxt->sax->error(ctxt->userData,
2479 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2480 ctxt->wellFormed = 0;
2481 }
2482 SKIP_BLANKS;
2483 if ((CUR == '"') || (CUR == '\'')) {
2484 URI = htmlParseSystemLiteral(ctxt);
2485 }
2486 }
2487 return(URI);
2488}
2489
2490/**
2491 * htmlParseComment:
2492 * @ctxt: an HTML parser context
2493 *
2494 * Parse an XML (SGML) comment <!-- .... -->
2495 *
2496 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2497 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002498static void
Owen Taylor3473f882001-02-23 17:55:21 +00002499htmlParseComment(htmlParserCtxtPtr ctxt) {
2500 xmlChar *buf = NULL;
2501 int len;
2502 int size = HTML_PARSER_BUFFER_SIZE;
2503 int q, ql;
2504 int r, rl;
2505 int cur, l;
2506 xmlParserInputState state;
2507
2508 /*
2509 * Check that there is a comment right here.
2510 */
2511 if ((RAW != '<') || (NXT(1) != '!') ||
2512 (NXT(2) != '-') || (NXT(3) != '-')) return;
2513
2514 state = ctxt->instate;
2515 ctxt->instate = XML_PARSER_COMMENT;
2516 SHRINK;
2517 SKIP(4);
2518 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2519 if (buf == NULL) {
2520 xmlGenericError(xmlGenericErrorContext,
2521 "malloc of %d byte failed\n", size);
2522 ctxt->instate = state;
2523 return;
2524 }
2525 q = CUR_CHAR(ql);
2526 NEXTL(ql);
2527 r = CUR_CHAR(rl);
2528 NEXTL(rl);
2529 cur = CUR_CHAR(l);
2530 len = 0;
2531 while (IS_CHAR(cur) &&
2532 ((cur != '>') ||
2533 (r != '-') || (q != '-'))) {
2534 if (len + 5 >= size) {
2535 size *= 2;
2536 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2537 if (buf == NULL) {
2538 xmlGenericError(xmlGenericErrorContext,
2539 "realloc of %d byte failed\n", size);
2540 ctxt->instate = state;
2541 return;
2542 }
2543 }
2544 COPY_BUF(ql,buf,len,q);
2545 q = r;
2546 ql = rl;
2547 r = cur;
2548 rl = l;
2549 NEXTL(l);
2550 cur = CUR_CHAR(l);
2551 if (cur == 0) {
2552 SHRINK;
2553 GROW;
2554 cur = CUR_CHAR(l);
2555 }
2556 }
2557 buf[len] = 0;
2558 if (!IS_CHAR(cur)) {
2559 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2560 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2561 ctxt->sax->error(ctxt->userData,
2562 "Comment not terminated \n<!--%.50s\n", buf);
2563 ctxt->wellFormed = 0;
2564 xmlFree(buf);
2565 } else {
2566 NEXT;
2567 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2568 (!ctxt->disableSAX))
2569 ctxt->sax->comment(ctxt->userData, buf);
2570 xmlFree(buf);
2571 }
2572 ctxt->instate = state;
2573}
2574
2575/**
2576 * htmlParseCharRef:
2577 * @ctxt: an HTML parser context
2578 *
2579 * parse Reference declarations
2580 *
2581 * [66] CharRef ::= '&#' [0-9]+ ';' |
2582 * '&#x' [0-9a-fA-F]+ ';'
2583 *
2584 * Returns the value parsed (as an int)
2585 */
2586int
2587htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2588 int val = 0;
2589
2590 if ((CUR == '&') && (NXT(1) == '#') &&
2591 (NXT(2) == 'x')) {
2592 SKIP(3);
2593 while (CUR != ';') {
2594 if ((CUR >= '0') && (CUR <= '9'))
2595 val = val * 16 + (CUR - '0');
2596 else if ((CUR >= 'a') && (CUR <= 'f'))
2597 val = val * 16 + (CUR - 'a') + 10;
2598 else if ((CUR >= 'A') && (CUR <= 'F'))
2599 val = val * 16 + (CUR - 'A') + 10;
2600 else {
2601 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2602 ctxt->sax->error(ctxt->userData,
2603 "htmlParseCharRef: invalid hexadecimal value\n");
2604 ctxt->wellFormed = 0;
2605 return(0);
2606 }
2607 NEXT;
2608 }
2609 if (CUR == ';')
2610 NEXT;
2611 } else if ((CUR == '&') && (NXT(1) == '#')) {
2612 SKIP(2);
2613 while (CUR != ';') {
2614 if ((CUR >= '0') && (CUR <= '9'))
2615 val = val * 10 + (CUR - '0');
2616 else {
2617 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2618 ctxt->sax->error(ctxt->userData,
2619 "htmlParseCharRef: invalid decimal value\n");
2620 ctxt->wellFormed = 0;
2621 return(0);
2622 }
2623 NEXT;
2624 }
2625 if (CUR == ';')
2626 NEXT;
2627 } else {
2628 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2629 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2630 ctxt->wellFormed = 0;
2631 }
2632 /*
2633 * Check the value IS_CHAR ...
2634 */
2635 if (IS_CHAR(val)) {
2636 return(val);
2637 } else {
2638 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2639 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2640 val);
2641 ctxt->wellFormed = 0;
2642 }
2643 return(0);
2644}
2645
2646
2647/**
2648 * htmlParseDocTypeDecl :
2649 * @ctxt: an HTML parser context
2650 *
2651 * parse a DOCTYPE declaration
2652 *
2653 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2654 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2655 */
2656
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002657static void
Owen Taylor3473f882001-02-23 17:55:21 +00002658htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2659 xmlChar *name;
2660 xmlChar *ExternalID = NULL;
2661 xmlChar *URI = NULL;
2662
2663 /*
2664 * We know that '<!DOCTYPE' has been detected.
2665 */
2666 SKIP(9);
2667
2668 SKIP_BLANKS;
2669
2670 /*
2671 * Parse the DOCTYPE name.
2672 */
2673 name = htmlParseName(ctxt);
2674 if (name == NULL) {
2675 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2676 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2677 ctxt->wellFormed = 0;
2678 }
2679 /*
2680 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2681 */
2682
2683 SKIP_BLANKS;
2684
2685 /*
2686 * Check for SystemID and ExternalID
2687 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002688 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002689 SKIP_BLANKS;
2690
2691 /*
2692 * We should be at the end of the DOCTYPE declaration.
2693 */
2694 if (CUR != '>') {
2695 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002696 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002697 ctxt->wellFormed = 0;
2698 /* We shouldn't try to resynchronize ... */
2699 }
2700 NEXT;
2701
2702 /*
2703 * Create or update the document accordingly to the DOCTYPE
2704 */
2705 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2706 (!ctxt->disableSAX))
2707 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2708
2709 /*
2710 * Cleanup, since we don't use all those identifiers
2711 */
2712 if (URI != NULL) xmlFree(URI);
2713 if (ExternalID != NULL) xmlFree(ExternalID);
2714 if (name != NULL) xmlFree(name);
2715}
2716
2717/**
2718 * htmlParseAttribute:
2719 * @ctxt: an HTML parser context
2720 * @value: a xmlChar ** used to store the value of the attribute
2721 *
2722 * parse an attribute
2723 *
2724 * [41] Attribute ::= Name Eq AttValue
2725 *
2726 * [25] Eq ::= S? '=' S?
2727 *
2728 * With namespace:
2729 *
2730 * [NS 11] Attribute ::= QName Eq AttValue
2731 *
2732 * Also the case QName == xmlns:??? is handled independently as a namespace
2733 * definition.
2734 *
2735 * Returns the attribute name, and the value in *value.
2736 */
2737
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002738static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002739htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2740 xmlChar *name, *val = NULL;
2741
2742 *value = NULL;
2743 name = htmlParseHTMLName(ctxt);
2744 if (name == NULL) {
2745 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2746 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2747 ctxt->wellFormed = 0;
2748 return(NULL);
2749 }
2750
2751 /*
2752 * read the value
2753 */
2754 SKIP_BLANKS;
2755 if (CUR == '=') {
2756 NEXT;
2757 SKIP_BLANKS;
2758 val = htmlParseAttValue(ctxt);
2759 /******
2760 } else {
2761 * TODO : some attribute must have values, some may not
2762 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2763 ctxt->sax->warning(ctxt->userData,
2764 "No value for attribute %s\n", name); */
2765 }
2766
2767 *value = val;
2768 return(name);
2769}
2770
2771/**
2772 * htmlCheckEncoding:
2773 * @ctxt: an HTML parser context
2774 * @attvalue: the attribute value
2775 *
2776 * Checks an http-equiv attribute from a Meta tag to detect
2777 * the encoding
2778 * If a new encoding is detected the parser is switched to decode
2779 * it and pass UTF8
2780 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002781static void
Owen Taylor3473f882001-02-23 17:55:21 +00002782htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2783 const xmlChar *encoding;
2784
2785 if ((ctxt == NULL) || (attvalue == NULL))
2786 return;
2787
2788 /* do not change encoding */
2789 if (ctxt->input->encoding != NULL)
2790 return;
2791
2792 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2793 if (encoding != NULL) {
2794 encoding += 8;
2795 } else {
2796 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2797 if (encoding != NULL)
2798 encoding += 9;
2799 }
2800 if (encoding != NULL) {
2801 xmlCharEncoding enc;
2802 xmlCharEncodingHandlerPtr handler;
2803
2804 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2805
2806 if (ctxt->input->encoding != NULL)
2807 xmlFree((xmlChar *) ctxt->input->encoding);
2808 ctxt->input->encoding = xmlStrdup(encoding);
2809
2810 enc = xmlParseCharEncoding((const char *) encoding);
2811 /*
2812 * registered set of known encodings
2813 */
2814 if (enc != XML_CHAR_ENCODING_ERROR) {
2815 xmlSwitchEncoding(ctxt, enc);
2816 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2817 } else {
2818 /*
2819 * fallback for unknown encodings
2820 */
2821 handler = xmlFindCharEncodingHandler((const char *) encoding);
2822 if (handler != NULL) {
2823 xmlSwitchToEncoding(ctxt, handler);
2824 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2825 } else {
2826 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2827 }
2828 }
2829
2830 if ((ctxt->input->buf != NULL) &&
2831 (ctxt->input->buf->encoder != NULL) &&
2832 (ctxt->input->buf->raw != NULL) &&
2833 (ctxt->input->buf->buffer != NULL)) {
2834 int nbchars;
2835 int processed;
2836
2837 /*
2838 * convert as much as possible to the parser reading buffer.
2839 */
2840 processed = ctxt->input->cur - ctxt->input->base;
2841 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2842 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2843 ctxt->input->buf->buffer,
2844 ctxt->input->buf->raw);
2845 if (nbchars < 0) {
2846 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2847 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2848 ctxt->sax->error(ctxt->userData,
2849 "htmlCheckEncoding: encoder error\n");
2850 }
2851 ctxt->input->base =
2852 ctxt->input->cur = ctxt->input->buf->buffer->content;
2853 }
2854 }
2855}
2856
2857/**
2858 * htmlCheckMeta:
2859 * @ctxt: an HTML parser context
2860 * @atts: the attributes values
2861 *
2862 * Checks an attributes from a Meta tag
2863 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002864static void
Owen Taylor3473f882001-02-23 17:55:21 +00002865htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2866 int i;
2867 const xmlChar *att, *value;
2868 int http = 0;
2869 const xmlChar *content = NULL;
2870
2871 if ((ctxt == NULL) || (atts == NULL))
2872 return;
2873
2874 i = 0;
2875 att = atts[i++];
2876 while (att != NULL) {
2877 value = atts[i++];
2878 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2879 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2880 http = 1;
2881 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2882 content = value;
2883 att = atts[i++];
2884 }
2885 if ((http) && (content != NULL))
2886 htmlCheckEncoding(ctxt, content);
2887
2888}
2889
2890/**
2891 * htmlParseStartTag:
2892 * @ctxt: an HTML parser context
2893 *
2894 * parse a start of tag either for rule element or
2895 * EmptyElement. In both case we don't parse the tag closing chars.
2896 *
2897 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2898 *
2899 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2900 *
2901 * With namespace:
2902 *
2903 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2904 *
2905 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2906 *
2907 */
2908
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002909static void
Owen Taylor3473f882001-02-23 17:55:21 +00002910htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2911 xmlChar *name;
2912 xmlChar *attname;
2913 xmlChar *attvalue;
2914 const xmlChar **atts = NULL;
2915 int nbatts = 0;
2916 int maxatts = 0;
2917 int meta = 0;
2918 int i;
2919
2920 if (CUR != '<') return;
2921 NEXT;
2922
2923 GROW;
2924 name = htmlParseHTMLName(ctxt);
2925 if (name == NULL) {
2926 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2927 ctxt->sax->error(ctxt->userData,
2928 "htmlParseStartTag: invalid element name\n");
2929 ctxt->wellFormed = 0;
2930 /* Dump the bogus tag like browsers do */
2931 while ((IS_CHAR(CUR)) && (CUR != '>'))
2932 NEXT;
2933 return;
2934 }
2935 if (xmlStrEqual(name, BAD_CAST"meta"))
2936 meta = 1;
2937
2938 /*
2939 * Check for auto-closure of HTML elements.
2940 */
2941 htmlAutoClose(ctxt, name);
2942
2943 /*
2944 * Check for implied HTML elements.
2945 */
2946 htmlCheckImplied(ctxt, name);
2947
2948 /*
2949 * Avoid html at any level > 0, head at any level != 1
2950 * or any attempt to recurse body
2951 */
2952 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2953 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2954 ctxt->sax->error(ctxt->userData,
2955 "htmlParseStartTag: misplaced <html> tag\n");
2956 ctxt->wellFormed = 0;
2957 xmlFree(name);
2958 return;
2959 }
2960 if ((ctxt->nameNr != 1) &&
2961 (xmlStrEqual(name, BAD_CAST"head"))) {
2962 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2963 ctxt->sax->error(ctxt->userData,
2964 "htmlParseStartTag: misplaced <head> tag\n");
2965 ctxt->wellFormed = 0;
2966 xmlFree(name);
2967 return;
2968 }
2969 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002970 int indx;
2971 for (indx = 0;indx < ctxt->nameNr;indx++) {
2972 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002973 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2974 ctxt->sax->error(ctxt->userData,
2975 "htmlParseStartTag: misplaced <body> tag\n");
2976 ctxt->wellFormed = 0;
2977 xmlFree(name);
2978 return;
2979 }
2980 }
2981 }
2982
2983 /*
2984 * Now parse the attributes, it ends up with the ending
2985 *
2986 * (S Attribute)* S?
2987 */
2988 SKIP_BLANKS;
2989 while ((IS_CHAR(CUR)) &&
2990 (CUR != '>') &&
2991 ((CUR != '/') || (NXT(1) != '>'))) {
2992 long cons = ctxt->nbChars;
2993
2994 GROW;
2995 attname = htmlParseAttribute(ctxt, &attvalue);
2996 if (attname != NULL) {
2997
2998 /*
2999 * Well formedness requires at most one declaration of an attribute
3000 */
3001 for (i = 0; i < nbatts;i += 2) {
3002 if (xmlStrEqual(atts[i], attname)) {
3003 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3004 ctxt->sax->error(ctxt->userData,
3005 "Attribute %s redefined\n",
3006 attname);
3007 ctxt->wellFormed = 0;
3008 xmlFree(attname);
3009 if (attvalue != NULL)
3010 xmlFree(attvalue);
3011 goto failed;
3012 }
3013 }
3014
3015 /*
3016 * Add the pair to atts
3017 */
3018 if (atts == NULL) {
3019 maxatts = 10;
3020 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3021 if (atts == NULL) {
3022 xmlGenericError(xmlGenericErrorContext,
3023 "malloc of %ld byte failed\n",
3024 maxatts * (long)sizeof(xmlChar *));
3025 if (name != NULL) xmlFree(name);
3026 return;
3027 }
3028 } else if (nbatts + 4 > maxatts) {
3029 maxatts *= 2;
3030 atts = (const xmlChar **) xmlRealloc((void *) atts,
3031 maxatts * sizeof(xmlChar *));
3032 if (atts == NULL) {
3033 xmlGenericError(xmlGenericErrorContext,
3034 "realloc of %ld byte failed\n",
3035 maxatts * (long)sizeof(xmlChar *));
3036 if (name != NULL) xmlFree(name);
3037 return;
3038 }
3039 }
3040 atts[nbatts++] = attname;
3041 atts[nbatts++] = attvalue;
3042 atts[nbatts] = NULL;
3043 atts[nbatts + 1] = NULL;
3044 }
3045 else {
3046 /* Dump the bogus attribute string up to the next blank or
3047 * the end of the tag. */
3048 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3049 && ((CUR != '/') || (NXT(1) != '>')))
3050 NEXT;
3051 }
3052
3053failed:
3054 SKIP_BLANKS;
3055 if (cons == ctxt->nbChars) {
3056 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3057 ctxt->sax->error(ctxt->userData,
3058 "htmlParseStartTag: problem parsing attributes\n");
3059 ctxt->wellFormed = 0;
3060 break;
3061 }
3062 }
3063
3064 /*
3065 * Handle specific association to the META tag
3066 */
3067 if (meta)
3068 htmlCheckMeta(ctxt, atts);
3069
3070 /*
3071 * SAX: Start of Element !
3072 */
3073 htmlnamePush(ctxt, xmlStrdup(name));
3074#ifdef DEBUG
3075 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3076#endif
3077 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3078 ctxt->sax->startElement(ctxt->userData, name, atts);
3079
3080 if (atts != NULL) {
3081 for (i = 0;i < nbatts;i++) {
3082 if (atts[i] != NULL)
3083 xmlFree((xmlChar *) atts[i]);
3084 }
3085 xmlFree((void *) atts);
3086 }
3087 if (name != NULL) xmlFree(name);
3088}
3089
3090/**
3091 * htmlParseEndTag:
3092 * @ctxt: an HTML parser context
3093 *
3094 * parse an end of tag
3095 *
3096 * [42] ETag ::= '</' Name S? '>'
3097 *
3098 * With namespace
3099 *
3100 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003101 *
3102 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003103 */
3104
Daniel Veillardf420ac52001-07-04 16:04:09 +00003105static int
Owen Taylor3473f882001-02-23 17:55:21 +00003106htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3107 xmlChar *name;
3108 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003109 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003110
3111 if ((CUR != '<') || (NXT(1) != '/')) {
3112 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3113 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3114 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003115 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003116 }
3117 SKIP(2);
3118
3119 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003120 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003121
3122 /*
3123 * We should definitely be at the ending "S? '>'" part
3124 */
3125 SKIP_BLANKS;
3126 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3127 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3128 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3129 ctxt->wellFormed = 0;
3130 } else
3131 NEXT;
3132
3133 /*
3134 * If the name read is not one of the element in the parsing stack
3135 * then return, it's just an error.
3136 */
3137 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3138 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3139 }
3140 if (i < 0) {
3141 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3142 ctxt->sax->error(ctxt->userData,
3143 "Unexpected end tag : %s\n", name);
3144 xmlFree(name);
3145 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003146 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003147 }
3148
3149
3150 /*
3151 * Check for auto-closure of HTML elements.
3152 */
3153
3154 htmlAutoCloseOnClose(ctxt, name);
3155
3156 /*
3157 * Well formedness constraints, opening and closing must match.
3158 * With the exception that the autoclose may have popped stuff out
3159 * of the stack.
3160 */
3161 if (!xmlStrEqual(name, ctxt->name)) {
3162#ifdef DEBUG
3163 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3164#endif
3165 if ((ctxt->name != NULL) &&
3166 (!xmlStrEqual(ctxt->name, name))) {
3167 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3168 ctxt->sax->error(ctxt->userData,
3169 "Opening and ending tag mismatch: %s and %s\n",
3170 name, ctxt->name);
3171 ctxt->wellFormed = 0;
3172 }
3173 }
3174
3175 /*
3176 * SAX: End of Tag
3177 */
3178 oldname = ctxt->name;
3179 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3180 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3181 ctxt->sax->endElement(ctxt->userData, name);
3182 oldname = htmlnamePop(ctxt);
3183 if (oldname != NULL) {
3184#ifdef DEBUG
3185 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3186#endif
3187 xmlFree(oldname);
3188#ifdef DEBUG
3189 } else {
3190 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3191#endif
3192 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003193 ret = 1;
3194 } else {
3195 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003196 }
3197
3198 if (name != NULL)
3199 xmlFree(name);
3200
Daniel Veillardf420ac52001-07-04 16:04:09 +00003201 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003202}
3203
3204
3205/**
3206 * htmlParseReference:
3207 * @ctxt: an HTML parser context
3208 *
3209 * parse and handle entity references in content,
3210 * this will end-up in a call to character() since this is either a
3211 * CharRef, or a predefined entity.
3212 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003213static void
Owen Taylor3473f882001-02-23 17:55:21 +00003214htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003215 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003216 xmlChar out[6];
3217 xmlChar *name;
3218 if (CUR != '&') return;
3219
3220 if (NXT(1) == '#') {
3221 unsigned int c;
3222 int bits, i = 0;
3223
3224 c = htmlParseCharRef(ctxt);
3225 if (c == 0)
3226 return;
3227
3228 if (c < 0x80) { out[i++]= c; bits= -6; }
3229 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3230 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3231 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3232
3233 for ( ; bits >= 0; bits-= 6) {
3234 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3235 }
3236 out[i] = 0;
3237
3238 htmlCheckParagraph(ctxt);
3239 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3240 ctxt->sax->characters(ctxt->userData, out, i);
3241 } else {
3242 ent = htmlParseEntityRef(ctxt, &name);
3243 if (name == NULL) {
3244 htmlCheckParagraph(ctxt);
3245 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3246 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3247 return;
3248 }
3249 if ((ent == NULL) || (ent->value <= 0)) {
3250 htmlCheckParagraph(ctxt);
3251 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3252 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3253 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3254 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3255 }
3256 } else {
3257 unsigned int c;
3258 int bits, i = 0;
3259
3260 c = ent->value;
3261 if (c < 0x80)
3262 { out[i++]= c; bits= -6; }
3263 else if (c < 0x800)
3264 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3265 else if (c < 0x10000)
3266 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3267 else
3268 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3269
3270 for ( ; bits >= 0; bits-= 6) {
3271 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3272 }
3273 out[i] = 0;
3274
3275 htmlCheckParagraph(ctxt);
3276 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3277 ctxt->sax->characters(ctxt->userData, out, i);
3278 }
3279 xmlFree(name);
3280 }
3281}
3282
3283/**
3284 * htmlParseContent:
3285 * @ctxt: an HTML parser context
3286 * @name: the node name
3287 *
3288 * Parse a content: comment, sub-element, reference or text.
3289 *
3290 */
3291
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003292static void
Owen Taylor3473f882001-02-23 17:55:21 +00003293htmlParseContent(htmlParserCtxtPtr ctxt) {
3294 xmlChar *currentNode;
3295 int depth;
3296
3297 currentNode = xmlStrdup(ctxt->name);
3298 depth = ctxt->nameNr;
3299 while (1) {
3300 long cons = ctxt->nbChars;
3301
3302 GROW;
3303 /*
3304 * Our tag or one of it's parent or children is ending.
3305 */
3306 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003307 if (htmlParseEndTag(ctxt) &&
3308 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3309 if (currentNode != NULL)
3310 xmlFree(currentNode);
3311 return;
3312 }
3313 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003314 }
3315
3316 /*
3317 * Has this node been popped out during parsing of
3318 * the next element
3319 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003320 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3321 (!xmlStrEqual(currentNode, ctxt->name)))
3322 {
Owen Taylor3473f882001-02-23 17:55:21 +00003323 if (currentNode != NULL) xmlFree(currentNode);
3324 return;
3325 }
3326
Daniel Veillardf9533d12001-03-03 10:04:57 +00003327 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3328 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003329 /*
3330 * Handle SCRIPT/STYLE separately
3331 */
3332 htmlParseScript(ctxt);
3333 } else {
3334 /*
3335 * Sometimes DOCTYPE arrives in the middle of the document
3336 */
3337 if ((CUR == '<') && (NXT(1) == '!') &&
3338 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3339 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3340 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3341 (UPP(8) == 'E')) {
3342 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3343 ctxt->sax->error(ctxt->userData,
3344 "Misplaced DOCTYPE declaration\n");
3345 ctxt->wellFormed = 0;
3346 htmlParseDocTypeDecl(ctxt);
3347 }
3348
3349 /*
3350 * First case : a comment
3351 */
3352 if ((CUR == '<') && (NXT(1) == '!') &&
3353 (NXT(2) == '-') && (NXT(3) == '-')) {
3354 htmlParseComment(ctxt);
3355 }
3356
3357 /*
3358 * Second case : a sub-element.
3359 */
3360 else if (CUR == '<') {
3361 htmlParseElement(ctxt);
3362 }
3363
3364 /*
3365 * Third case : a reference. If if has not been resolved,
3366 * parsing returns it's Name, create the node
3367 */
3368 else if (CUR == '&') {
3369 htmlParseReference(ctxt);
3370 }
3371
3372 /*
3373 * Fourth : end of the resource
3374 */
3375 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003376 htmlAutoCloseOnEnd(ctxt);
3377 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003378 }
3379
3380 /*
3381 * Last case, text. Note that References are handled directly.
3382 */
3383 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003384 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003385 }
3386
3387 if (cons == ctxt->nbChars) {
3388 if (ctxt->node != NULL) {
3389 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3390 ctxt->sax->error(ctxt->userData,
3391 "detected an error in element content\n");
3392 ctxt->wellFormed = 0;
3393 }
3394 break;
3395 }
3396 }
3397 GROW;
3398 }
3399 if (currentNode != NULL) xmlFree(currentNode);
3400}
3401
3402/**
3403 * htmlParseElement:
3404 * @ctxt: an HTML parser context
3405 *
3406 * parse an HTML element, this is highly recursive
3407 *
3408 * [39] element ::= EmptyElemTag | STag content ETag
3409 *
3410 * [41] Attribute ::= Name Eq AttValue
3411 */
3412
3413void
3414htmlParseElement(htmlParserCtxtPtr ctxt) {
3415 xmlChar *name;
3416 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003417 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003418 htmlParserNodeInfo node_info;
3419 xmlChar *oldname;
3420 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003421 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003422
3423 /* Capture start position */
3424 if (ctxt->record_info) {
3425 node_info.begin_pos = ctxt->input->consumed +
3426 (CUR_PTR - ctxt->input->base);
3427 node_info.begin_line = ctxt->input->line;
3428 }
3429
3430 oldname = xmlStrdup(ctxt->name);
3431 htmlParseStartTag(ctxt);
3432 name = ctxt->name;
3433#ifdef DEBUG
3434 if (oldname == NULL)
3435 xmlGenericError(xmlGenericErrorContext,
3436 "Start of element %s\n", name);
3437 else if (name == NULL)
3438 xmlGenericError(xmlGenericErrorContext,
3439 "Start of element failed, was %s\n", oldname);
3440 else
3441 xmlGenericError(xmlGenericErrorContext,
3442 "Start of element %s, was %s\n", name, oldname);
3443#endif
3444 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3445 (name == NULL)) {
3446 if (CUR == '>')
3447 NEXT;
3448 if (oldname != NULL)
3449 xmlFree(oldname);
3450 return;
3451 }
3452 if (oldname != NULL)
3453 xmlFree(oldname);
3454
3455 /*
3456 * Lookup the info for that element.
3457 */
3458 info = htmlTagLookup(name);
3459 if (info == NULL) {
3460 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3461 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3462 name);
3463 ctxt->wellFormed = 0;
3464 } else if (info->depr) {
3465/***************************
3466 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3467 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3468 name);
3469 ***************************/
3470 }
3471
3472 /*
3473 * Check for an Empty Element labelled the XML/SGML way
3474 */
3475 if ((CUR == '/') && (NXT(1) == '>')) {
3476 SKIP(2);
3477 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3478 ctxt->sax->endElement(ctxt->userData, name);
3479 oldname = htmlnamePop(ctxt);
3480#ifdef DEBUG
3481 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3482#endif
3483 if (oldname != NULL)
3484 xmlFree(oldname);
3485 return;
3486 }
3487
3488 if (CUR == '>') {
3489 NEXT;
3490 } else {
3491 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3492 ctxt->sax->error(ctxt->userData,
3493 "Couldn't find end of Start Tag %s\n",
3494 name);
3495 ctxt->wellFormed = 0;
3496
3497 /*
3498 * end of parsing of this node.
3499 */
3500 if (xmlStrEqual(name, ctxt->name)) {
3501 nodePop(ctxt);
3502 oldname = htmlnamePop(ctxt);
3503#ifdef DEBUG
3504 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3505#endif
3506 if (oldname != NULL)
3507 xmlFree(oldname);
3508 }
3509
3510 /*
3511 * Capture end position and add node
3512 */
3513 if ( currentNode != NULL && ctxt->record_info ) {
3514 node_info.end_pos = ctxt->input->consumed +
3515 (CUR_PTR - ctxt->input->base);
3516 node_info.end_line = ctxt->input->line;
3517 node_info.node = ctxt->node;
3518 xmlParserAddNodeInfo(ctxt, &node_info);
3519 }
3520 return;
3521 }
3522
3523 /*
3524 * Check for an Empty Element from DTD definition
3525 */
3526 if ((info != NULL) && (info->empty)) {
3527 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3528 ctxt->sax->endElement(ctxt->userData, name);
3529 oldname = htmlnamePop(ctxt);
3530#ifdef DEBUG
3531 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3532#endif
3533 if (oldname != NULL)
3534 xmlFree(oldname);
3535 return;
3536 }
3537
3538 /*
3539 * Parse the content of the element:
3540 */
3541 currentNode = xmlStrdup(ctxt->name);
3542 depth = ctxt->nameNr;
3543 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003544 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003545 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003546 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003547 if (ctxt->nameNr < depth) break;
3548 }
3549
Owen Taylor3473f882001-02-23 17:55:21 +00003550 /*
3551 * Capture end position and add node
3552 */
3553 if ( currentNode != NULL && ctxt->record_info ) {
3554 node_info.end_pos = ctxt->input->consumed +
3555 (CUR_PTR - ctxt->input->base);
3556 node_info.end_line = ctxt->input->line;
3557 node_info.node = ctxt->node;
3558 xmlParserAddNodeInfo(ctxt, &node_info);
3559 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003560 if (!IS_CHAR(CUR)) {
3561 htmlAutoCloseOnEnd(ctxt);
3562 }
3563
Owen Taylor3473f882001-02-23 17:55:21 +00003564 if (currentNode != NULL)
3565 xmlFree(currentNode);
3566}
3567
3568/**
3569 * htmlParseDocument :
3570 * @ctxt: an HTML parser context
3571 *
3572 * parse an HTML document (and build a tree if using the standard SAX
3573 * interface).
3574 *
3575 * Returns 0, -1 in case of error. the parser context is augmented
3576 * as a result of the parsing.
3577 */
3578
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003579static int
Owen Taylor3473f882001-02-23 17:55:21 +00003580htmlParseDocument(htmlParserCtxtPtr ctxt) {
3581 xmlDtdPtr dtd;
3582
Daniel Veillardd0463562001-10-13 09:15:48 +00003583 xmlInitParser();
3584
Owen Taylor3473f882001-02-23 17:55:21 +00003585 htmlDefaultSAXHandlerInit();
3586 ctxt->html = 1;
3587
3588 GROW;
3589 /*
3590 * SAX: beginning of the document processing.
3591 */
3592 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3593 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3594
3595 /*
3596 * Wipe out everything which is before the first '<'
3597 */
3598 SKIP_BLANKS;
3599 if (CUR == 0) {
3600 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3601 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3602 ctxt->wellFormed = 0;
3603 }
3604
3605 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3606 ctxt->sax->startDocument(ctxt->userData);
3607
3608
3609 /*
3610 * Parse possible comments before any content
3611 */
3612 while ((CUR == '<') && (NXT(1) == '!') &&
3613 (NXT(2) == '-') && (NXT(3) == '-')) {
3614 htmlParseComment(ctxt);
3615 SKIP_BLANKS;
3616 }
3617
3618
3619 /*
3620 * Then possibly doc type declaration(s) and more Misc
3621 * (doctypedecl Misc*)?
3622 */
3623 if ((CUR == '<') && (NXT(1) == '!') &&
3624 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3625 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3626 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3627 (UPP(8) == 'E')) {
3628 htmlParseDocTypeDecl(ctxt);
3629 }
3630 SKIP_BLANKS;
3631
3632 /*
3633 * Parse possible comments before any content
3634 */
3635 while ((CUR == '<') && (NXT(1) == '!') &&
3636 (NXT(2) == '-') && (NXT(3) == '-')) {
3637 htmlParseComment(ctxt);
3638 SKIP_BLANKS;
3639 }
3640
3641 /*
3642 * Time to start parsing the tree itself
3643 */
3644 htmlParseContent(ctxt);
3645
3646 /*
3647 * autoclose
3648 */
3649 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003650 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003651
3652
3653 /*
3654 * SAX: end of the document processing.
3655 */
3656 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3657 ctxt->sax->endDocument(ctxt->userData);
3658
3659 if (ctxt->myDoc != NULL) {
3660 dtd = xmlGetIntSubset(ctxt->myDoc);
3661 if (dtd == NULL)
3662 ctxt->myDoc->intSubset =
3663 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3664 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3665 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3666 }
3667 if (! ctxt->wellFormed) return(-1);
3668 return(0);
3669}
3670
3671
3672/************************************************************************
3673 * *
3674 * Parser contexts handling *
3675 * *
3676 ************************************************************************/
3677
3678/**
3679 * xmlInitParserCtxt:
3680 * @ctxt: an HTML parser context
3681 *
3682 * Initialize a parser context
3683 */
3684
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003685static void
Owen Taylor3473f882001-02-23 17:55:21 +00003686htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3687{
3688 htmlSAXHandler *sax;
3689
3690 if (ctxt == NULL) return;
3691 memset(ctxt, 0, sizeof(htmlParserCtxt));
3692
3693 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3694 if (sax == NULL) {
3695 xmlGenericError(xmlGenericErrorContext,
3696 "htmlInitParserCtxt: out of memory\n");
3697 }
3698 else
3699 memset(sax, 0, sizeof(htmlSAXHandler));
3700
3701 /* Allocate the Input stack */
3702 ctxt->inputTab = (htmlParserInputPtr *)
3703 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3704 if (ctxt->inputTab == NULL) {
3705 xmlGenericError(xmlGenericErrorContext,
3706 "htmlInitParserCtxt: out of memory\n");
3707 ctxt->inputNr = 0;
3708 ctxt->inputMax = 0;
3709 ctxt->input = NULL;
3710 return;
3711 }
3712 ctxt->inputNr = 0;
3713 ctxt->inputMax = 5;
3714 ctxt->input = NULL;
3715 ctxt->version = NULL;
3716 ctxt->encoding = NULL;
3717 ctxt->standalone = -1;
3718 ctxt->instate = XML_PARSER_START;
3719
3720 /* Allocate the Node stack */
3721 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3722 if (ctxt->nodeTab == NULL) {
3723 xmlGenericError(xmlGenericErrorContext,
3724 "htmlInitParserCtxt: out of memory\n");
3725 ctxt->nodeNr = 0;
3726 ctxt->nodeMax = 0;
3727 ctxt->node = NULL;
3728 ctxt->inputNr = 0;
3729 ctxt->inputMax = 0;
3730 ctxt->input = NULL;
3731 return;
3732 }
3733 ctxt->nodeNr = 0;
3734 ctxt->nodeMax = 10;
3735 ctxt->node = NULL;
3736
3737 /* Allocate the Name stack */
3738 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3739 if (ctxt->nameTab == NULL) {
3740 xmlGenericError(xmlGenericErrorContext,
3741 "htmlInitParserCtxt: out of memory\n");
3742 ctxt->nameNr = 0;
3743 ctxt->nameMax = 10;
3744 ctxt->name = NULL;
3745 ctxt->nodeNr = 0;
3746 ctxt->nodeMax = 0;
3747 ctxt->node = NULL;
3748 ctxt->inputNr = 0;
3749 ctxt->inputMax = 0;
3750 ctxt->input = NULL;
3751 return;
3752 }
3753 ctxt->nameNr = 0;
3754 ctxt->nameMax = 10;
3755 ctxt->name = NULL;
3756
3757 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3758 else {
3759 ctxt->sax = sax;
3760 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3761 }
3762 ctxt->userData = ctxt;
3763 ctxt->myDoc = NULL;
3764 ctxt->wellFormed = 1;
3765 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003766 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003767 ctxt->html = 1;
3768 ctxt->record_info = 0;
3769 ctxt->validate = 0;
3770 ctxt->nbChars = 0;
3771 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003772 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003773 xmlInitNodeInfoSeq(&ctxt->node_seq);
3774}
3775
3776/**
3777 * htmlFreeParserCtxt:
3778 * @ctxt: an HTML parser context
3779 *
3780 * Free all the memory used by a parser context. However the parsed
3781 * document in ctxt->myDoc is not freed.
3782 */
3783
3784void
3785htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3786{
3787 xmlFreeParserCtxt(ctxt);
3788}
3789
3790/**
3791 * htmlCreateDocParserCtxt :
3792 * @cur: a pointer to an array of xmlChar
3793 * @encoding: a free form C string describing the HTML document encoding, or NULL
3794 *
3795 * Create a parser context for an HTML document.
3796 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003797 * TODO: check the need to add encoding handling there
3798 *
Owen Taylor3473f882001-02-23 17:55:21 +00003799 * Returns the new parser context or NULL
3800 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003801static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003802htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003803 htmlParserCtxtPtr ctxt;
3804 htmlParserInputPtr input;
3805 /* htmlCharEncoding enc; */
3806
3807 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3808 if (ctxt == NULL) {
3809 perror("malloc");
3810 return(NULL);
3811 }
3812 htmlInitParserCtxt(ctxt);
3813 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3814 if (input == NULL) {
3815 perror("malloc");
3816 xmlFree(ctxt);
3817 return(NULL);
3818 }
3819 memset(input, 0, sizeof(htmlParserInput));
3820
3821 input->line = 1;
3822 input->col = 1;
3823 input->base = cur;
3824 input->cur = cur;
3825
3826 inputPush(ctxt, input);
3827 return(ctxt);
3828}
3829
3830/************************************************************************
3831 * *
3832 * Progressive parsing interfaces *
3833 * *
3834 ************************************************************************/
3835
3836/**
3837 * htmlParseLookupSequence:
3838 * @ctxt: an HTML parser context
3839 * @first: the first char to lookup
3840 * @next: the next char to lookup or zero
3841 * @third: the next char to lookup or zero
3842 *
3843 * Try to find if a sequence (first, next, third) or just (first next) or
3844 * (first) is available in the input stream.
3845 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3846 * to avoid rescanning sequences of bytes, it DOES change the state of the
3847 * parser, do not use liberally.
3848 * This is basically similar to xmlParseLookupSequence()
3849 *
3850 * Returns the index to the current parsing point if the full sequence
3851 * is available, -1 otherwise.
3852 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003853static int
Owen Taylor3473f882001-02-23 17:55:21 +00003854htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3855 xmlChar next, xmlChar third) {
3856 int base, len;
3857 htmlParserInputPtr in;
3858 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003859 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003860
3861 in = ctxt->input;
3862 if (in == NULL) return(-1);
3863 base = in->cur - in->base;
3864 if (base < 0) return(-1);
3865 if (ctxt->checkIndex > base)
3866 base = ctxt->checkIndex;
3867 if (in->buf == NULL) {
3868 buf = in->base;
3869 len = in->length;
3870 } else {
3871 buf = in->buf->buffer->content;
3872 len = in->buf->buffer->use;
3873 }
3874 /* take into account the sequence length */
3875 if (third) len -= 2;
3876 else if (next) len --;
3877 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003878 if (!incomment && (base + 4 < len)) {
3879 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3880 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3881 incomment = 1;
3882 }
3883 /* do not increment base, some people use <!--> */
3884 }
3885 if (incomment) {
3886 if (base + 3 < len)
3887 return(-1);
3888 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3889 (buf[base + 2] == '>')) {
3890 incomment = 0;
3891 base += 2;
3892 }
3893 continue;
3894 }
Owen Taylor3473f882001-02-23 17:55:21 +00003895 if (buf[base] == first) {
3896 if (third != 0) {
3897 if ((buf[base + 1] != next) ||
3898 (buf[base + 2] != third)) continue;
3899 } else if (next != 0) {
3900 if (buf[base + 1] != next) continue;
3901 }
3902 ctxt->checkIndex = 0;
3903#ifdef DEBUG_PUSH
3904 if (next == 0)
3905 xmlGenericError(xmlGenericErrorContext,
3906 "HPP: lookup '%c' found at %d\n",
3907 first, base);
3908 else if (third == 0)
3909 xmlGenericError(xmlGenericErrorContext,
3910 "HPP: lookup '%c%c' found at %d\n",
3911 first, next, base);
3912 else
3913 xmlGenericError(xmlGenericErrorContext,
3914 "HPP: lookup '%c%c%c' found at %d\n",
3915 first, next, third, base);
3916#endif
3917 return(base - (in->cur - in->base));
3918 }
3919 }
3920 ctxt->checkIndex = base;
3921#ifdef DEBUG_PUSH
3922 if (next == 0)
3923 xmlGenericError(xmlGenericErrorContext,
3924 "HPP: lookup '%c' failed\n", first);
3925 else if (third == 0)
3926 xmlGenericError(xmlGenericErrorContext,
3927 "HPP: lookup '%c%c' failed\n", first, next);
3928 else
3929 xmlGenericError(xmlGenericErrorContext,
3930 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3931#endif
3932 return(-1);
3933}
3934
3935/**
3936 * htmlParseTryOrFinish:
3937 * @ctxt: an HTML parser context
3938 * @terminate: last chunk indicator
3939 *
3940 * Try to progress on parsing
3941 *
3942 * Returns zero if no parsing was possible
3943 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003944static int
Owen Taylor3473f882001-02-23 17:55:21 +00003945htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3946 int ret = 0;
3947 htmlParserInputPtr in;
3948 int avail = 0;
3949 xmlChar cur, next;
3950
3951#ifdef DEBUG_PUSH
3952 switch (ctxt->instate) {
3953 case XML_PARSER_EOF:
3954 xmlGenericError(xmlGenericErrorContext,
3955 "HPP: try EOF\n"); break;
3956 case XML_PARSER_START:
3957 xmlGenericError(xmlGenericErrorContext,
3958 "HPP: try START\n"); break;
3959 case XML_PARSER_MISC:
3960 xmlGenericError(xmlGenericErrorContext,
3961 "HPP: try MISC\n");break;
3962 case XML_PARSER_COMMENT:
3963 xmlGenericError(xmlGenericErrorContext,
3964 "HPP: try COMMENT\n");break;
3965 case XML_PARSER_PROLOG:
3966 xmlGenericError(xmlGenericErrorContext,
3967 "HPP: try PROLOG\n");break;
3968 case XML_PARSER_START_TAG:
3969 xmlGenericError(xmlGenericErrorContext,
3970 "HPP: try START_TAG\n");break;
3971 case XML_PARSER_CONTENT:
3972 xmlGenericError(xmlGenericErrorContext,
3973 "HPP: try CONTENT\n");break;
3974 case XML_PARSER_CDATA_SECTION:
3975 xmlGenericError(xmlGenericErrorContext,
3976 "HPP: try CDATA_SECTION\n");break;
3977 case XML_PARSER_END_TAG:
3978 xmlGenericError(xmlGenericErrorContext,
3979 "HPP: try END_TAG\n");break;
3980 case XML_PARSER_ENTITY_DECL:
3981 xmlGenericError(xmlGenericErrorContext,
3982 "HPP: try ENTITY_DECL\n");break;
3983 case XML_PARSER_ENTITY_VALUE:
3984 xmlGenericError(xmlGenericErrorContext,
3985 "HPP: try ENTITY_VALUE\n");break;
3986 case XML_PARSER_ATTRIBUTE_VALUE:
3987 xmlGenericError(xmlGenericErrorContext,
3988 "HPP: try ATTRIBUTE_VALUE\n");break;
3989 case XML_PARSER_DTD:
3990 xmlGenericError(xmlGenericErrorContext,
3991 "HPP: try DTD\n");break;
3992 case XML_PARSER_EPILOG:
3993 xmlGenericError(xmlGenericErrorContext,
3994 "HPP: try EPILOG\n");break;
3995 case XML_PARSER_PI:
3996 xmlGenericError(xmlGenericErrorContext,
3997 "HPP: try PI\n");break;
3998 case XML_PARSER_SYSTEM_LITERAL:
3999 xmlGenericError(xmlGenericErrorContext,
4000 "HPP: try SYSTEM_LITERAL\n");break;
4001 }
4002#endif
4003
4004 while (1) {
4005
4006 in = ctxt->input;
4007 if (in == NULL) break;
4008 if (in->buf == NULL)
4009 avail = in->length - (in->cur - in->base);
4010 else
4011 avail = in->buf->buffer->use - (in->cur - in->base);
4012 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004013 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004014 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4015 /*
4016 * SAX: end of the document processing.
4017 */
4018 ctxt->instate = XML_PARSER_EOF;
4019 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4020 ctxt->sax->endDocument(ctxt->userData);
4021 }
4022 }
4023 if (avail < 1)
4024 goto done;
4025 switch (ctxt->instate) {
4026 case XML_PARSER_EOF:
4027 /*
4028 * Document parsing is done !
4029 */
4030 goto done;
4031 case XML_PARSER_START:
4032 /*
4033 * Very first chars read from the document flow.
4034 */
4035 cur = in->cur[0];
4036 if (IS_BLANK(cur)) {
4037 SKIP_BLANKS;
4038 if (in->buf == NULL)
4039 avail = in->length - (in->cur - in->base);
4040 else
4041 avail = in->buf->buffer->use - (in->cur - in->base);
4042 }
4043 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4044 ctxt->sax->setDocumentLocator(ctxt->userData,
4045 &xmlDefaultSAXLocator);
4046 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4047 (!ctxt->disableSAX))
4048 ctxt->sax->startDocument(ctxt->userData);
4049
4050 cur = in->cur[0];
4051 next = in->cur[1];
4052 if ((cur == '<') && (next == '!') &&
4053 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4054 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4055 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4056 (UPP(8) == 'E')) {
4057 if ((!terminate) &&
4058 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4059 goto done;
4060#ifdef DEBUG_PUSH
4061 xmlGenericError(xmlGenericErrorContext,
4062 "HPP: Parsing internal subset\n");
4063#endif
4064 htmlParseDocTypeDecl(ctxt);
4065 ctxt->instate = XML_PARSER_PROLOG;
4066#ifdef DEBUG_PUSH
4067 xmlGenericError(xmlGenericErrorContext,
4068 "HPP: entering PROLOG\n");
4069#endif
4070 } else {
4071 ctxt->instate = XML_PARSER_MISC;
4072 }
4073#ifdef DEBUG_PUSH
4074 xmlGenericError(xmlGenericErrorContext,
4075 "HPP: entering MISC\n");
4076#endif
4077 break;
4078 case XML_PARSER_MISC:
4079 SKIP_BLANKS;
4080 if (in->buf == NULL)
4081 avail = in->length - (in->cur - in->base);
4082 else
4083 avail = in->buf->buffer->use - (in->cur - in->base);
4084 if (avail < 2)
4085 goto done;
4086 cur = in->cur[0];
4087 next = in->cur[1];
4088 if ((cur == '<') && (next == '!') &&
4089 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4090 if ((!terminate) &&
4091 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4092 goto done;
4093#ifdef DEBUG_PUSH
4094 xmlGenericError(xmlGenericErrorContext,
4095 "HPP: Parsing Comment\n");
4096#endif
4097 htmlParseComment(ctxt);
4098 ctxt->instate = XML_PARSER_MISC;
4099 } else if ((cur == '<') && (next == '!') &&
4100 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4101 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4102 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4103 (UPP(8) == 'E')) {
4104 if ((!terminate) &&
4105 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4106 goto done;
4107#ifdef DEBUG_PUSH
4108 xmlGenericError(xmlGenericErrorContext,
4109 "HPP: Parsing internal subset\n");
4110#endif
4111 htmlParseDocTypeDecl(ctxt);
4112 ctxt->instate = XML_PARSER_PROLOG;
4113#ifdef DEBUG_PUSH
4114 xmlGenericError(xmlGenericErrorContext,
4115 "HPP: entering PROLOG\n");
4116#endif
4117 } else if ((cur == '<') && (next == '!') &&
4118 (avail < 9)) {
4119 goto done;
4120 } else {
4121 ctxt->instate = XML_PARSER_START_TAG;
4122#ifdef DEBUG_PUSH
4123 xmlGenericError(xmlGenericErrorContext,
4124 "HPP: entering START_TAG\n");
4125#endif
4126 }
4127 break;
4128 case XML_PARSER_PROLOG:
4129 SKIP_BLANKS;
4130 if (in->buf == NULL)
4131 avail = in->length - (in->cur - in->base);
4132 else
4133 avail = in->buf->buffer->use - (in->cur - in->base);
4134 if (avail < 2)
4135 goto done;
4136 cur = in->cur[0];
4137 next = in->cur[1];
4138 if ((cur == '<') && (next == '!') &&
4139 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4140 if ((!terminate) &&
4141 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4142 goto done;
4143#ifdef DEBUG_PUSH
4144 xmlGenericError(xmlGenericErrorContext,
4145 "HPP: Parsing Comment\n");
4146#endif
4147 htmlParseComment(ctxt);
4148 ctxt->instate = XML_PARSER_PROLOG;
4149 } else if ((cur == '<') && (next == '!') &&
4150 (avail < 4)) {
4151 goto done;
4152 } else {
4153 ctxt->instate = XML_PARSER_START_TAG;
4154#ifdef DEBUG_PUSH
4155 xmlGenericError(xmlGenericErrorContext,
4156 "HPP: entering START_TAG\n");
4157#endif
4158 }
4159 break;
4160 case XML_PARSER_EPILOG:
4161 if (in->buf == NULL)
4162 avail = in->length - (in->cur - in->base);
4163 else
4164 avail = in->buf->buffer->use - (in->cur - in->base);
4165 if (avail < 1)
4166 goto done;
4167 cur = in->cur[0];
4168 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004169 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004170 goto done;
4171 }
4172 if (avail < 2)
4173 goto done;
4174 next = in->cur[1];
4175 if ((cur == '<') && (next == '!') &&
4176 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4177 if ((!terminate) &&
4178 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4179 goto done;
4180#ifdef DEBUG_PUSH
4181 xmlGenericError(xmlGenericErrorContext,
4182 "HPP: Parsing Comment\n");
4183#endif
4184 htmlParseComment(ctxt);
4185 ctxt->instate = XML_PARSER_EPILOG;
4186 } else if ((cur == '<') && (next == '!') &&
4187 (avail < 4)) {
4188 goto done;
4189 } else {
4190 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004191 ctxt->wellFormed = 0;
4192 ctxt->instate = XML_PARSER_EOF;
4193#ifdef DEBUG_PUSH
4194 xmlGenericError(xmlGenericErrorContext,
4195 "HPP: entering EOF\n");
4196#endif
4197 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4198 ctxt->sax->endDocument(ctxt->userData);
4199 goto done;
4200 }
4201 break;
4202 case XML_PARSER_START_TAG: {
4203 xmlChar *name, *oldname;
4204 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004205 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004206
4207 if (avail < 2)
4208 goto done;
4209 cur = in->cur[0];
4210 if (cur != '<') {
4211 ctxt->instate = XML_PARSER_CONTENT;
4212#ifdef DEBUG_PUSH
4213 xmlGenericError(xmlGenericErrorContext,
4214 "HPP: entering CONTENT\n");
4215#endif
4216 break;
4217 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004218 if (in->cur[1] == '/') {
4219 ctxt->instate = XML_PARSER_END_TAG;
4220 ctxt->checkIndex = 0;
4221#ifdef DEBUG_PUSH
4222 xmlGenericError(xmlGenericErrorContext,
4223 "HPP: entering END_TAG\n");
4224#endif
4225 break;
4226 }
Owen Taylor3473f882001-02-23 17:55:21 +00004227 if ((!terminate) &&
4228 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4229 goto done;
4230
4231 oldname = xmlStrdup(ctxt->name);
4232 htmlParseStartTag(ctxt);
4233 name = ctxt->name;
4234#ifdef DEBUG
4235 if (oldname == NULL)
4236 xmlGenericError(xmlGenericErrorContext,
4237 "Start of element %s\n", name);
4238 else if (name == NULL)
4239 xmlGenericError(xmlGenericErrorContext,
4240 "Start of element failed, was %s\n",
4241 oldname);
4242 else
4243 xmlGenericError(xmlGenericErrorContext,
4244 "Start of element %s, was %s\n",
4245 name, oldname);
4246#endif
4247 if (((depth == ctxt->nameNr) &&
4248 (xmlStrEqual(oldname, ctxt->name))) ||
4249 (name == NULL)) {
4250 if (CUR == '>')
4251 NEXT;
4252 if (oldname != NULL)
4253 xmlFree(oldname);
4254 break;
4255 }
4256 if (oldname != NULL)
4257 xmlFree(oldname);
4258
4259 /*
4260 * Lookup the info for that element.
4261 */
4262 info = htmlTagLookup(name);
4263 if (info == NULL) {
4264 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4265 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4266 name);
4267 ctxt->wellFormed = 0;
4268 } else if (info->depr) {
4269 /***************************
4270 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4271 ctxt->sax->warning(ctxt->userData,
4272 "Tag %s is deprecated\n",
4273 name);
4274 ***************************/
4275 }
4276
4277 /*
4278 * Check for an Empty Element labelled the XML/SGML way
4279 */
4280 if ((CUR == '/') && (NXT(1) == '>')) {
4281 SKIP(2);
4282 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4283 ctxt->sax->endElement(ctxt->userData, name);
4284 oldname = htmlnamePop(ctxt);
4285#ifdef DEBUG
4286 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4287 oldname);
4288#endif
4289 if (oldname != NULL)
4290 xmlFree(oldname);
4291 ctxt->instate = XML_PARSER_CONTENT;
4292#ifdef DEBUG_PUSH
4293 xmlGenericError(xmlGenericErrorContext,
4294 "HPP: entering CONTENT\n");
4295#endif
4296 break;
4297 }
4298
4299 if (CUR == '>') {
4300 NEXT;
4301 } else {
4302 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4303 ctxt->sax->error(ctxt->userData,
4304 "Couldn't find end of Start Tag %s\n",
4305 name);
4306 ctxt->wellFormed = 0;
4307
4308 /*
4309 * end of parsing of this node.
4310 */
4311 if (xmlStrEqual(name, ctxt->name)) {
4312 nodePop(ctxt);
4313 oldname = htmlnamePop(ctxt);
4314#ifdef DEBUG
4315 xmlGenericError(xmlGenericErrorContext,
4316 "End of start tag problem: popping out %s\n", oldname);
4317#endif
4318 if (oldname != NULL)
4319 xmlFree(oldname);
4320 }
4321
4322 ctxt->instate = XML_PARSER_CONTENT;
4323#ifdef DEBUG_PUSH
4324 xmlGenericError(xmlGenericErrorContext,
4325 "HPP: entering CONTENT\n");
4326#endif
4327 break;
4328 }
4329
4330 /*
4331 * Check for an Empty Element from DTD definition
4332 */
4333 if ((info != NULL) && (info->empty)) {
4334 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4335 ctxt->sax->endElement(ctxt->userData, name);
4336 oldname = htmlnamePop(ctxt);
4337#ifdef DEBUG
4338 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4339#endif
4340 if (oldname != NULL)
4341 xmlFree(oldname);
4342 }
4343 ctxt->instate = XML_PARSER_CONTENT;
4344#ifdef DEBUG_PUSH
4345 xmlGenericError(xmlGenericErrorContext,
4346 "HPP: entering CONTENT\n");
4347#endif
4348 break;
4349 }
4350 case XML_PARSER_CONTENT: {
4351 long cons;
4352 /*
4353 * Handle preparsed entities and charRef
4354 */
4355 if (ctxt->token != 0) {
4356 xmlChar chr[2] = { 0 , 0 } ;
4357
4358 chr[0] = (xmlChar) ctxt->token;
4359 htmlCheckParagraph(ctxt);
4360 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4361 ctxt->sax->characters(ctxt->userData, chr, 1);
4362 ctxt->token = 0;
4363 ctxt->checkIndex = 0;
4364 }
4365 if ((avail == 1) && (terminate)) {
4366 cur = in->cur[0];
4367 if ((cur != '<') && (cur != '&')) {
4368 if (ctxt->sax != NULL) {
4369 if (IS_BLANK(cur)) {
4370 if (ctxt->sax->ignorableWhitespace != NULL)
4371 ctxt->sax->ignorableWhitespace(
4372 ctxt->userData, &cur, 1);
4373 } else {
4374 htmlCheckParagraph(ctxt);
4375 if (ctxt->sax->characters != NULL)
4376 ctxt->sax->characters(
4377 ctxt->userData, &cur, 1);
4378 }
4379 }
4380 ctxt->token = 0;
4381 ctxt->checkIndex = 0;
4382 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004383 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004384 }
Owen Taylor3473f882001-02-23 17:55:21 +00004385 }
4386 if (avail < 2)
4387 goto done;
4388 cur = in->cur[0];
4389 next = in->cur[1];
4390 cons = ctxt->nbChars;
4391 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4392 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4393 /*
4394 * Handle SCRIPT/STYLE separately
4395 */
4396 if ((!terminate) &&
4397 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4398 goto done;
4399 htmlParseScript(ctxt);
4400 if ((cur == '<') && (next == '/')) {
4401 ctxt->instate = XML_PARSER_END_TAG;
4402 ctxt->checkIndex = 0;
4403#ifdef DEBUG_PUSH
4404 xmlGenericError(xmlGenericErrorContext,
4405 "HPP: entering END_TAG\n");
4406#endif
4407 break;
4408 }
4409 } else {
4410 /*
4411 * Sometimes DOCTYPE arrives in the middle of the document
4412 */
4413 if ((cur == '<') && (next == '!') &&
4414 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4415 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4416 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4417 (UPP(8) == 'E')) {
4418 if ((!terminate) &&
4419 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4420 goto done;
4421 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4422 ctxt->sax->error(ctxt->userData,
4423 "Misplaced DOCTYPE declaration\n");
4424 ctxt->wellFormed = 0;
4425 htmlParseDocTypeDecl(ctxt);
4426 } else if ((cur == '<') && (next == '!') &&
4427 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4428 if ((!terminate) &&
4429 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4430 goto done;
4431#ifdef DEBUG_PUSH
4432 xmlGenericError(xmlGenericErrorContext,
4433 "HPP: Parsing Comment\n");
4434#endif
4435 htmlParseComment(ctxt);
4436 ctxt->instate = XML_PARSER_CONTENT;
4437 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4438 goto done;
4439 } else if ((cur == '<') && (next == '/')) {
4440 ctxt->instate = XML_PARSER_END_TAG;
4441 ctxt->checkIndex = 0;
4442#ifdef DEBUG_PUSH
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: entering END_TAG\n");
4445#endif
4446 break;
4447 } else if (cur == '<') {
4448 ctxt->instate = XML_PARSER_START_TAG;
4449 ctxt->checkIndex = 0;
4450#ifdef DEBUG_PUSH
4451 xmlGenericError(xmlGenericErrorContext,
4452 "HPP: entering START_TAG\n");
4453#endif
4454 break;
4455 } else if (cur == '&') {
4456 if ((!terminate) &&
4457 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4458 goto done;
4459#ifdef DEBUG_PUSH
4460 xmlGenericError(xmlGenericErrorContext,
4461 "HPP: Parsing Reference\n");
4462#endif
4463 /* TODO: check generation of subtrees if noent !!! */
4464 htmlParseReference(ctxt);
4465 } else {
4466 /* TODO Avoid the extra copy, handle directly !!!!!! */
4467 /*
4468 * Goal of the following test is :
4469 * - minimize calls to the SAX 'character' callback
4470 * when they are mergeable
4471 */
4472 if ((ctxt->inputNr == 1) &&
4473 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4474 if ((!terminate) &&
4475 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4476 goto done;
4477 }
4478 ctxt->checkIndex = 0;
4479#ifdef DEBUG_PUSH
4480 xmlGenericError(xmlGenericErrorContext,
4481 "HPP: Parsing char data\n");
4482#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004483 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004484 }
4485 }
4486 if (cons == ctxt->nbChars) {
4487 if (ctxt->node != NULL) {
4488 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4489 ctxt->sax->error(ctxt->userData,
4490 "detected an error in element content\n");
4491 ctxt->wellFormed = 0;
4492 }
4493 NEXT;
4494 break;
4495 }
4496
4497 break;
4498 }
4499 case XML_PARSER_END_TAG:
4500 if (avail < 2)
4501 goto done;
4502 if ((!terminate) &&
4503 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4504 goto done;
4505 htmlParseEndTag(ctxt);
4506 if (ctxt->nameNr == 0) {
4507 ctxt->instate = XML_PARSER_EPILOG;
4508 } else {
4509 ctxt->instate = XML_PARSER_CONTENT;
4510 }
4511 ctxt->checkIndex = 0;
4512#ifdef DEBUG_PUSH
4513 xmlGenericError(xmlGenericErrorContext,
4514 "HPP: entering CONTENT\n");
4515#endif
4516 break;
4517 case XML_PARSER_CDATA_SECTION:
4518 xmlGenericError(xmlGenericErrorContext,
4519 "HPP: internal error, state == CDATA\n");
4520 ctxt->instate = XML_PARSER_CONTENT;
4521 ctxt->checkIndex = 0;
4522#ifdef DEBUG_PUSH
4523 xmlGenericError(xmlGenericErrorContext,
4524 "HPP: entering CONTENT\n");
4525#endif
4526 break;
4527 case XML_PARSER_DTD:
4528 xmlGenericError(xmlGenericErrorContext,
4529 "HPP: internal error, state == DTD\n");
4530 ctxt->instate = XML_PARSER_CONTENT;
4531 ctxt->checkIndex = 0;
4532#ifdef DEBUG_PUSH
4533 xmlGenericError(xmlGenericErrorContext,
4534 "HPP: entering CONTENT\n");
4535#endif
4536 break;
4537 case XML_PARSER_COMMENT:
4538 xmlGenericError(xmlGenericErrorContext,
4539 "HPP: internal error, state == COMMENT\n");
4540 ctxt->instate = XML_PARSER_CONTENT;
4541 ctxt->checkIndex = 0;
4542#ifdef DEBUG_PUSH
4543 xmlGenericError(xmlGenericErrorContext,
4544 "HPP: entering CONTENT\n");
4545#endif
4546 break;
4547 case XML_PARSER_PI:
4548 xmlGenericError(xmlGenericErrorContext,
4549 "HPP: internal error, state == PI\n");
4550 ctxt->instate = XML_PARSER_CONTENT;
4551 ctxt->checkIndex = 0;
4552#ifdef DEBUG_PUSH
4553 xmlGenericError(xmlGenericErrorContext,
4554 "HPP: entering CONTENT\n");
4555#endif
4556 break;
4557 case XML_PARSER_ENTITY_DECL:
4558 xmlGenericError(xmlGenericErrorContext,
4559 "HPP: internal error, state == ENTITY_DECL\n");
4560 ctxt->instate = XML_PARSER_CONTENT;
4561 ctxt->checkIndex = 0;
4562#ifdef DEBUG_PUSH
4563 xmlGenericError(xmlGenericErrorContext,
4564 "HPP: entering CONTENT\n");
4565#endif
4566 break;
4567 case XML_PARSER_ENTITY_VALUE:
4568 xmlGenericError(xmlGenericErrorContext,
4569 "HPP: internal error, state == ENTITY_VALUE\n");
4570 ctxt->instate = XML_PARSER_CONTENT;
4571 ctxt->checkIndex = 0;
4572#ifdef DEBUG_PUSH
4573 xmlGenericError(xmlGenericErrorContext,
4574 "HPP: entering DTD\n");
4575#endif
4576 break;
4577 case XML_PARSER_ATTRIBUTE_VALUE:
4578 xmlGenericError(xmlGenericErrorContext,
4579 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4580 ctxt->instate = XML_PARSER_START_TAG;
4581 ctxt->checkIndex = 0;
4582#ifdef DEBUG_PUSH
4583 xmlGenericError(xmlGenericErrorContext,
4584 "HPP: entering START_TAG\n");
4585#endif
4586 break;
4587 case XML_PARSER_SYSTEM_LITERAL:
4588 xmlGenericError(xmlGenericErrorContext,
4589 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4590 ctxt->instate = XML_PARSER_CONTENT;
4591 ctxt->checkIndex = 0;
4592#ifdef DEBUG_PUSH
4593 xmlGenericError(xmlGenericErrorContext,
4594 "HPP: entering CONTENT\n");
4595#endif
4596 break;
4597 case XML_PARSER_IGNORE:
4598 xmlGenericError(xmlGenericErrorContext,
4599 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4600 ctxt->instate = XML_PARSER_CONTENT;
4601 ctxt->checkIndex = 0;
4602#ifdef DEBUG_PUSH
4603 xmlGenericError(xmlGenericErrorContext,
4604 "HPP: entering CONTENT\n");
4605#endif
4606 break;
4607 }
4608 }
4609done:
4610 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004611 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004612 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4613 /*
4614 * SAX: end of the document processing.
4615 */
4616 ctxt->instate = XML_PARSER_EOF;
4617 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4618 ctxt->sax->endDocument(ctxt->userData);
4619 }
4620 }
4621 if ((ctxt->myDoc != NULL) &&
4622 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4623 (ctxt->instate == XML_PARSER_EPILOG))) {
4624 xmlDtdPtr dtd;
4625 dtd = xmlGetIntSubset(ctxt->myDoc);
4626 if (dtd == NULL)
4627 ctxt->myDoc->intSubset =
4628 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4629 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4630 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4631 }
4632#ifdef DEBUG_PUSH
4633 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4634#endif
4635 return(ret);
4636}
4637
4638/**
Owen Taylor3473f882001-02-23 17:55:21 +00004639 * htmlParseChunk:
4640 * @ctxt: an XML parser context
4641 * @chunk: an char array
4642 * @size: the size in byte of the chunk
4643 * @terminate: last chunk indicator
4644 *
4645 * Parse a Chunk of memory
4646 *
4647 * Returns zero if no error, the xmlParserErrors otherwise.
4648 */
4649int
4650htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4651 int terminate) {
4652 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4653 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4654 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4655 int cur = ctxt->input->cur - ctxt->input->base;
4656
4657 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4658 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4659 ctxt->input->cur = ctxt->input->base + cur;
4660#ifdef DEBUG_PUSH
4661 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4662#endif
4663
4664 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4665 htmlParseTryOrFinish(ctxt, terminate);
4666 } else if (ctxt->instate != XML_PARSER_EOF) {
4667 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4668 htmlParseTryOrFinish(ctxt, terminate);
4669 }
4670 if (terminate) {
4671 if ((ctxt->instate != XML_PARSER_EOF) &&
4672 (ctxt->instate != XML_PARSER_EPILOG) &&
4673 (ctxt->instate != XML_PARSER_MISC)) {
4674 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004675 ctxt->wellFormed = 0;
4676 }
4677 if (ctxt->instate != XML_PARSER_EOF) {
4678 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4679 ctxt->sax->endDocument(ctxt->userData);
4680 }
4681 ctxt->instate = XML_PARSER_EOF;
4682 }
4683 return((xmlParserErrors) ctxt->errNo);
4684}
4685
4686/************************************************************************
4687 * *
4688 * User entry points *
4689 * *
4690 ************************************************************************/
4691
4692/**
4693 * htmlCreatePushParserCtxt :
4694 * @sax: a SAX handler
4695 * @user_data: The user data returned on SAX callbacks
4696 * @chunk: a pointer to an array of chars
4697 * @size: number of chars in the array
4698 * @filename: an optional file name or URI
4699 * @enc: an optional encoding
4700 *
4701 * Create a parser context for using the HTML parser in push mode
4702 * To allow content encoding detection, @size should be >= 4
4703 * The value of @filename is used for fetching external entities
4704 * and error/warning reports.
4705 *
4706 * Returns the new parser context or NULL
4707 */
4708htmlParserCtxtPtr
4709htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4710 const char *chunk, int size, const char *filename,
4711 xmlCharEncoding enc) {
4712 htmlParserCtxtPtr ctxt;
4713 htmlParserInputPtr inputStream;
4714 xmlParserInputBufferPtr buf;
4715
Daniel Veillardd0463562001-10-13 09:15:48 +00004716 xmlInitParser();
4717
Owen Taylor3473f882001-02-23 17:55:21 +00004718 buf = xmlAllocParserInputBuffer(enc);
4719 if (buf == NULL) return(NULL);
4720
4721 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4722 if (ctxt == NULL) {
4723 xmlFree(buf);
4724 return(NULL);
4725 }
4726 memset(ctxt, 0, sizeof(htmlParserCtxt));
4727 htmlInitParserCtxt(ctxt);
4728 if (sax != NULL) {
4729 if (ctxt->sax != &htmlDefaultSAXHandler)
4730 xmlFree(ctxt->sax);
4731 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4732 if (ctxt->sax == NULL) {
4733 xmlFree(buf);
4734 xmlFree(ctxt);
4735 return(NULL);
4736 }
4737 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4738 if (user_data != NULL)
4739 ctxt->userData = user_data;
4740 }
4741 if (filename == NULL) {
4742 ctxt->directory = NULL;
4743 } else {
4744 ctxt->directory = xmlParserGetDirectory(filename);
4745 }
4746
4747 inputStream = htmlNewInputStream(ctxt);
4748 if (inputStream == NULL) {
4749 xmlFreeParserCtxt(ctxt);
4750 return(NULL);
4751 }
4752
4753 if (filename == NULL)
4754 inputStream->filename = NULL;
4755 else
4756 inputStream->filename = xmlMemStrdup(filename);
4757 inputStream->buf = buf;
4758 inputStream->base = inputStream->buf->buffer->content;
4759 inputStream->cur = inputStream->buf->buffer->content;
4760
4761 inputPush(ctxt, inputStream);
4762
4763 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4764 (ctxt->input->buf != NULL)) {
4765 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4766#ifdef DEBUG_PUSH
4767 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4768#endif
4769 }
4770
4771 return(ctxt);
4772}
4773
4774/**
4775 * htmlSAXParseDoc :
4776 * @cur: a pointer to an array of xmlChar
4777 * @encoding: a free form C string describing the HTML document encoding, or NULL
4778 * @sax: the SAX handler block
4779 * @userData: if using SAX, this pointer will be provided on callbacks.
4780 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004781 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4782 * to handle parse events. If sax is NULL, fallback to the default DOM
4783 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004784 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004785 * Returns the resulting document tree unless SAX is NULL or the document is
4786 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004787 */
4788
4789htmlDocPtr
4790htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4791 htmlDocPtr ret;
4792 htmlParserCtxtPtr ctxt;
4793
Daniel Veillardd0463562001-10-13 09:15:48 +00004794 xmlInitParser();
4795
Owen Taylor3473f882001-02-23 17:55:21 +00004796 if (cur == NULL) return(NULL);
4797
4798
4799 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4800 if (ctxt == NULL) return(NULL);
4801 if (sax != NULL) {
4802 ctxt->sax = sax;
4803 ctxt->userData = userData;
4804 }
4805
4806 htmlParseDocument(ctxt);
4807 ret = ctxt->myDoc;
4808 if (sax != NULL) {
4809 ctxt->sax = NULL;
4810 ctxt->userData = NULL;
4811 }
4812 htmlFreeParserCtxt(ctxt);
4813
4814 return(ret);
4815}
4816
4817/**
4818 * htmlParseDoc :
4819 * @cur: a pointer to an array of xmlChar
4820 * @encoding: a free form C string describing the HTML document encoding, or NULL
4821 *
4822 * parse an HTML in-memory document and build a tree.
4823 *
4824 * Returns the resulting document tree
4825 */
4826
4827htmlDocPtr
4828htmlParseDoc(xmlChar *cur, const char *encoding) {
4829 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4830}
4831
4832
4833/**
4834 * htmlCreateFileParserCtxt :
4835 * @filename: the filename
4836 * @encoding: a free form C string describing the HTML document encoding, or NULL
4837 *
4838 * Create a parser context for a file content.
4839 * Automatic support for ZLIB/Compress compressed document is provided
4840 * by default if found at compile-time.
4841 *
4842 * Returns the new parser context or NULL
4843 */
4844htmlParserCtxtPtr
4845htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4846{
4847 htmlParserCtxtPtr ctxt;
4848 htmlParserInputPtr inputStream;
4849 xmlParserInputBufferPtr buf;
4850 /* htmlCharEncoding enc; */
4851 xmlChar *content, *content_line = (xmlChar *) "charset=";
4852
4853 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4854 if (buf == NULL) return(NULL);
4855
4856 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4857 if (ctxt == NULL) {
4858 perror("malloc");
4859 return(NULL);
4860 }
4861 memset(ctxt, 0, sizeof(htmlParserCtxt));
4862 htmlInitParserCtxt(ctxt);
4863 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4864 if (inputStream == NULL) {
4865 perror("malloc");
4866 xmlFree(ctxt);
4867 return(NULL);
4868 }
4869 memset(inputStream, 0, sizeof(htmlParserInput));
4870
4871 inputStream->filename = xmlMemStrdup(filename);
4872 inputStream->line = 1;
4873 inputStream->col = 1;
4874 inputStream->buf = buf;
4875 inputStream->directory = NULL;
4876
4877 inputStream->base = inputStream->buf->buffer->content;
4878 inputStream->cur = inputStream->buf->buffer->content;
4879 inputStream->free = NULL;
4880
4881 inputPush(ctxt, inputStream);
4882
4883 /* set encoding */
4884 if (encoding) {
4885 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4886 if (content) {
4887 strcpy ((char *)content, (char *)content_line);
4888 strcat ((char *)content, (char *)encoding);
4889 htmlCheckEncoding (ctxt, content);
4890 xmlFree (content);
4891 }
4892 }
4893
4894 return(ctxt);
4895}
4896
4897/**
4898 * htmlSAXParseFile :
4899 * @filename: the filename
4900 * @encoding: a free form C string describing the HTML document encoding, or NULL
4901 * @sax: the SAX handler block
4902 * @userData: if using SAX, this pointer will be provided on callbacks.
4903 *
4904 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4905 * compressed document is provided by default if found at compile-time.
4906 * It use the given SAX function block to handle the parsing callback.
4907 * If sax is NULL, fallback to the default DOM tree building routines.
4908 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004909 * Returns the resulting document tree unless SAX is NULL or the document is
4910 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004911 */
4912
4913htmlDocPtr
4914htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4915 void *userData) {
4916 htmlDocPtr ret;
4917 htmlParserCtxtPtr ctxt;
4918 htmlSAXHandlerPtr oldsax = NULL;
4919
Daniel Veillardd0463562001-10-13 09:15:48 +00004920 xmlInitParser();
4921
Owen Taylor3473f882001-02-23 17:55:21 +00004922 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4923 if (ctxt == NULL) return(NULL);
4924 if (sax != NULL) {
4925 oldsax = ctxt->sax;
4926 ctxt->sax = sax;
4927 ctxt->userData = userData;
4928 }
4929
4930 htmlParseDocument(ctxt);
4931
4932 ret = ctxt->myDoc;
4933 if (sax != NULL) {
4934 ctxt->sax = oldsax;
4935 ctxt->userData = NULL;
4936 }
4937 htmlFreeParserCtxt(ctxt);
4938
4939 return(ret);
4940}
4941
4942/**
4943 * htmlParseFile :
4944 * @filename: the filename
4945 * @encoding: a free form C string describing the HTML document encoding, or NULL
4946 *
4947 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4948 * compressed document is provided by default if found at compile-time.
4949 *
4950 * Returns the resulting document tree
4951 */
4952
4953htmlDocPtr
4954htmlParseFile(const char *filename, const char *encoding) {
4955 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4956}
4957
4958/**
4959 * htmlHandleOmittedElem:
4960 * @val: int 0 or 1
4961 *
4962 * Set and return the previous value for handling HTML omitted tags.
4963 *
4964 * Returns the last value for 0 for no handling, 1 for auto insertion.
4965 */
4966
4967int
4968htmlHandleOmittedElem(int val) {
4969 int old = htmlOmittedDefaultValue;
4970
4971 htmlOmittedDefaultValue = val;
4972 return(old);
4973}
4974
4975#endif /* LIBXML_HTML_ENABLED */