blob: d9ef259a20970c147b6424f6aebfa8a2c0c3be35 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Owen Taylor3473f882001-02-23 17:55:21 +000062 * Parser stacks related functions and macros *
63 * *
64 ************************************************************************/
65
Daniel Veillard1c732d22002-11-30 11:22:59 +000066/**
67 * htmlnamePush:
68 * @ctxt: an HTML parser context
69 * @value: the element name
70 *
71 * Pushes a new element name on top of the name stack
72 *
73 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000074 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000075static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +000076htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +000077{
78 if (ctxt->nameNr >= ctxt->nameMax) {
79 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +000080 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +000081 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +000082 ctxt->nameMax *
83 sizeof(ctxt->nameTab[0]));
84 if (ctxt->nameTab == NULL) {
85 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
86 return (0);
87 }
88 }
89 ctxt->nameTab[ctxt->nameNr] = value;
90 ctxt->name = value;
91 return (ctxt->nameNr++);
92}
93/**
94 * htmlnamePop:
95 * @ctxt: an HTML parser context
96 *
97 * Pops the top element name from the name stack
98 *
99 * Returns the name just removed
100 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000101static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000102htmlnamePop(htmlParserCtxtPtr ctxt)
103{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000104 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000105
Daniel Veillard1c732d22002-11-30 11:22:59 +0000106 if (ctxt->nameNr <= 0)
107 return (0);
108 ctxt->nameNr--;
109 if (ctxt->nameNr < 0)
110 return (0);
111 if (ctxt->nameNr > 0)
112 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
113 else
114 ctxt->name = NULL;
115 ret = ctxt->nameTab[ctxt->nameNr];
116 ctxt->nameTab[ctxt->nameNr] = 0;
117 return (ret);
118}
Owen Taylor3473f882001-02-23 17:55:21 +0000119
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   RAW     same as CUR, except that it yields -1 while ctxt->token is set.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPPER   returns the current xmlChar converted to uppercase.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser. The argument is
 *           evaluated several times, it must have no side effect.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */

#define UPPER (toupper(*ctxt->input->cur))

/* a single parenthesized comma expression: safe inside larger expressions */
#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Move past one character which is l xmlChars long, maintaining the
 * line and column counters, and reset the pending token.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/* l must be an lvalue of type int, it receives the length of the char */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &(l))

/*
 * Append the character v, whose encoded length is l, to the buffer b at
 * index i and advance i. Wrapped in do { } while (0) so that the macro
 * behaves as a single statement, e.g. in an unbraced if/else.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
197
198/**
199 * htmlCurrentChar:
200 * @ctxt: the HTML parser context
201 * @len: pointer to the length of the char read
202 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * bytes in the input buffer. Implement the end of line normalization:
205 * 2.11 End-of-Line Handling
206 * If the encoding is unspecified, in the case we find an ISO-Latin-1
207 * char, then the encoding converter is plugged in automatically.
208 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000209 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000210 */
211
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000212static int
Owen Taylor3473f882001-02-23 17:55:21 +0000213htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
214 if (ctxt->instate == XML_PARSER_EOF)
215 return(0);
216
217 if (ctxt->token != 0) {
218 *len = 0;
219 return(ctxt->token);
220 }
221 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
222 /*
223 * We are supposed to handle UTF8, check it's valid
224 * From rfc2044: encoding of the Unicode values on UTF-8:
225 *
226 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
227 * 0000 0000-0000 007F 0xxxxxxx
228 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
229 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
230 *
231 * Check for the 0x110000 limit too
232 */
233 const unsigned char *cur = ctxt->input->cur;
234 unsigned char c;
235 unsigned int val;
236
237 c = *cur;
238 if (c & 0x80) {
239 if (cur[1] == 0)
240 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
241 if ((cur[1] & 0xc0) != 0x80)
242 goto encoding_error;
243 if ((c & 0xe0) == 0xe0) {
244
245 if (cur[2] == 0)
246 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
247 if ((cur[2] & 0xc0) != 0x80)
248 goto encoding_error;
249 if ((c & 0xf0) == 0xf0) {
250 if (cur[3] == 0)
251 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
252 if (((c & 0xf8) != 0xf0) ||
253 ((cur[3] & 0xc0) != 0x80))
254 goto encoding_error;
255 /* 4-byte code */
256 *len = 4;
257 val = (cur[0] & 0x7) << 18;
258 val |= (cur[1] & 0x3f) << 12;
259 val |= (cur[2] & 0x3f) << 6;
260 val |= cur[3] & 0x3f;
261 } else {
262 /* 3-byte code */
263 *len = 3;
264 val = (cur[0] & 0xf) << 12;
265 val |= (cur[1] & 0x3f) << 6;
266 val |= cur[2] & 0x3f;
267 }
268 } else {
269 /* 2-byte code */
270 *len = 2;
271 val = (cur[0] & 0x1f) << 6;
272 val |= cur[1] & 0x3f;
273 }
274 if (!IS_CHAR(val)) {
275 ctxt->errNo = XML_ERR_INVALID_ENCODING;
276 if ((ctxt->sax != NULL) &&
277 (ctxt->sax->error != NULL))
278 ctxt->sax->error(ctxt->userData,
279 "Char 0x%X out of allowed range\n", val);
280 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000281 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000282 }
283 return(val);
284 } else {
285 /* 1-byte code */
286 *len = 1;
287 return((int) *ctxt->input->cur);
288 }
289 }
290 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000291 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000292 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000293 * XML constructs only use < 128 chars
294 */
295 *len = 1;
296 if ((int) *ctxt->input->cur < 0x80)
297 return((int) *ctxt->input->cur);
298
299 /*
300 * Humm this is bad, do an automatic flow conversion
301 */
302 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
303 ctxt->charset = XML_CHAR_ENCODING_UTF8;
304 return(xmlCurrentChar(ctxt, len));
305
306encoding_error:
307 /*
308 * If we detect an UTF8 error that probably mean that the
309 * input encoding didn't get properly advertized in the
310 * declaration header. Report the error and switch the encoding
311 * to ISO-Latin-1 (if you don't like this policy, just declare the
312 * encoding !)
313 */
314 ctxt->errNo = XML_ERR_INVALID_ENCODING;
315 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
316 ctxt->sax->error(ctxt->userData,
317 "Input is not proper UTF-8, indicate encoding !\n");
318 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
319 ctxt->input->cur[0], ctxt->input->cur[1],
320 ctxt->input->cur[2], ctxt->input->cur[3]);
321 }
322
323 ctxt->charset = XML_CHAR_ENCODING_8859_1;
324 *len = 1;
325 return((int) *ctxt->input->cur);
326}
327
328/**
Owen Taylor3473f882001-02-23 17:55:21 +0000329 * htmlSkipBlankChars:
330 * @ctxt: the HTML parser context
331 *
332 * skip all blanks character found at that point in the input streams.
333 *
334 * Returns the number of space chars skipped
335 */
336
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000337static int
Owen Taylor3473f882001-02-23 17:55:21 +0000338htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
339 int res = 0;
340
341 while (IS_BLANK(*(ctxt->input->cur))) {
342 if ((*ctxt->input->cur == 0) &&
343 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
344 xmlPopInput(ctxt);
345 } else {
346 if (*(ctxt->input->cur) == '\n') {
347 ctxt->input->line++; ctxt->input->col = 1;
348 } else ctxt->input->col++;
349 ctxt->input->cur++;
350 ctxt->nbChars++;
351 if (*ctxt->input->cur == 0)
352 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
353 }
354 res++;
355 }
356 return(res);
357}
358
359
360
361/************************************************************************
362 * *
363 * The list of HTML elements and their properties *
364 * *
365 ************************************************************************/
366
367/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
370 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000371 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000372 * Depr: this element is deprecated
373 * DTD: 1 means that this element is valid only in the Loose DTD
374 * 2 means that this element is valid only in the Frameset DTD
375 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000376 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000377 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000378 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000379
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each of the macros below expands to a comma separated list of string
 * literals, to be used inside an array initializer. When one list is
 * built from other lists, the pieces MUST be separated by commas:
 * otherwise the last literal of a piece and the first literal of the
 * next one are concatenated by the compiler into a single bogus name.
 *
 * PCDATA and MODIFIER expand to nothing, they are only markers.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


/* All the lists below are terminated by a NULL entry */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
	"archive", "alt", "name", "height", "width", "align",
	"hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;

/* cast used in the element table to drop the const of the lists above */
#define DECL (const char**)
512
Daniel Veillard22090732001-07-16 00:06:07 +0000513static const htmlElemDesc
514html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000515{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
516 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
517},
518{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
519 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
520},
521{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
522 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
523},
524{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
525 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
526},
527{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
528 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
529},
530{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
531 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
532},
533{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
534 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
535},
536{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
537 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
538},
539{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
540 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
541},
542{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
543 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
544},
545{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
546 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
547},
548{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
549 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
550},
551{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
552 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
553},
554{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
555 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
556},
557{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
558 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
559},
560{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
561 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
562},
563{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
564 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
565},
566{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
567 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
568},
569{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
570 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
571},
572{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
573 EMPTY , NULL , DECL col_attrs , NULL, NULL
574},
575{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
576 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
577},
578{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
579 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
580},
581{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
582 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
583},
584{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
585 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
586},
587{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
588 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
589},
590{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
591 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
592},
593{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
594 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
595},
596{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
597 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
598},
599{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
600 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
601},
602{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
603 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
604},
605{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
606 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
607},
608{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
609 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
610},
611{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
612 EMPTY, NULL, NULL, DECL frame_attrs, NULL
613},
614{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
615 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
616},
617{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
618 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
619},
620{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
621 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
622},
623{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
624 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
625},
626{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
627 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
628},
629{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
630 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
631},
632{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
633 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
634},
635{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
636 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
637},
638{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
639 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
640},
641{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
642 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
643},
644{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
645 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
646},
647{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
648 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
649},
650{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
651 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
652},
653{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
654 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
655},
656{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
657 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
658},
659{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
660 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
661},
662{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
663 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
664},
665{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
666 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
667},
668{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
669 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
670},
671{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
672 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
673},
674{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
675 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
676},
677{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
678 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
679},
680{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
681 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
682},
683{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
684 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
685},
686{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
687 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
688},
689{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
690 DECL html_flow, "div", DECL html_attrs, NULL, NULL
691},
692{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
693 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
694},
695{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
696 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
697},
698{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
699 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
700},
701{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
702 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
703},
704{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
705 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
706},
707{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
708 EMPTY, NULL, DECL param_attrs, NULL, name_attr
709},
710{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
711 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
712},
713{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
714 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
715},
716{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
717 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
718},
719{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
720 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
721},
722{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
723 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
724},
725{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
726 DECL select_content, NULL, DECL select_attrs, NULL, NULL
727},
728{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
729 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
730},
731{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733},
734{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
735 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
736},
737{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
738 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
739},
740{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
741 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
742},
743{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
744 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
745},
746{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
747 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
748},
749{ "table", 0, 0, 0, 0, 0, 0, 0, "",
750 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
751},
752{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
753 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
754},
755{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
756 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
757},
758{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
759 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
760},
761{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
762 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
763},
764{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
765 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
766},
767{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
768 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
769},
770{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
771 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
772},
773{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
774 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
775},
776{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
777 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
778},
779{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
780 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
781},
782{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
783 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
784},
785{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
786 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
787}
Owen Taylor3473f882001-02-23 17:55:21 +0000788};
789
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups. The first string of
 * a group is the name of a start tag; the strings that follow it, up to
 * the NULL, are the names of the open elements that this start tag closes
 * implicitly. One extra NULL after the last group ends the table.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL    /* end of table */
};
849
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold character data directly; when text shows up while one of them is
 * the current element, a p element is implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
863
/*
 * The list of HTML attributes which are of content %Script; (the HTML 4
 * intrinsic events).
 *
 * The table has no terminating NULL: users must size it with sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
889
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 * Every element has a priority; an end tag is only allowed to close open
 * elements whose priority is lower than or equal to its own, which lets
 * the parser decide what to do with extra end tags.
 */

/* One row of the priority table. */
typedef struct {
    const char *name;   /* element name, NULL in the terminating row */
    int priority;       /* end-tag priority associated with that name */
} elementPriority;

/*
 * The table is terminated by a row whose name is NULL; the priority of
 * that row is the one used for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000917
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group of that table, unused slots are NULL. It is filled
 * by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];

/* Set to 1 by htmlInitAutoClose() once htmlStartCloseIndex is filled. */
static int htmlStartCloseIndexinitialized = 0;
920
921/************************************************************************
922 * *
923 * functions to handle HTML specific data *
924 * *
925 ************************************************************************/
926
927/**
928 * htmlInitAutoClose:
929 *
930 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
931 * This is not reentrant. Call xmlInitParser() once before processing in
932 * case of use in multithreaded programs.
933 */
934void
935htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000936 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000937
938 if (htmlStartCloseIndexinitialized) return;
939
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000940 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
941 indx = 0;
942 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
943 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000944 while (htmlStartClose[i] != NULL) i++;
945 i++;
946 }
947 htmlStartCloseIndexinitialized = 1;
948}
949
950/**
951 * htmlTagLookup:
952 * @tag: The tag name in lowercase
953 *
954 * Lookup the HTML tag in the ElementTable
955 *
956 * Returns the related htmlElemDescPtr or NULL if not found.
957 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000958const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000959htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000960 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000961
962 for (i = 0; i < (sizeof(html40ElementTable) /
963 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000964 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +0000965 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000966 }
967 return(NULL);
968}
969
970/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000971 * htmlGetEndPriority:
972 * @name: The name of the element to look up the priority for.
973 *
974 * Return value: The "endtag" priority.
975 **/
976static int
977htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000978 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000979
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000980 while ((htmlEndPriority[i].name != NULL) &&
981 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
982 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000983
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000984 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000985}
986
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000987
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000988/**
Owen Taylor3473f882001-02-23 17:55:21 +0000989 * htmlCheckAutoClose:
990 * @newtag: The new tag name
991 * @oldtag: The old tag name
992 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000993 * Checks whether the new tag is one of the registered valid tags for
994 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000995 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
996 *
997 * Returns 0 if no, 1 if yes.
998 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000999static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001000htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1001{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001002 int i, indx;
1003 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001004
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001005 if (htmlStartCloseIndexinitialized == 0)
1006 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001007
1008 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001009 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001010 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001011 if (closed == NULL)
1012 return (0);
1013 if (xmlStrEqual(BAD_CAST * closed, newtag))
1014 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001015 }
1016
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001017 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001018 i++;
1019 while (htmlStartClose[i] != NULL) {
1020 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001021 return (1);
1022 }
1023 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001024 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001025 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001026}
1027
1028/**
1029 * htmlAutoCloseOnClose:
1030 * @ctxt: an HTML parser context
1031 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001032 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001033 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001034 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001035 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001036static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001037htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1038{
1039 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001040 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001041
1042#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001043 const xmlChar *oldname;
1044
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001045 xmlGenericError(xmlGenericErrorContext,
1046 "Close of %s stack: %d elements\n", newtag,
1047 ctxt->nameNr);
1048 for (i = 0; i < ctxt->nameNr; i++)
1049 xmlGenericError(xmlGenericErrorContext, "%d : %s\n", i,
1050 ctxt->nameTab[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001051#endif
1052
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001053 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001054
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001055 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001056
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001057 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1058 break;
1059 /*
1060 * A missplaced endtag can only close elements with lower
1061 * or equal priority, so if we find an element with higher
1062 * priority before we find an element with
1063 * matching name, we just ignore this endtag
1064 */
1065 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1066 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001067 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001068 if (i < 0)
1069 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001070
1071 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001072 info = htmlTagLookup(ctxt->name);
1073 if ((info == NULL) || (info->endTag == 1)) {
Owen Taylor3473f882001-02-23 17:55:21 +00001074#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001075 xmlGenericError(xmlGenericErrorContext,
1076 "htmlAutoCloseOnClose: %s closes %s\n", newtag,
1077 ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001078#endif
Daniel Veillard56098d42001-04-24 12:51:09 +00001079 } else if (info->endTag == 3) {
1080#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001081 xmlGenericError(xmlGenericErrorContext,
1082 "End of tag %s: expecting %s\n", newtag,
1083 ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +00001084
Daniel Veillard56098d42001-04-24 12:51:09 +00001085#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001086 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1087 ctxt->sax->error(ctxt->userData,
1088 "Opening and ending tag mismatch: %s and %s\n",
1089 newtag, ctxt->name);
1090 ctxt->wellFormed = 0;
1091 }
1092 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1093 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001094#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001095 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001096 if (oldname != NULL) {
1097 xmlGenericError(xmlGenericErrorContext,
1098 "htmlAutoCloseOnClose: popped %s\n", oldname);
1099 }
William M. Brack899e64a2003-09-26 18:03:42 +00001100#else
1101 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001102#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001103 }
1104}
1105
1106/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001107 * htmlAutoCloseOnEnd:
1108 * @ctxt: an HTML parser context
1109 *
1110 * Close all remaining tags at the end of the stream
1111 */
1112static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001113htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1114{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001115 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001116#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001117 const xmlChar *oldname;
1118
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001119 xmlGenericError(xmlGenericErrorContext,
1120 "Close of stack: %d elements\n", ctxt->nameNr);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001121#endif
1122
William M. Brack899e64a2003-09-26 18:03:42 +00001123 if (ctxt->nameNr == 0)
1124 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001125 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001126#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001127 xmlGenericError(xmlGenericErrorContext, "%d : %s\n", i,
1128 ctxt->nameTab[i]);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001129#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001130 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1131 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001132#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001133 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001134 if (oldname != NULL) {
1135 xmlGenericError(xmlGenericErrorContext,
1136 "htmlAutoCloseOnEnd: popped %s\n", oldname);
1137 }
William M. Brack899e64a2003-09-26 18:03:42 +00001138#else
1139 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001140#endif
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001141 }
1142}
1143
1144/**
Owen Taylor3473f882001-02-23 17:55:21 +00001145 * htmlAutoClose:
1146 * @ctxt: an HTML parser context
1147 * @newtag: The new tag name or NULL
1148 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001149 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001150 * The list is kept in htmlStartClose array. This function is
1151 * called when a new tag has been detected and generates the
1152 * appropriates closes if possible/needed.
1153 * If newtag is NULL this mean we are at the end of the resource
1154 * and we should check
1155 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001156static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001157htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1158{
William M. Brack899e64a2003-09-26 18:03:42 +00001159#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001160 const xmlChar *oldname;
William M. Brack899e64a2003-09-26 18:03:42 +00001161#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001162
1163 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001164 (htmlCheckAutoClose(newtag, ctxt->name))) {
1165#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001166 xmlGenericError(xmlGenericErrorContext,
1167 "htmlAutoClose: %s closes %s\n", newtag,
1168 ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001169#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001170 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1171 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001172#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001173 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001174 if (oldname != NULL) {
1175 xmlGenericError(xmlGenericErrorContext,
1176 "htmlAutoClose: popped %s\n", oldname);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
William M. Brack899e64a2003-09-26 18:03:42 +00001178#else
1179 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001180#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001181 }
1182 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001183 htmlAutoCloseOnEnd(ctxt);
1184 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001185 }
1186 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1188 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1189 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00001190#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001191 xmlGenericError(xmlGenericErrorContext,
1192 "htmlAutoClose: EOF closes %s\n", ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001193#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001196#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001197 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 if (oldname != NULL) {
1199 xmlGenericError(xmlGenericErrorContext,
1200 "htmlAutoClose: popped %s\n", oldname);
Owen Taylor3473f882001-02-23 17:55:21 +00001201 }
William M. Brack899e64a2003-09-26 18:03:42 +00001202#else
1203 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001204#endif
1205 }
Owen Taylor3473f882001-02-23 17:55:21 +00001206
1207}
1208
1209/**
1210 * htmlAutoCloseTag:
1211 * @doc: the HTML document
1212 * @name: The tag name
1213 * @elem: the HTML element
1214 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001215 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001216 * The list is kept in htmlStartClose array. This function checks
1217 * if the element or one of it's children would autoclose the
1218 * given tag.
1219 *
1220 * Returns 1 if autoclose, 0 otherwise
1221 */
1222int
1223htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1224 htmlNodePtr child;
1225
1226 if (elem == NULL) return(1);
1227 if (xmlStrEqual(name, elem->name)) return(0);
1228 if (htmlCheckAutoClose(elem->name, name)) return(1);
1229 child = elem->children;
1230 while (child != NULL) {
1231 if (htmlAutoCloseTag(doc, name, child)) return(1);
1232 child = child->next;
1233 }
1234 return(0);
1235}
1236
1237/**
1238 * htmlIsAutoClosed:
1239 * @doc: the HTML document
1240 * @elem: the HTML element
1241 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001242 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001243 * The list is kept in htmlStartClose array. This function checks
1244 * if a tag is autoclosed by one of it's child
1245 *
1246 * Returns 1 if autoclosed, 0 otherwise
1247 */
1248int
1249htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 child = elem->children;
1254 while (child != NULL) {
1255 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1256 child = child->next;
1257 }
1258 return(0);
1259}
1260
1261/**
1262 * htmlCheckImplied:
1263 * @ctxt: an HTML parser context
1264 * @newtag: The new tag name
1265 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001266 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001267 * called when a new tag has been detected and generates the
1268 * appropriates implicit tags if missing
1269 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001270static void
Owen Taylor3473f882001-02-23 17:55:21 +00001271htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1272 if (!htmlOmittedDefaultValue)
1273 return;
1274 if (xmlStrEqual(newtag, BAD_CAST"html"))
1275 return;
1276 if (ctxt->nameNr <= 0) {
1277#ifdef DEBUG
1278 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
1279#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001281 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1282 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1283 }
1284 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1285 return;
1286 if ((ctxt->nameNr <= 1) &&
1287 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1288 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1289 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1290 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1291 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1292 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1293 /*
1294 * dropped OBJECT ... i you put it first BODY will be
1295 * assumed !
1296 */
1297#ifdef DEBUG
1298 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
1299#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001300 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001301 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1302 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1303 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1304 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1305 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1306 int i;
1307 for (i = 0;i < ctxt->nameNr;i++) {
1308 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1309 return;
1310 }
1311 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1312 return;
1313 }
1314 }
1315
1316#ifdef DEBUG
1317 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
1318#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001319 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001320 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1321 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1322 }
1323}
1324
1325/**
1326 * htmlCheckParagraph
1327 * @ctxt: an HTML parser context
1328 *
1329 * Check whether a p element need to be implied before inserting
1330 * characters in the current element.
1331 *
1332 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1333 * in case of error.
1334 */
1335
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001336static int
Owen Taylor3473f882001-02-23 17:55:21 +00001337htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1338 const xmlChar *tag;
1339 int i;
1340
1341 if (ctxt == NULL)
1342 return(-1);
1343 tag = ctxt->name;
1344 if (tag == NULL) {
1345 htmlAutoClose(ctxt, BAD_CAST"p");
1346 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001347 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001348 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1349 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1350 return(1);
1351 }
1352 if (!htmlOmittedDefaultValue)
1353 return(0);
1354 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1355 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1356#ifdef DEBUG
1357 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1358#endif
1359 htmlAutoClose(ctxt, BAD_CAST"p");
1360 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001361 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001362 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1363 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1364 return(1);
1365 }
1366 }
1367 return(0);
1368}
1369
1370/**
1371 * htmlIsScriptAttribute:
1372 * @name: an attribute name
1373 *
1374 * Check if an attribute is of content type Script
1375 *
1376 * Returns 1 is the attribute is a script 0 otherwise
1377 */
1378int
1379htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001381
1382 if (name == NULL)
1383 return(0);
1384 /*
1385 * all script attributes start with 'on'
1386 */
1387 if ((name[0] != 'o') || (name[1] != 'n'))
1388 return(0);
1389 for (i = 0;
1390 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1391 i++) {
1392 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1393 return(1);
1394 }
1395 return(0);
1396}
1397
1398/************************************************************************
1399 * *
1400 * The list of HTML predefined entities *
1401 * *
1402 ************************************************************************/
1403
1404
Daniel Veillard22090732001-07-16 00:06:07 +00001405static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001406/*
1407 * the 4 absolute ones, plus apostrophe.
1408 */
1409{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1410{ 38, "amp", "ampersand, U+0026 ISOnum" },
1411{ 39, "apos", "single quote" },
1412{ 60, "lt", "less-than sign, U+003C ISOnum" },
1413{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1414
1415/*
1416 * A bunch still in the 128-255 range
1417 * Replacing them depend really on the charset used.
1418 */
1419{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1420{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1421{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1422{ 163, "pound","pound sign, U+00A3 ISOnum" },
1423{ 164, "curren","currency sign, U+00A4 ISOnum" },
1424{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1425{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1426{ 167, "sect", "section sign, U+00A7 ISOnum" },
1427{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1428{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1429{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1430{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1431{ 172, "not", "not sign, U+00AC ISOnum" },
1432{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1433{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1434{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1435{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1436{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1437{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1438{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1439{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1440{ 181, "micro","micro sign, U+00B5 ISOnum" },
1441{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1442{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1443{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1444{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1445{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1446{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1447{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1448{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1449{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1450{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1451{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1452{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1453{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1454{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1455{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1456{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1457{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1458{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1459{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1460{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1461{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1462{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1463{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1464{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1465{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1466{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1467{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1468{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1469{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1470{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1471{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1472{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1473{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1474{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1475{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1476{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1477{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1478{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1479{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1480{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1481{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1482{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1483{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1484{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1485{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1486{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1487{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1488{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1489{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1490{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1491{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1492{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1493{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1494{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1495{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1496{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1497{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1498{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1499{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1500{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1501{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1502{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1503{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1504{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1505{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1506{ 247, "divide","division sign, U+00F7 ISOnum" },
1507{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1508{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1509{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1510{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1511{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1512{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1513{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1514{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1515
1516{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1517{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1518{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1519{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1520{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1521
1522/*
1523 * Anything below should really be kept as entities references
1524 */
1525{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1526
1527{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1528{ 732, "tilde","small tilde, U+02DC ISOdia" },
1529
1530{ 913, "Alpha","greek capital letter alpha, U+0391" },
1531{ 914, "Beta", "greek capital letter beta, U+0392" },
1532{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1533{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1534{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1535{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1536{ 919, "Eta", "greek capital letter eta, U+0397" },
1537{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1538{ 921, "Iota", "greek capital letter iota, U+0399" },
1539{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001540{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001541{ 924, "Mu", "greek capital letter mu, U+039C" },
1542{ 925, "Nu", "greek capital letter nu, U+039D" },
1543{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1544{ 927, "Omicron","greek capital letter omicron, U+039F" },
1545{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1546{ 929, "Rho", "greek capital letter rho, U+03A1" },
1547{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1548{ 932, "Tau", "greek capital letter tau, U+03A4" },
1549{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1550{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1551{ 935, "Chi", "greek capital letter chi, U+03A7" },
1552{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1553{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1554
1555{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1556{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1557{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1558{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1559{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1560{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1561{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1562{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1563{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1564{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1565{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1566{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1567{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1568{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1569{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1570{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1571{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1572{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1573{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1574{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1575{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1576{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1577{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1578{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1579{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1580{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1581{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1582{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1583
1584{ 8194, "ensp", "en space, U+2002 ISOpub" },
1585{ 8195, "emsp", "em space, U+2003 ISOpub" },
1586{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1587{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1588{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1589{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1590{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1591{ 8211, "ndash","en dash, U+2013 ISOpub" },
1592{ 8212, "mdash","em dash, U+2014 ISOpub" },
1593{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1594{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1595{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1596{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1597{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1598{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1599{ 8224, "dagger","dagger, U+2020 ISOpub" },
1600{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1601
1602{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1603{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1604
1605{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1606
1607{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1608{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1609
1610{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1611{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1612
1613{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1614{ 8260, "frasl","fraction slash, U+2044 NEW" },
1615
1616{ 8364, "euro", "euro sign, U+20AC NEW" },
1617
1618{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1619{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1620{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1621{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1622{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1623{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1624{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1625{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1626{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1627{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1628{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1629{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1630{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1631{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1632{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1633{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1634
1635{ 8704, "forall","for all, U+2200 ISOtech" },
1636{ 8706, "part", "partial differential, U+2202 ISOtech" },
1637{ 8707, "exist","there exists, U+2203 ISOtech" },
1638{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1639{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1640{ 8712, "isin", "element of, U+2208 ISOtech" },
1641{ 8713, "notin","not an element of, U+2209 ISOtech" },
1642{ 8715, "ni", "contains as member, U+220B ISOtech" },
1643{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001644{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001645{ 8722, "minus","minus sign, U+2212 ISOtech" },
1646{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1647{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1648{ 8733, "prop", "proportional to, U+221D ISOtech" },
1649{ 8734, "infin","infinity, U+221E ISOtech" },
1650{ 8736, "ang", "angle, U+2220 ISOamso" },
1651{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1652{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1653{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1654{ 8746, "cup", "union = cup, U+222A ISOtech" },
1655{ 8747, "int", "integral, U+222B ISOtech" },
1656{ 8756, "there4","therefore, U+2234 ISOtech" },
1657{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1658{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1659{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1660{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1661{ 8801, "equiv","identical to, U+2261 ISOtech" },
1662{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1663{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1664{ 8834, "sub", "subset of, U+2282 ISOtech" },
1665{ 8835, "sup", "superset of, U+2283 ISOtech" },
1666{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1667{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1668{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1669{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1670{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1671{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1672{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1673{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1674{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1675{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1676{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1677{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1678{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1679{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1680
1681{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1682{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1683{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1684{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1685
1686};
1687
1688/************************************************************************
1689 * *
1690 * Commodity functions to handle entities *
1691 * *
1692 ************************************************************************/
1693
/*
 * Macro used to grow the current buffer: doubles its size.
 *
 * The expansion site must have in scope an xmlChar pointer named @buffer
 * and an integer lvalue named @buffer_size (the name is built by token
 * pasting). On allocation failure an error is reported, the old buffer is
 * released, and the ENCLOSING FUNCTION returns NULL, so the macro may only
 * be used inside functions returning a pointer.
 *
 * The result of xmlRealloc() goes through a temporary: assigning it
 * straight to @buffer would drop the only reference to the old block when
 * the reallocation fails (assuming xmlRealloc has the usual realloc()
 * semantics of leaving the old block allocated on failure).
 */
#define growBuffer(buffer) {						\
    xmlChar *growBufferTmp;						\
    buffer##_size *= 2;							\
    growBufferTmp = (xmlChar *) xmlRealloc(buffer,			\
                                  buffer##_size * sizeof(xmlChar));	\
    if (growBufferTmp == NULL) {					\
        xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = growBufferTmp;						\
}
1705
1706/**
1707 * htmlEntityLookup:
1708 * @name: the entity name
1709 *
1710 * Lookup the given entity in EntitiesTable
1711 *
1712 * TODO: the linear scan is really ugly, an hash table is really needed.
1713 *
1714 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1715 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001716const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001717htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001718 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001719
1720 for (i = 0;i < (sizeof(html40EntitiesTable)/
1721 sizeof(html40EntitiesTable[0]));i++) {
1722 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1723#ifdef DEBUG
1724 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1725#endif
William M. Brack78637da2003-07-31 14:47:38 +00001726 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001727 }
1728 }
1729 return(NULL);
1730}
1731
1732/**
1733 * htmlEntityValueLookup:
1734 * @value: the entity's unicode value
1735 *
1736 * Lookup the given entity in EntitiesTable
1737 *
1738 * TODO: the linear scan is really ugly, an hash table is really needed.
1739 *
1740 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1741 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001742const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001743htmlEntityValueLookup(unsigned int value) {
1744 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001745#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001746 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001747#endif
1748
1749 for (i = 0;i < (sizeof(html40EntitiesTable)/
1750 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001751 if (html40EntitiesTable[i].value >= value) {
1752 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001753 break;
1754#ifdef DEBUG
1755 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1756#endif
William M. Brack78637da2003-07-31 14:47:38 +00001757 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001758 }
1759#ifdef DEBUG
1760 if (lv > html40EntitiesTable[i].value) {
1761 xmlGenericError(xmlGenericErrorContext,
1762 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1763 lv, html40EntitiesTable[i].value);
1764 }
1765 lv = html40EntitiesTable[i].value;
1766#endif
1767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
1797 if (in == NULL) {
1798 /*
1799 * initialization nothing to do
1800 */
1801 *outlen = 0;
1802 *inlen = 0;
1803 return(0);
1804 }
1805 inend = in + (*inlen);
1806 outend = out + (*outlen);
1807 while (in < inend) {
1808 d = *in++;
1809 if (d < 0x80) { c= d; trailing= 0; }
1810 else if (d < 0xC0) {
1811 /* trailing byte in leading position */
1812 *outlen = out - outstart;
1813 *inlen = processed - instart;
1814 return(-2);
1815 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1816 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1817 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1818 else {
1819 /* no chance for this in Ascii */
1820 *outlen = out - outstart;
1821 *inlen = processed - instart;
1822 return(-2);
1823 }
1824
1825 if (inend - in < trailing) {
1826 break;
1827 }
1828
1829 for ( ; trailing; trailing--) {
1830 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1831 break;
1832 c <<= 6;
1833 c |= d & 0x3F;
1834 }
1835
1836 /* assertion: c is a single UTF-4 value */
1837 if (c < 0x80) {
1838 if (out + 1 >= outend)
1839 break;
1840 *out++ = c;
1841 } else {
1842 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001843 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001844
1845 /*
1846 * Try to lookup a predefined HTML entity for it
1847 */
1848
1849 ent = htmlEntityValueLookup(c);
1850 if (ent == NULL) {
1851 /* no chance for this in Ascii */
1852 *outlen = out - outstart;
1853 *inlen = processed - instart;
1854 return(-2);
1855 }
1856 len = strlen(ent->name);
1857 if (out + 2 + len >= outend)
1858 break;
1859 *out++ = '&';
1860 memcpy(out, ent->name, len);
1861 out += len;
1862 *out++ = ';';
1863 }
1864 processed = in;
1865 }
1866 *outlen = out - outstart;
1867 *inlen = processed - instart;
1868 return(0);
1869}
1870
1871/**
1872 * htmlEncodeEntities:
1873 * @out: a pointer to an array of bytes to store the result
1874 * @outlen: the length of @out
1875 * @in: a pointer to an array of UTF-8 chars
1876 * @inlen: the length of @in
1877 * @quoteChar: the quote character to escape (' or ") or zero.
1878 *
1879 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1880 * plus HTML entities block of chars out.
1881 *
1882 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1883 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001884 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001885 * The value of @outlen after return is the number of octets consumed.
1886 */
1887int
1888htmlEncodeEntities(unsigned char* out, int *outlen,
1889 const unsigned char* in, int *inlen, int quoteChar) {
1890 const unsigned char* processed = in;
1891 const unsigned char* outend = out + (*outlen);
1892 const unsigned char* outstart = out;
1893 const unsigned char* instart = in;
1894 const unsigned char* inend = in + (*inlen);
1895 unsigned int c, d;
1896 int trailing;
1897
1898 while (in < inend) {
1899 d = *in++;
1900 if (d < 0x80) { c= d; trailing= 0; }
1901 else if (d < 0xC0) {
1902 /* trailing byte in leading position */
1903 *outlen = out - outstart;
1904 *inlen = processed - instart;
1905 return(-2);
1906 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1907 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1908 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1909 else {
1910 /* no chance for this in Ascii */
1911 *outlen = out - outstart;
1912 *inlen = processed - instart;
1913 return(-2);
1914 }
1915
1916 if (inend - in < trailing)
1917 break;
1918
1919 while (trailing--) {
1920 if (((d= *in++) & 0xC0) != 0x80) {
1921 *outlen = out - outstart;
1922 *inlen = processed - instart;
1923 return(-2);
1924 }
1925 c <<= 6;
1926 c |= d & 0x3F;
1927 }
1928
1929 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001930 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1931 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001932 if (out >= outend)
1933 break;
1934 *out++ = c;
1935 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001936 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001937 const char *cp;
1938 char nbuf[16];
1939 int len;
1940
1941 /*
1942 * Try to lookup a predefined HTML entity for it
1943 */
1944 ent = htmlEntityValueLookup(c);
1945 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001946 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001947 cp = nbuf;
1948 }
1949 else
1950 cp = ent->name;
1951 len = strlen(cp);
1952 if (out + 2 + len > outend)
1953 break;
1954 *out++ = '&';
1955 memcpy(out, cp, len);
1956 out += len;
1957 *out++ = ';';
1958 }
1959 processed = in;
1960 }
1961 *outlen = out - outstart;
1962 *inlen = processed - instart;
1963 return(0);
1964}
1965
1966/**
1967 * htmlDecodeEntities:
1968 * @ctxt: the parser context
1969 * @len: the len to decode (in bytes !), -1 for no size limit
1970 * @end: an end marker xmlChar, 0 if none
1971 * @end2: an end marker xmlChar, 0 if none
1972 * @end3: an end marker xmlChar, 0 if none
1973 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001974 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001975 *
1976 * DEPRECATED !!!!
1977 *
1978 * Returns A newly allocated string with the substitution done. The caller
1979 * must deallocate it !
1980 */
1981xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001982htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1983 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984 static int deprecated = 0;
1985 if (!deprecated) {
1986 xmlGenericError(xmlGenericErrorContext,
1987 "htmlDecodeEntities() deprecated function reached\n");
1988 deprecated = 1;
1989 }
1990 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001991}
1992
1993/************************************************************************
1994 * *
1995 * Commodity functions to handle streams *
1996 * *
1997 ************************************************************************/
1998
1999/**
Owen Taylor3473f882001-02-23 17:55:21 +00002000 * htmlNewInputStream:
2001 * @ctxt: an HTML parser context
2002 *
2003 * Create a new input stream structure
2004 * Returns the new input stream or NULL
2005 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002006static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002007htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2008 htmlParserInputPtr input;
2009
2010 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2011 if (input == NULL) {
2012 ctxt->errNo = XML_ERR_NO_MEMORY;
2013 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2014 ctxt->sax->error(ctxt->userData,
2015 "malloc: couldn't allocate a new input stream\n");
2016 return(NULL);
2017 }
2018 memset(input, 0, sizeof(htmlParserInput));
2019 input->filename = NULL;
2020 input->directory = NULL;
2021 input->base = NULL;
2022 input->cur = NULL;
2023 input->buf = NULL;
2024 input->line = 1;
2025 input->col = 1;
2026 input->buf = NULL;
2027 input->free = NULL;
2028 input->version = NULL;
2029 input->consumed = 0;
2030 input->length = 0;
2031 return(input);
2032}
2033
2034
2035/************************************************************************
2036 * *
2037 * Commodity functions, cleanup needed ? *
2038 * *
2039 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that was avoided so as
 *       not to risk any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002054
2055/**
2056 * areBlanks:
2057 * @ctxt: an HTML parser context
2058 * @str: a xmlChar *
2059 * @len: the size of @str
2060 *
2061 * Is this a sequence of blank chars that one can ignore ?
2062 *
2063 * Returns 1 if ignorable 0 otherwise.
2064 */
2065
2066static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002067 unsigned int i;
2068 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002069 xmlNodePtr lastChild;
2070
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002071 for (j = 0;j < len;j++)
2072 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002073
2074 if (CUR == 0) return(1);
2075 if (CUR != '<') return(0);
2076 if (ctxt->name == NULL)
2077 return(1);
2078 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2079 return(1);
2080 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2081 return(1);
2082 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2083 return(1);
2084 if (ctxt->node == NULL) return(0);
2085 lastChild = xmlGetLastChild(ctxt->node);
2086 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002087 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2088 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002089 /* keep ws in constructs like ...<b> </b>...
2090 for all tags "b" allowing PCDATA */
2091 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2092 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2093 return(0);
2094 }
2095 }
Owen Taylor3473f882001-02-23 17:55:21 +00002096 } else if (xmlNodeIsText(lastChild)) {
2097 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002098 } else {
2099 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2100 for all tags "p" allowing PCDATA */
2101 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2102 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2103 return(0);
2104 }
2105 }
Owen Taylor3473f882001-02-23 17:55:21 +00002106 }
2107 return(1);
2108}
2109
2110/**
Owen Taylor3473f882001-02-23 17:55:21 +00002111 * htmlNewDocNoDtD:
2112 * @URI: URI for the dtd, or NULL
2113 * @ExternalID: the external ID of the DTD, or NULL
2114 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002115 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2116 * are NULL
2117 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002118 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002119 */
2120htmlDocPtr
2121htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2122 xmlDocPtr cur;
2123
2124 /*
2125 * Allocate a new document and fill the fields.
2126 */
2127 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2128 if (cur == NULL) {
2129 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002130 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002131 return(NULL);
2132 }
2133 memset(cur, 0, sizeof(xmlDoc));
2134
2135 cur->type = XML_HTML_DOCUMENT_NODE;
2136 cur->version = NULL;
2137 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002138 cur->doc = cur;
2139 cur->name = NULL;
2140 cur->children = NULL;
2141 cur->extSubset = NULL;
2142 cur->oldNs = NULL;
2143 cur->encoding = NULL;
2144 cur->standalone = 1;
2145 cur->compression = 0;
2146 cur->ids = NULL;
2147 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002148 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002149 if ((ExternalID != NULL) ||
2150 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002151 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002152 return(cur);
2153}
2154
2155/**
2156 * htmlNewDoc:
2157 * @URI: URI for the dtd, or NULL
2158 * @ExternalID: the external ID of the DTD, or NULL
2159 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002160 * Creates a new HTML document
2161 *
Owen Taylor3473f882001-02-23 17:55:21 +00002162 * Returns a new document
2163 */
2164htmlDocPtr
2165htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2166 if ((URI == NULL) && (ExternalID == NULL))
2167 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002168 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2169 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002170
2171 return(htmlNewDocNoDtD(URI, ExternalID));
2172}
2173
2174
2175/************************************************************************
2176 * *
2177 * The parser itself *
2178 * Relates to http://www.w3.org/TR/html40 *
2179 * *
2180 ************************************************************************/
2181
2182/************************************************************************
2183 * *
2184 * The parser itself *
2185 * *
2186 ************************************************************************/
2187
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002188static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002189
Owen Taylor3473f882001-02-23 17:55:21 +00002190/**
2191 * htmlParseHTMLName:
2192 * @ctxt: an HTML parser context
2193 *
2194 * parse an HTML tag or attribute name, note that we convert it to lowercase
2195 * since HTML names are not case-sensitive.
2196 *
2197 * Returns the Tag Name parsed or NULL
2198 */
2199
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002200static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002201htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002202 int i = 0;
2203 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2204
2205 if (!IS_LETTER(CUR) && (CUR != '_') &&
2206 (CUR != ':')) return(NULL);
2207
2208 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2209 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2210 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2211 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2212 else loc[i] = CUR;
2213 i++;
2214
2215 NEXT;
2216 }
2217
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002218 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002219}
2220
2221/**
2222 * htmlParseName:
2223 * @ctxt: an HTML parser context
2224 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002225 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002226 *
2227 * Returns the Name parsed or NULL
2228 */
2229
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002230static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002231htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002232 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002233 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002234 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002235
2236 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002237
2238 /*
2239 * Accelerator for simple ASCII names
2240 */
2241 in = ctxt->input->cur;
2242 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2243 ((*in >= 0x41) && (*in <= 0x5A)) ||
2244 (*in == '_') || (*in == ':')) {
2245 in++;
2246 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2247 ((*in >= 0x41) && (*in <= 0x5A)) ||
2248 ((*in >= 0x30) && (*in <= 0x39)) ||
2249 (*in == '_') || (*in == '-') ||
2250 (*in == ':') || (*in == '.'))
2251 in++;
2252 if ((*in > 0) && (*in < 0x80)) {
2253 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002254 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002255 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002256 ctxt->nbChars += count;
2257 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002258 return(ret);
2259 }
2260 }
2261 return(htmlParseNameComplex(ctxt));
2262}
2263
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002264static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002265htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002266 int len = 0, l;
2267 int c;
2268 int count = 0;
2269
2270 /*
2271 * Handler for more complex cases
2272 */
2273 GROW;
2274 c = CUR_CHAR(l);
2275 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2276 (!IS_LETTER(c) && (c != '_') &&
2277 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002278 return(NULL);
2279 }
2280
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002281 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2282 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2283 (c == '.') || (c == '-') ||
2284 (c == '_') || (c == ':') ||
2285 (IS_COMBINING(c)) ||
2286 (IS_EXTENDER(c)))) {
2287 if (count++ > 100) {
2288 count = 0;
2289 GROW;
2290 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002291 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002292 NEXTL(l);
2293 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002294 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002295 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002296}
2297
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002298
Owen Taylor3473f882001-02-23 17:55:21 +00002299/**
2300 * htmlParseHTMLAttribute:
2301 * @ctxt: an HTML parser context
2302 * @stop: a char stop value
2303 *
2304 * parse an HTML attribute value till the stop (quote), if
2305 * stop is 0 then it stops at the first space
2306 *
2307 * Returns the attribute parsed or NULL
2308 */
2309
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002310static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002311htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2312 xmlChar *buffer = NULL;
2313 int buffer_size = 0;
2314 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002315 const xmlChar *name = NULL;
2316 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002317 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002318
2319 /*
2320 * allocate a translation buffer.
2321 */
2322 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002323 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002324 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002325 xmlGenericError(xmlGenericErrorContext,
2326 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002327 return(NULL);
2328 }
2329 out = buffer;
2330
2331 /*
2332 * Ok loop until we reach one of the ending chars
2333 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002334 while ((CUR != 0) && (CUR != stop)) {
2335 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002336 if ((stop == 0) && (IS_BLANK(CUR))) break;
2337 if (CUR == '&') {
2338 if (NXT(1) == '#') {
2339 unsigned int c;
2340 int bits;
2341
2342 c = htmlParseCharRef(ctxt);
2343 if (c < 0x80)
2344 { *out++ = c; bits= -6; }
2345 else if (c < 0x800)
2346 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2347 else if (c < 0x10000)
2348 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2349 else
2350 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2351
2352 for ( ; bits >= 0; bits-= 6) {
2353 *out++ = ((c >> bits) & 0x3F) | 0x80;
2354 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002355
2356 if (out - buffer > buffer_size - 100) {
2357 int indx = out - buffer;
2358
2359 growBuffer(buffer);
2360 out = &buffer[indx];
2361 }
Owen Taylor3473f882001-02-23 17:55:21 +00002362 } else {
2363 ent = htmlParseEntityRef(ctxt, &name);
2364 if (name == NULL) {
2365 *out++ = '&';
2366 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002367 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002368
2369 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002370 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002371 }
2372 } else if (ent == NULL) {
2373 *out++ = '&';
2374 cur = name;
2375 while (*cur != 0) {
2376 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002377 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002378
2379 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002380 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002381 }
2382 *out++ = *cur++;
2383 }
Owen Taylor3473f882001-02-23 17:55:21 +00002384 } else {
2385 unsigned int c;
2386 int bits;
2387
2388 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002389 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002390
2391 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002392 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002393 }
2394 c = (xmlChar)ent->value;
2395 if (c < 0x80)
2396 { *out++ = c; bits= -6; }
2397 else if (c < 0x800)
2398 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2399 else if (c < 0x10000)
2400 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2401 else
2402 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2403
2404 for ( ; bits >= 0; bits-= 6) {
2405 *out++ = ((c >> bits) & 0x3F) | 0x80;
2406 }
Owen Taylor3473f882001-02-23 17:55:21 +00002407 }
2408 }
2409 } else {
2410 unsigned int c;
2411 int bits, l;
2412
2413 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002414 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002415
2416 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002417 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002418 }
2419 c = CUR_CHAR(l);
2420 if (c < 0x80)
2421 { *out++ = c; bits= -6; }
2422 else if (c < 0x800)
2423 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2424 else if (c < 0x10000)
2425 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2426 else
2427 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2428
2429 for ( ; bits >= 0; bits-= 6) {
2430 *out++ = ((c >> bits) & 0x3F) | 0x80;
2431 }
2432 NEXT;
2433 }
2434 }
2435 *out++ = 0;
2436 return(buffer);
2437}
2438
2439/**
Owen Taylor3473f882001-02-23 17:55:21 +00002440 * htmlParseEntityRef:
2441 * @ctxt: an HTML parser context
2442 * @str: location to store the entity name
2443 *
2444 * parse an HTML ENTITY references
2445 *
2446 * [68] EntityRef ::= '&' Name ';'
2447 *
2448 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2449 * if non-NULL *str will have to be freed by the caller.
2450 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002451const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002452htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2453 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002454 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002455 *str = NULL;
2456
2457 if (CUR == '&') {
2458 NEXT;
2459 name = htmlParseName(ctxt);
2460 if (name == NULL) {
2461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2462 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2463 ctxt->wellFormed = 0;
2464 } else {
2465 GROW;
2466 if (CUR == ';') {
2467 *str = name;
2468
2469 /*
2470 * Lookup the entity in the table.
2471 */
2472 ent = htmlEntityLookup(name);
2473 if (ent != NULL) /* OK that's ugly !!! */
2474 NEXT;
2475 } else {
2476 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2477 ctxt->sax->error(ctxt->userData,
2478 "htmlParseEntityRef: expecting ';'\n");
2479 *str = name;
2480 }
2481 }
2482 }
2483 return(ent);
2484}
2485
2486/**
2487 * htmlParseAttValue:
2488 * @ctxt: an HTML parser context
2489 *
2490 * parse a value for an attribute
2491 * Note: the parser won't do substitution of entities here, this
2492 * will be handled later in xmlStringGetNodeList, unless it was
2493 * asked for ctxt->replaceEntities != 0
2494 *
2495 * Returns the AttValue parsed or NULL.
2496 */
2497
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002498static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002499htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2500 xmlChar *ret = NULL;
2501
2502 if (CUR == '"') {
2503 NEXT;
2504 ret = htmlParseHTMLAttribute(ctxt, '"');
2505 if (CUR != '"') {
2506 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2507 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2508 ctxt->wellFormed = 0;
2509 } else
2510 NEXT;
2511 } else if (CUR == '\'') {
2512 NEXT;
2513 ret = htmlParseHTMLAttribute(ctxt, '\'');
2514 if (CUR != '\'') {
2515 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2516 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2517 ctxt->wellFormed = 0;
2518 } else
2519 NEXT;
2520 } else {
2521 /*
2522 * That's an HTMLism, the attribute value may not be quoted
2523 */
2524 ret = htmlParseHTMLAttribute(ctxt, 0);
2525 if (ret == NULL) {
2526 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2527 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2528 ctxt->wellFormed = 0;
2529 }
2530 }
2531 return(ret);
2532}
2533
2534/**
2535 * htmlParseSystemLiteral:
2536 * @ctxt: an HTML parser context
2537 *
2538 * parse an HTML Literal
2539 *
2540 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2541 *
2542 * Returns the SystemLiteral parsed or NULL
2543 */
2544
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002545static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002546htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2547 const xmlChar *q;
2548 xmlChar *ret = NULL;
2549
2550 if (CUR == '"') {
2551 NEXT;
2552 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002553 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002554 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002555 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002556 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2557 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2558 ctxt->wellFormed = 0;
2559 } else {
2560 ret = xmlStrndup(q, CUR_PTR - q);
2561 NEXT;
2562 }
2563 } else if (CUR == '\'') {
2564 NEXT;
2565 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002566 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002567 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002568 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002569 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2570 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2571 ctxt->wellFormed = 0;
2572 } else {
2573 ret = xmlStrndup(q, CUR_PTR - q);
2574 NEXT;
2575 }
2576 } else {
2577 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2578 ctxt->sax->error(ctxt->userData,
2579 "SystemLiteral \" or ' expected\n");
2580 ctxt->wellFormed = 0;
2581 }
2582
2583 return(ret);
2584}
2585
2586/**
2587 * htmlParsePubidLiteral:
2588 * @ctxt: an HTML parser context
2589 *
2590 * parse an HTML public literal
2591 *
2592 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2593 *
2594 * Returns the PubidLiteral parsed or NULL.
2595 */
2596
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002597static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002598htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2599 const xmlChar *q;
2600 xmlChar *ret = NULL;
2601 /*
2602 * Name ::= (Letter | '_') (NameChar)*
2603 */
2604 if (CUR == '"') {
2605 NEXT;
2606 q = CUR_PTR;
2607 while (IS_PUBIDCHAR(CUR)) NEXT;
2608 if (CUR != '"') {
2609 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2610 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2611 ctxt->wellFormed = 0;
2612 } else {
2613 ret = xmlStrndup(q, CUR_PTR - q);
2614 NEXT;
2615 }
2616 } else if (CUR == '\'') {
2617 NEXT;
2618 q = CUR_PTR;
Daniel Veillard6560a422003-03-27 21:25:38 +00002619 while ((IS_PUBIDCHAR(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002620 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002621 if (CUR != '\'') {
Owen Taylor3473f882001-02-23 17:55:21 +00002622 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2623 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2624 ctxt->wellFormed = 0;
2625 } else {
2626 ret = xmlStrndup(q, CUR_PTR - q);
2627 NEXT;
2628 }
2629 } else {
2630 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2631 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2632 ctxt->wellFormed = 0;
2633 }
2634
2635 return(ret);
2636}
2637
2638/**
2639 * htmlParseScript:
2640 * @ctxt: an HTML parser context
2641 *
2642 * parse the content of an HTML SCRIPT or STYLE element
2643 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2644 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2645 * http://www.w3.org/TR/html4/types.html#type-script
2646 * http://www.w3.org/TR/html4/types.html#h-6.15
2647 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2648 *
2649 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2650 * element and the value of intrinsic event attributes. User agents must
2651 * not evaluate script data as HTML markup but instead must pass it on as
2652 * data to a script engine.
2653 * NOTES:
2654 * - The content is passed like CDATA
2655 * - the attributes for style and scripting "onXXX" are also described
2656 * as CDATA but SGML allows entities references in attributes so their
2657 * processing is identical as other attributes
2658 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002659static void
Owen Taylor3473f882001-02-23 17:55:21 +00002660htmlParseScript(htmlParserCtxtPtr ctxt) {
2661 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2662 int nbchar = 0;
2663 xmlChar cur;
2664
2665 SHRINK;
2666 cur = CUR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002667 while (IS_CHAR((unsigned int) cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002668 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2669 (NXT(3) == '-')) {
2670 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2671 if (ctxt->sax->cdataBlock!= NULL) {
2672 /*
2673 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2674 */
2675 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002676 } else if (ctxt->sax->characters != NULL) {
2677 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002678 }
2679 }
2680 nbchar = 0;
2681 htmlParseComment(ctxt);
2682 cur = CUR;
2683 continue;
2684 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002685 /*
2686 * One should break here, the specification is clear:
2687 * Authors should therefore escape "</" within the content.
2688 * Escape mechanisms are specific to each scripting or
2689 * style sheet language.
2690 */
2691 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2692 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2693 break; /* while */
2694 }
2695 buf[nbchar++] = cur;
2696 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2697 if (ctxt->sax->cdataBlock!= NULL) {
2698 /*
2699 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2700 */
2701 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002702 } else if (ctxt->sax->characters != NULL) {
2703 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002704 }
2705 nbchar = 0;
2706 }
2707 NEXT;
2708 cur = CUR;
2709 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00002710 if (!(IS_CHAR((unsigned int) cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2712 ctxt->sax->error(ctxt->userData,
2713 "Invalid char in CDATA 0x%X\n", cur);
2714 ctxt->wellFormed = 0;
2715 NEXT;
2716 }
2717
2718 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2719 if (ctxt->sax->cdataBlock!= NULL) {
2720 /*
2721 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2722 */
2723 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002724 } else if (ctxt->sax->characters != NULL) {
2725 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002726 }
2727 }
2728}
2729
2730
2731/**
2732 * htmlParseCharData:
2733 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002734 *
2735 * parse a CharData section.
2736 * if we are within a CDATA section ']]>' marks an end of section.
2737 *
2738 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2739 */
2740
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002741static void
2742htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002743 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2744 int nbchar = 0;
2745 int cur, l;
2746
2747 SHRINK;
2748 cur = CUR_CHAR(l);
2749 while (((cur != '<') || (ctxt->token == '<')) &&
2750 ((cur != '&') || (ctxt->token == '&')) &&
2751 (IS_CHAR(cur))) {
2752 COPY_BUF(l,buf,nbchar,cur);
2753 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2754 /*
2755 * Ok the segment is to be consumed as chars.
2756 */
2757 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2758 if (areBlanks(ctxt, buf, nbchar)) {
2759 if (ctxt->sax->ignorableWhitespace != NULL)
2760 ctxt->sax->ignorableWhitespace(ctxt->userData,
2761 buf, nbchar);
2762 } else {
2763 htmlCheckParagraph(ctxt);
2764 if (ctxt->sax->characters != NULL)
2765 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2766 }
2767 }
2768 nbchar = 0;
2769 }
2770 NEXTL(l);
2771 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002772 if (cur == 0) {
2773 SHRINK;
2774 GROW;
2775 cur = CUR_CHAR(l);
2776 }
Owen Taylor3473f882001-02-23 17:55:21 +00002777 }
2778 if (nbchar != 0) {
2779 /*
2780 * Ok the segment is to be consumed as chars.
2781 */
2782 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2783 if (areBlanks(ctxt, buf, nbchar)) {
2784 if (ctxt->sax->ignorableWhitespace != NULL)
2785 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2786 } else {
2787 htmlCheckParagraph(ctxt);
2788 if (ctxt->sax->characters != NULL)
2789 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2790 }
2791 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002792 } else {
2793 /*
2794 * Loop detection
2795 */
2796 if (cur == 0)
2797 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002798 }
2799}
2800
2801/**
2802 * htmlParseExternalID:
2803 * @ctxt: an HTML parser context
2804 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002805 *
2806 * Parse an External ID or a Public ID
2807 *
Owen Taylor3473f882001-02-23 17:55:21 +00002808 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2809 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2810 *
2811 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2812 *
2813 * Returns the function returns SystemLiteral and in the second
2814 * case publicID receives PubidLiteral, is strict is off
2815 * it is possible to return NULL and have publicID set.
2816 */
2817
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002818static xmlChar *
2819htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002820 xmlChar *URI = NULL;
2821
2822 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2823 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2824 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2825 SKIP(6);
2826 if (!IS_BLANK(CUR)) {
2827 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2828 ctxt->sax->error(ctxt->userData,
2829 "Space required after 'SYSTEM'\n");
2830 ctxt->wellFormed = 0;
2831 }
2832 SKIP_BLANKS;
2833 URI = htmlParseSystemLiteral(ctxt);
2834 if (URI == NULL) {
2835 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2836 ctxt->sax->error(ctxt->userData,
2837 "htmlParseExternalID: SYSTEM, no URI\n");
2838 ctxt->wellFormed = 0;
2839 }
2840 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2841 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2842 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2843 SKIP(6);
2844 if (!IS_BLANK(CUR)) {
2845 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2846 ctxt->sax->error(ctxt->userData,
2847 "Space required after 'PUBLIC'\n");
2848 ctxt->wellFormed = 0;
2849 }
2850 SKIP_BLANKS;
2851 *publicID = htmlParsePubidLiteral(ctxt);
2852 if (*publicID == NULL) {
2853 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2854 ctxt->sax->error(ctxt->userData,
2855 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2856 ctxt->wellFormed = 0;
2857 }
2858 SKIP_BLANKS;
2859 if ((CUR == '"') || (CUR == '\'')) {
2860 URI = htmlParseSystemLiteral(ctxt);
2861 }
2862 }
2863 return(URI);
2864}
2865
2866/**
2867 * htmlParseComment:
2868 * @ctxt: an HTML parser context
2869 *
2870 * Parse an XML (SGML) comment <!-- .... -->
2871 *
2872 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2873 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002874static void
Owen Taylor3473f882001-02-23 17:55:21 +00002875htmlParseComment(htmlParserCtxtPtr ctxt) {
2876 xmlChar *buf = NULL;
2877 int len;
2878 int size = HTML_PARSER_BUFFER_SIZE;
2879 int q, ql;
2880 int r, rl;
2881 int cur, l;
2882 xmlParserInputState state;
2883
2884 /*
2885 * Check that there is a comment right here.
2886 */
2887 if ((RAW != '<') || (NXT(1) != '!') ||
2888 (NXT(2) != '-') || (NXT(3) != '-')) return;
2889
2890 state = ctxt->instate;
2891 ctxt->instate = XML_PARSER_COMMENT;
2892 SHRINK;
2893 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002894 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002895 if (buf == NULL) {
2896 xmlGenericError(xmlGenericErrorContext,
2897 "malloc of %d byte failed\n", size);
2898 ctxt->instate = state;
2899 return;
2900 }
2901 q = CUR_CHAR(ql);
2902 NEXTL(ql);
2903 r = CUR_CHAR(rl);
2904 NEXTL(rl);
2905 cur = CUR_CHAR(l);
2906 len = 0;
2907 while (IS_CHAR(cur) &&
2908 ((cur != '>') ||
2909 (r != '-') || (q != '-'))) {
2910 if (len + 5 >= size) {
2911 size *= 2;
2912 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2913 if (buf == NULL) {
2914 xmlGenericError(xmlGenericErrorContext,
2915 "realloc of %d byte failed\n", size);
2916 ctxt->instate = state;
2917 return;
2918 }
2919 }
2920 COPY_BUF(ql,buf,len,q);
2921 q = r;
2922 ql = rl;
2923 r = cur;
2924 rl = l;
2925 NEXTL(l);
2926 cur = CUR_CHAR(l);
2927 if (cur == 0) {
2928 SHRINK;
2929 GROW;
2930 cur = CUR_CHAR(l);
2931 }
2932 }
2933 buf[len] = 0;
2934 if (!IS_CHAR(cur)) {
2935 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2936 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2937 ctxt->sax->error(ctxt->userData,
2938 "Comment not terminated \n<!--%.50s\n", buf);
2939 ctxt->wellFormed = 0;
2940 xmlFree(buf);
2941 } else {
2942 NEXT;
2943 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2944 (!ctxt->disableSAX))
2945 ctxt->sax->comment(ctxt->userData, buf);
2946 xmlFree(buf);
2947 }
2948 ctxt->instate = state;
2949}
2950
2951/**
2952 * htmlParseCharRef:
2953 * @ctxt: an HTML parser context
2954 *
2955 * parse Reference declarations
2956 *
2957 * [66] CharRef ::= '&#' [0-9]+ ';' |
2958 * '&#x' [0-9a-fA-F]+ ';'
2959 *
2960 * Returns the value parsed (as an int)
2961 */
2962int
2963htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2964 int val = 0;
2965
2966 if ((CUR == '&') && (NXT(1) == '#') &&
2967 (NXT(2) == 'x')) {
2968 SKIP(3);
2969 while (CUR != ';') {
2970 if ((CUR >= '0') && (CUR <= '9'))
2971 val = val * 16 + (CUR - '0');
2972 else if ((CUR >= 'a') && (CUR <= 'f'))
2973 val = val * 16 + (CUR - 'a') + 10;
2974 else if ((CUR >= 'A') && (CUR <= 'F'))
2975 val = val * 16 + (CUR - 'A') + 10;
2976 else {
2977 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2978 ctxt->sax->error(ctxt->userData,
2979 "htmlParseCharRef: invalid hexadecimal value\n");
2980 ctxt->wellFormed = 0;
2981 return(0);
2982 }
2983 NEXT;
2984 }
2985 if (CUR == ';')
2986 NEXT;
2987 } else if ((CUR == '&') && (NXT(1) == '#')) {
2988 SKIP(2);
2989 while (CUR != ';') {
2990 if ((CUR >= '0') && (CUR <= '9'))
2991 val = val * 10 + (CUR - '0');
2992 else {
2993 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2994 ctxt->sax->error(ctxt->userData,
2995 "htmlParseCharRef: invalid decimal value\n");
2996 ctxt->wellFormed = 0;
2997 return(0);
2998 }
2999 NEXT;
3000 }
3001 if (CUR == ';')
3002 NEXT;
3003 } else {
3004 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3005 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
3006 ctxt->wellFormed = 0;
3007 }
3008 /*
3009 * Check the value IS_CHAR ...
3010 */
3011 if (IS_CHAR(val)) {
3012 return(val);
3013 } else {
3014 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3015 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
3016 val);
3017 ctxt->wellFormed = 0;
3018 }
3019 return(0);
3020}
3021
3022
3023/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003024 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003025 * @ctxt: an HTML parser context
3026 *
3027 * parse a DOCTYPE declaration
3028 *
3029 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3030 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3031 */
3032
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003033static void
Owen Taylor3473f882001-02-23 17:55:21 +00003034htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003035 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003036 xmlChar *ExternalID = NULL;
3037 xmlChar *URI = NULL;
3038
3039 /*
3040 * We know that '<!DOCTYPE' has been detected.
3041 */
3042 SKIP(9);
3043
3044 SKIP_BLANKS;
3045
3046 /*
3047 * Parse the DOCTYPE name.
3048 */
3049 name = htmlParseName(ctxt);
3050 if (name == NULL) {
3051 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3052 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
3053 ctxt->wellFormed = 0;
3054 }
3055 /*
3056 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3057 */
3058
3059 SKIP_BLANKS;
3060
3061 /*
3062 * Check for SystemID and ExternalID
3063 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003064 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003065 SKIP_BLANKS;
3066
3067 /*
3068 * We should be at the end of the DOCTYPE declaration.
3069 */
3070 if (CUR != '>') {
3071 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00003072 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003073 ctxt->wellFormed = 0;
3074 /* We shouldn't try to resynchronize ... */
3075 }
3076 NEXT;
3077
3078 /*
3079 * Create or update the document accordingly to the DOCTYPE
3080 */
3081 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3082 (!ctxt->disableSAX))
3083 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3084
3085 /*
3086 * Cleanup, since we don't use all those identifiers
3087 */
3088 if (URI != NULL) xmlFree(URI);
3089 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003090}
3091
3092/**
3093 * htmlParseAttribute:
3094 * @ctxt: an HTML parser context
3095 * @value: a xmlChar ** used to store the value of the attribute
3096 *
3097 * parse an attribute
3098 *
3099 * [41] Attribute ::= Name Eq AttValue
3100 *
3101 * [25] Eq ::= S? '=' S?
3102 *
3103 * With namespace:
3104 *
3105 * [NS 11] Attribute ::= QName Eq AttValue
3106 *
3107 * Also the case QName == xmlns:??? is handled independently as a namespace
3108 * definition.
3109 *
3110 * Returns the attribute name, and the value in *value.
3111 */
3112
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003113static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003114htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003115 const xmlChar *name;
3116 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003117
3118 *value = NULL;
3119 name = htmlParseHTMLName(ctxt);
3120 if (name == NULL) {
3121 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3122 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3123 ctxt->wellFormed = 0;
3124 return(NULL);
3125 }
3126
3127 /*
3128 * read the value
3129 */
3130 SKIP_BLANKS;
3131 if (CUR == '=') {
3132 NEXT;
3133 SKIP_BLANKS;
3134 val = htmlParseAttValue(ctxt);
3135 /******
3136 } else {
3137 * TODO : some attribute must have values, some may not
3138 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3139 ctxt->sax->warning(ctxt->userData,
3140 "No value for attribute %s\n", name); */
3141 }
3142
3143 *value = val;
3144 return(name);
3145}
3146
3147/**
3148 * htmlCheckEncoding:
3149 * @ctxt: an HTML parser context
3150 * @attvalue: the attribute value
3151 *
3152 * Checks an http-equiv attribute from a Meta tag to detect
3153 * the encoding
3154 * If a new encoding is detected the parser is switched to decode
3155 * it and pass UTF8
3156 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003157static void
Owen Taylor3473f882001-02-23 17:55:21 +00003158htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3159 const xmlChar *encoding;
3160
3161 if ((ctxt == NULL) || (attvalue == NULL))
3162 return;
3163
3164 /* do not change encoding */
3165 if (ctxt->input->encoding != NULL)
3166 return;
3167
3168 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3169 if (encoding != NULL) {
3170 encoding += 8;
3171 } else {
3172 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3173 if (encoding != NULL)
3174 encoding += 9;
3175 }
3176 if (encoding != NULL) {
3177 xmlCharEncoding enc;
3178 xmlCharEncodingHandlerPtr handler;
3179
3180 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3181
3182 if (ctxt->input->encoding != NULL)
3183 xmlFree((xmlChar *) ctxt->input->encoding);
3184 ctxt->input->encoding = xmlStrdup(encoding);
3185
3186 enc = xmlParseCharEncoding((const char *) encoding);
3187 /*
3188 * registered set of known encodings
3189 */
3190 if (enc != XML_CHAR_ENCODING_ERROR) {
3191 xmlSwitchEncoding(ctxt, enc);
3192 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3193 } else {
3194 /*
3195 * fallback for unknown encodings
3196 */
3197 handler = xmlFindCharEncodingHandler((const char *) encoding);
3198 if (handler != NULL) {
3199 xmlSwitchToEncoding(ctxt, handler);
3200 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3201 } else {
3202 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3203 }
3204 }
3205
3206 if ((ctxt->input->buf != NULL) &&
3207 (ctxt->input->buf->encoder != NULL) &&
3208 (ctxt->input->buf->raw != NULL) &&
3209 (ctxt->input->buf->buffer != NULL)) {
3210 int nbchars;
3211 int processed;
3212
3213 /*
3214 * convert as much as possible to the parser reading buffer.
3215 */
3216 processed = ctxt->input->cur - ctxt->input->base;
3217 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3218 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3219 ctxt->input->buf->buffer,
3220 ctxt->input->buf->raw);
3221 if (nbchars < 0) {
3222 ctxt->errNo = XML_ERR_INVALID_ENCODING;
3223 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3224 ctxt->sax->error(ctxt->userData,
3225 "htmlCheckEncoding: encoder error\n");
3226 }
3227 ctxt->input->base =
3228 ctxt->input->cur = ctxt->input->buf->buffer->content;
3229 }
3230 }
3231}
3232
3233/**
3234 * htmlCheckMeta:
3235 * @ctxt: an HTML parser context
3236 * @atts: the attributes values
3237 *
3238 * Checks an attributes from a Meta tag
3239 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003240static void
Owen Taylor3473f882001-02-23 17:55:21 +00003241htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3242 int i;
3243 const xmlChar *att, *value;
3244 int http = 0;
3245 const xmlChar *content = NULL;
3246
3247 if ((ctxt == NULL) || (atts == NULL))
3248 return;
3249
3250 i = 0;
3251 att = atts[i++];
3252 while (att != NULL) {
3253 value = atts[i++];
3254 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3255 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3256 http = 1;
3257 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3258 content = value;
3259 att = atts[i++];
3260 }
3261 if ((http) && (content != NULL))
3262 htmlCheckEncoding(ctxt, content);
3263
3264}
3265
3266/**
3267 * htmlParseStartTag:
3268 * @ctxt: an HTML parser context
3269 *
3270 * parse a start of tag either for rule element or
3271 * EmptyElement. In both case we don't parse the tag closing chars.
3272 *
3273 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3274 *
3275 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3276 *
3277 * With namespace:
3278 *
3279 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3280 *
3281 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3282 *
3283 */
3284
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003285static void
Owen Taylor3473f882001-02-23 17:55:21 +00003286htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003287 const xmlChar *name;
3288 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003289 xmlChar *attvalue;
3290 const xmlChar **atts = NULL;
3291 int nbatts = 0;
3292 int maxatts = 0;
3293 int meta = 0;
3294 int i;
3295
3296 if (CUR != '<') return;
3297 NEXT;
3298
3299 GROW;
3300 name = htmlParseHTMLName(ctxt);
3301 if (name == NULL) {
3302 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3303 ctxt->sax->error(ctxt->userData,
3304 "htmlParseStartTag: invalid element name\n");
3305 ctxt->wellFormed = 0;
3306 /* Dump the bogus tag like browsers do */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003307 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003308 NEXT;
3309 return;
3310 }
3311 if (xmlStrEqual(name, BAD_CAST"meta"))
3312 meta = 1;
3313
3314 /*
3315 * Check for auto-closure of HTML elements.
3316 */
3317 htmlAutoClose(ctxt, name);
3318
3319 /*
3320 * Check for implied HTML elements.
3321 */
3322 htmlCheckImplied(ctxt, name);
3323
3324 /*
3325 * Avoid html at any level > 0, head at any level != 1
3326 * or any attempt to recurse body
3327 */
3328 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3329 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3330 ctxt->sax->error(ctxt->userData,
3331 "htmlParseStartTag: misplaced <html> tag\n");
3332 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003333 return;
3334 }
3335 if ((ctxt->nameNr != 1) &&
3336 (xmlStrEqual(name, BAD_CAST"head"))) {
3337 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3338 ctxt->sax->error(ctxt->userData,
3339 "htmlParseStartTag: misplaced <head> tag\n");
3340 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003341 return;
3342 }
3343 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003344 int indx;
3345 for (indx = 0;indx < ctxt->nameNr;indx++) {
3346 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003347 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3348 ctxt->sax->error(ctxt->userData,
3349 "htmlParseStartTag: misplaced <body> tag\n");
3350 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003351 return;
3352 }
3353 }
3354 }
3355
3356 /*
3357 * Now parse the attributes, it ends up with the ending
3358 *
3359 * (S Attribute)* S?
3360 */
3361 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003362 while ((IS_CHAR((unsigned int) CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003363 (CUR != '>') &&
3364 ((CUR != '/') || (NXT(1) != '>'))) {
3365 long cons = ctxt->nbChars;
3366
3367 GROW;
3368 attname = htmlParseAttribute(ctxt, &attvalue);
3369 if (attname != NULL) {
3370
3371 /*
3372 * Well formedness requires at most one declaration of an attribute
3373 */
3374 for (i = 0; i < nbatts;i += 2) {
3375 if (xmlStrEqual(atts[i], attname)) {
3376 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3377 ctxt->sax->error(ctxt->userData,
3378 "Attribute %s redefined\n",
3379 attname);
3380 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003381 if (attvalue != NULL)
3382 xmlFree(attvalue);
3383 goto failed;
3384 }
3385 }
3386
3387 /*
3388 * Add the pair to atts
3389 */
3390 if (atts == NULL) {
3391 maxatts = 10;
3392 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3393 if (atts == NULL) {
3394 xmlGenericError(xmlGenericErrorContext,
3395 "malloc of %ld byte failed\n",
3396 maxatts * (long)sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003397 return;
3398 }
3399 } else if (nbatts + 4 > maxatts) {
3400 maxatts *= 2;
3401 atts = (const xmlChar **) xmlRealloc((void *) atts,
3402 maxatts * sizeof(xmlChar *));
3403 if (atts == NULL) {
3404 xmlGenericError(xmlGenericErrorContext,
3405 "realloc of %ld byte failed\n",
3406 maxatts * (long)sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003407 return;
3408 }
3409 }
3410 atts[nbatts++] = attname;
3411 atts[nbatts++] = attvalue;
3412 atts[nbatts] = NULL;
3413 atts[nbatts + 1] = NULL;
3414 }
3415 else {
3416 /* Dump the bogus attribute string up to the next blank or
3417 * the end of the tag. */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003418 while ((IS_CHAR((unsigned int) CUR)) &&
3419 !(IS_BLANK(CUR)) && (CUR != '>') &&
3420 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003421 NEXT;
3422 }
3423
3424failed:
3425 SKIP_BLANKS;
3426 if (cons == ctxt->nbChars) {
3427 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3428 ctxt->sax->error(ctxt->userData,
3429 "htmlParseStartTag: problem parsing attributes\n");
3430 ctxt->wellFormed = 0;
3431 break;
3432 }
3433 }
3434
3435 /*
3436 * Handle specific association to the META tag
3437 */
3438 if (meta)
3439 htmlCheckMeta(ctxt, atts);
3440
3441 /*
3442 * SAX: Start of Element !
3443 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003444 htmlnamePush(ctxt, name);
Owen Taylor3473f882001-02-23 17:55:21 +00003445#ifdef DEBUG
3446 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3447#endif
3448 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3449 ctxt->sax->startElement(ctxt->userData, name, atts);
3450
3451 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003452 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003453 if (atts[i] != NULL)
3454 xmlFree((xmlChar *) atts[i]);
3455 }
3456 xmlFree((void *) atts);
3457 }
Owen Taylor3473f882001-02-23 17:55:21 +00003458}
3459
3460/**
3461 * htmlParseEndTag:
3462 * @ctxt: an HTML parser context
3463 *
3464 * parse an end of tag
3465 *
3466 * [42] ETag ::= '</' Name S? '>'
3467 *
3468 * With namespace
3469 *
3470 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003471 *
3472 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003473 */
3474
Daniel Veillardf420ac52001-07-04 16:04:09 +00003475static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003476htmlParseEndTag(htmlParserCtxtPtr ctxt)
3477{
3478 const xmlChar *name;
3479 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003480 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003481
3482 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003483 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3484 ctxt->sax->error(ctxt->userData,
3485 "htmlParseEndTag: '</' not found\n");
3486 ctxt->wellFormed = 0;
3487 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003488 }
3489 SKIP(2);
3490
3491 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003492 if (name == NULL)
3493 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003494
3495 /*
3496 * We should definitely be at the ending "S? '>'" part
3497 */
3498 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003499 if ((!IS_CHAR((unsigned int) CUR)) || (CUR != '>')) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003500 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3501 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3502 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003503 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003504 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003505
3506 /*
3507 * If the name read is not one of the element in the parsing stack
3508 * then return, it's just an error.
3509 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003510 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3511 if (xmlStrEqual(name, ctxt->nameTab[i]))
3512 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003513 }
3514 if (i < 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003515 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3516 ctxt->sax->error(ctxt->userData,
3517 "Unexpected end tag : %s\n", name);
3518 ctxt->wellFormed = 0;
3519 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003520 }
3521
3522
3523 /*
3524 * Check for auto-closure of HTML elements.
3525 */
3526
3527 htmlAutoCloseOnClose(ctxt, name);
3528
3529 /*
3530 * Well formedness constraints, opening and closing must match.
3531 * With the exception that the autoclose may have popped stuff out
3532 * of the stack.
3533 */
3534 if (!xmlStrEqual(name, ctxt->name)) {
3535#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003536 xmlGenericError(xmlGenericErrorContext,
3537 "End of tag %s: expecting %s\n", name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003538#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003539 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3540 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3541 ctxt->sax->error(ctxt->userData,
3542 "Opening and ending tag mismatch: %s and %s\n",
3543 name, ctxt->name);
3544 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003545 }
3546 }
3547
3548 /*
3549 * SAX: End of Tag
3550 */
3551 oldname = ctxt->name;
3552 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003553 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3554 ctxt->sax->endElement(ctxt->userData, name);
3555 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003556#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003557 if (oldname != NULL) {
3558 xmlGenericError(xmlGenericErrorContext,
3559 "End of tag %s: popping out %s\n", name,
3560 oldname);
3561 } else {
3562 xmlGenericError(xmlGenericErrorContext,
3563 "End of tag %s: stack empty !!!\n", name);
3564 }
Owen Taylor3473f882001-02-23 17:55:21 +00003565#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003566 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003567 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003568 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003569 }
3570
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003571 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003572}
3573
3574
3575/**
3576 * htmlParseReference:
3577 * @ctxt: an HTML parser context
3578 *
3579 * parse and handle entity references in content,
3580 * this will end-up in a call to character() since this is either a
3581 * CharRef, or a predefined entity.
3582 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003583static void
Owen Taylor3473f882001-02-23 17:55:21 +00003584htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003585 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003586 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003587 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003588 if (CUR != '&') return;
3589
3590 if (NXT(1) == '#') {
3591 unsigned int c;
3592 int bits, i = 0;
3593
3594 c = htmlParseCharRef(ctxt);
3595 if (c == 0)
3596 return;
3597
3598 if (c < 0x80) { out[i++]= c; bits= -6; }
3599 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3600 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3601 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3602
3603 for ( ; bits >= 0; bits-= 6) {
3604 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3605 }
3606 out[i] = 0;
3607
3608 htmlCheckParagraph(ctxt);
3609 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3610 ctxt->sax->characters(ctxt->userData, out, i);
3611 } else {
3612 ent = htmlParseEntityRef(ctxt, &name);
3613 if (name == NULL) {
3614 htmlCheckParagraph(ctxt);
3615 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3616 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3617 return;
3618 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003619 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003620 htmlCheckParagraph(ctxt);
3621 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3622 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3623 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3624 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3625 }
3626 } else {
3627 unsigned int c;
3628 int bits, i = 0;
3629
3630 c = ent->value;
3631 if (c < 0x80)
3632 { out[i++]= c; bits= -6; }
3633 else if (c < 0x800)
3634 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3635 else if (c < 0x10000)
3636 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3637 else
3638 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3639
3640 for ( ; bits >= 0; bits-= 6) {
3641 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3642 }
3643 out[i] = 0;
3644
3645 htmlCheckParagraph(ctxt);
3646 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3647 ctxt->sax->characters(ctxt->userData, out, i);
3648 }
Owen Taylor3473f882001-02-23 17:55:21 +00003649 }
3650}
3651
3652/**
3653 * htmlParseContent:
3654 * @ctxt: an HTML parser context
3655 * @name: the node name
3656 *
3657 * Parse a content: comment, sub-element, reference or text.
3658 *
3659 */
3660
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003661static void
Owen Taylor3473f882001-02-23 17:55:21 +00003662htmlParseContent(htmlParserCtxtPtr ctxt) {
3663 xmlChar *currentNode;
3664 int depth;
3665
3666 currentNode = xmlStrdup(ctxt->name);
3667 depth = ctxt->nameNr;
3668 while (1) {
3669 long cons = ctxt->nbChars;
3670
3671 GROW;
3672 /*
3673 * Our tag or one of it's parent or children is ending.
3674 */
3675 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003676 if (htmlParseEndTag(ctxt) &&
3677 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3678 if (currentNode != NULL)
3679 xmlFree(currentNode);
3680 return;
3681 }
3682 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003683 }
3684
3685 /*
3686 * Has this node been popped out during parsing of
3687 * the next element
3688 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003689 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3690 (!xmlStrEqual(currentNode, ctxt->name)))
3691 {
Owen Taylor3473f882001-02-23 17:55:21 +00003692 if (currentNode != NULL) xmlFree(currentNode);
3693 return;
3694 }
3695
Daniel Veillardf9533d12001-03-03 10:04:57 +00003696 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3697 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003698 /*
3699 * Handle SCRIPT/STYLE separately
3700 */
3701 htmlParseScript(ctxt);
3702 } else {
3703 /*
3704 * Sometimes DOCTYPE arrives in the middle of the document
3705 */
3706 if ((CUR == '<') && (NXT(1) == '!') &&
3707 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3708 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3709 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3710 (UPP(8) == 'E')) {
3711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3712 ctxt->sax->error(ctxt->userData,
3713 "Misplaced DOCTYPE declaration\n");
3714 ctxt->wellFormed = 0;
3715 htmlParseDocTypeDecl(ctxt);
3716 }
3717
3718 /*
3719 * First case : a comment
3720 */
3721 if ((CUR == '<') && (NXT(1) == '!') &&
3722 (NXT(2) == '-') && (NXT(3) == '-')) {
3723 htmlParseComment(ctxt);
3724 }
3725
3726 /*
3727 * Second case : a sub-element.
3728 */
3729 else if (CUR == '<') {
3730 htmlParseElement(ctxt);
3731 }
3732
3733 /*
3734 * Third case : a reference. If if has not been resolved,
3735 * parsing returns it's Name, create the node
3736 */
3737 else if (CUR == '&') {
3738 htmlParseReference(ctxt);
3739 }
3740
3741 /*
3742 * Fourth : end of the resource
3743 */
3744 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003745 htmlAutoCloseOnEnd(ctxt);
3746 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003747 }
3748
3749 /*
3750 * Last case, text. Note that References are handled directly.
3751 */
3752 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003753 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003754 }
3755
3756 if (cons == ctxt->nbChars) {
3757 if (ctxt->node != NULL) {
3758 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3759 ctxt->sax->error(ctxt->userData,
3760 "detected an error in element content\n");
3761 ctxt->wellFormed = 0;
3762 }
3763 break;
3764 }
3765 }
3766 GROW;
3767 }
3768 if (currentNode != NULL) xmlFree(currentNode);
3769}
3770
3771/**
3772 * htmlParseElement:
3773 * @ctxt: an HTML parser context
3774 *
3775 * parse an HTML element, this is highly recursive
3776 *
3777 * [39] element ::= EmptyElemTag | STag content ETag
3778 *
3779 * [41] Attribute ::= Name Eq AttValue
3780 */
3781
3782void
3783htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003784 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003785 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003786 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003787 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003788 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003789 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003790 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003791
3792 /* Capture start position */
3793 if (ctxt->record_info) {
3794 node_info.begin_pos = ctxt->input->consumed +
3795 (CUR_PTR - ctxt->input->base);
3796 node_info.begin_line = ctxt->input->line;
3797 }
3798
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003799 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003800 htmlParseStartTag(ctxt);
3801 name = ctxt->name;
3802#ifdef DEBUG
3803 if (oldname == NULL)
3804 xmlGenericError(xmlGenericErrorContext,
3805 "Start of element %s\n", name);
3806 else if (name == NULL)
3807 xmlGenericError(xmlGenericErrorContext,
3808 "Start of element failed, was %s\n", oldname);
3809 else
3810 xmlGenericError(xmlGenericErrorContext,
3811 "Start of element %s, was %s\n", name, oldname);
3812#endif
3813 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3814 (name == NULL)) {
3815 if (CUR == '>')
3816 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003817 return;
3818 }
Owen Taylor3473f882001-02-23 17:55:21 +00003819
3820 /*
3821 * Lookup the info for that element.
3822 */
3823 info = htmlTagLookup(name);
3824 if (info == NULL) {
3825 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3826 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3827 name);
3828 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003829 }
3830
3831 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003832 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003833 */
3834 if ((CUR == '/') && (NXT(1) == '>')) {
3835 SKIP(2);
3836 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3837 ctxt->sax->endElement(ctxt->userData, name);
3838 oldname = htmlnamePop(ctxt);
3839#ifdef DEBUG
3840 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3841#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003842 return;
3843 }
3844
3845 if (CUR == '>') {
3846 NEXT;
3847 } else {
3848 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3849 ctxt->sax->error(ctxt->userData,
3850 "Couldn't find end of Start Tag %s\n",
3851 name);
3852 ctxt->wellFormed = 0;
3853
3854 /*
3855 * end of parsing of this node.
3856 */
3857 if (xmlStrEqual(name, ctxt->name)) {
3858 nodePop(ctxt);
3859 oldname = htmlnamePop(ctxt);
3860#ifdef DEBUG
3861 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3862#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003863 }
3864
3865 /*
3866 * Capture end position and add node
3867 */
3868 if ( currentNode != NULL && ctxt->record_info ) {
3869 node_info.end_pos = ctxt->input->consumed +
3870 (CUR_PTR - ctxt->input->base);
3871 node_info.end_line = ctxt->input->line;
3872 node_info.node = ctxt->node;
3873 xmlParserAddNodeInfo(ctxt, &node_info);
3874 }
3875 return;
3876 }
3877
3878 /*
3879 * Check for an Empty Element from DTD definition
3880 */
3881 if ((info != NULL) && (info->empty)) {
3882 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3883 ctxt->sax->endElement(ctxt->userData, name);
3884 oldname = htmlnamePop(ctxt);
3885#ifdef DEBUG
3886 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3887#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003888 return;
3889 }
3890
3891 /*
3892 * Parse the content of the element:
3893 */
3894 currentNode = xmlStrdup(ctxt->name);
3895 depth = ctxt->nameNr;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003896 while (IS_CHAR((unsigned int) CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003897 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003898 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003899 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003900 if (ctxt->nameNr < depth) break;
3901 }
3902
Owen Taylor3473f882001-02-23 17:55:21 +00003903 /*
3904 * Capture end position and add node
3905 */
3906 if ( currentNode != NULL && ctxt->record_info ) {
3907 node_info.end_pos = ctxt->input->consumed +
3908 (CUR_PTR - ctxt->input->base);
3909 node_info.end_line = ctxt->input->line;
3910 node_info.node = ctxt->node;
3911 xmlParserAddNodeInfo(ctxt, &node_info);
3912 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00003913 if (!IS_CHAR((unsigned int) CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003914 htmlAutoCloseOnEnd(ctxt);
3915 }
3916
Owen Taylor3473f882001-02-23 17:55:21 +00003917 if (currentNode != NULL)
3918 xmlFree(currentNode);
3919}
3920
3921/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003922 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003923 * @ctxt: an HTML parser context
3924 *
3925 * parse an HTML document (and build a tree if using the standard SAX
3926 * interface).
3927 *
3928 * Returns 0, -1 in case of error. the parser context is augmented
3929 * as a result of the parsing.
3930 */
3931
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003932int
Owen Taylor3473f882001-02-23 17:55:21 +00003933htmlParseDocument(htmlParserCtxtPtr ctxt) {
3934 xmlDtdPtr dtd;
3935
Daniel Veillardd0463562001-10-13 09:15:48 +00003936 xmlInitParser();
3937
Owen Taylor3473f882001-02-23 17:55:21 +00003938 htmlDefaultSAXHandlerInit();
3939 ctxt->html = 1;
3940
3941 GROW;
3942 /*
3943 * SAX: beginning of the document processing.
3944 */
3945 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3946 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3947
3948 /*
3949 * Wipe out everything which is before the first '<'
3950 */
3951 SKIP_BLANKS;
3952 if (CUR == 0) {
3953 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3954 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3955 ctxt->wellFormed = 0;
3956 }
3957
3958 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3959 ctxt->sax->startDocument(ctxt->userData);
3960
3961
3962 /*
3963 * Parse possible comments before any content
3964 */
3965 while ((CUR == '<') && (NXT(1) == '!') &&
3966 (NXT(2) == '-') && (NXT(3) == '-')) {
3967 htmlParseComment(ctxt);
3968 SKIP_BLANKS;
3969 }
3970
3971
3972 /*
3973 * Then possibly doc type declaration(s) and more Misc
3974 * (doctypedecl Misc*)?
3975 */
3976 if ((CUR == '<') && (NXT(1) == '!') &&
3977 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3978 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3979 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3980 (UPP(8) == 'E')) {
3981 htmlParseDocTypeDecl(ctxt);
3982 }
3983 SKIP_BLANKS;
3984
3985 /*
3986 * Parse possible comments before any content
3987 */
3988 while ((CUR == '<') && (NXT(1) == '!') &&
3989 (NXT(2) == '-') && (NXT(3) == '-')) {
3990 htmlParseComment(ctxt);
3991 SKIP_BLANKS;
3992 }
3993
3994 /*
3995 * Time to start parsing the tree itself
3996 */
3997 htmlParseContent(ctxt);
3998
3999 /*
4000 * autoclose
4001 */
4002 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004003 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004004
4005
4006 /*
4007 * SAX: end of the document processing.
4008 */
4009 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4010 ctxt->sax->endDocument(ctxt->userData);
4011
4012 if (ctxt->myDoc != NULL) {
4013 dtd = xmlGetIntSubset(ctxt->myDoc);
4014 if (dtd == NULL)
4015 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004016 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004017 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4018 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4019 }
4020 if (! ctxt->wellFormed) return(-1);
4021 return(0);
4022}
4023
4024
4025/************************************************************************
4026 * *
4027 * Parser contexts handling *
4028 * *
4029 ************************************************************************/
4030
4031/**
4032 * xmlInitParserCtxt:
4033 * @ctxt: an HTML parser context
4034 *
4035 * Initialize a parser context
4036 */
4037
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004038static void
Owen Taylor3473f882001-02-23 17:55:21 +00004039htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4040{
4041 htmlSAXHandler *sax;
4042
4043 if (ctxt == NULL) return;
4044 memset(ctxt, 0, sizeof(htmlParserCtxt));
4045
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004046 ctxt->dict = xmlDictCreate();
4047 if (ctxt->dict == NULL) {
4048 xmlGenericError(xmlGenericErrorContext,
4049 "xmlInitParserCtxt: out of memory\n");
4050 return;
4051 }
Owen Taylor3473f882001-02-23 17:55:21 +00004052 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4053 if (sax == NULL) {
4054 xmlGenericError(xmlGenericErrorContext,
4055 "htmlInitParserCtxt: out of memory\n");
4056 }
4057 else
4058 memset(sax, 0, sizeof(htmlSAXHandler));
4059
4060 /* Allocate the Input stack */
4061 ctxt->inputTab = (htmlParserInputPtr *)
4062 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4063 if (ctxt->inputTab == NULL) {
4064 xmlGenericError(xmlGenericErrorContext,
4065 "htmlInitParserCtxt: out of memory\n");
4066 ctxt->inputNr = 0;
4067 ctxt->inputMax = 0;
4068 ctxt->input = NULL;
4069 return;
4070 }
4071 ctxt->inputNr = 0;
4072 ctxt->inputMax = 5;
4073 ctxt->input = NULL;
4074 ctxt->version = NULL;
4075 ctxt->encoding = NULL;
4076 ctxt->standalone = -1;
4077 ctxt->instate = XML_PARSER_START;
4078
4079 /* Allocate the Node stack */
4080 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4081 if (ctxt->nodeTab == NULL) {
4082 xmlGenericError(xmlGenericErrorContext,
4083 "htmlInitParserCtxt: out of memory\n");
4084 ctxt->nodeNr = 0;
4085 ctxt->nodeMax = 0;
4086 ctxt->node = NULL;
4087 ctxt->inputNr = 0;
4088 ctxt->inputMax = 0;
4089 ctxt->input = NULL;
4090 return;
4091 }
4092 ctxt->nodeNr = 0;
4093 ctxt->nodeMax = 10;
4094 ctxt->node = NULL;
4095
4096 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004097 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004098 if (ctxt->nameTab == NULL) {
4099 xmlGenericError(xmlGenericErrorContext,
4100 "htmlInitParserCtxt: out of memory\n");
4101 ctxt->nameNr = 0;
4102 ctxt->nameMax = 10;
4103 ctxt->name = NULL;
4104 ctxt->nodeNr = 0;
4105 ctxt->nodeMax = 0;
4106 ctxt->node = NULL;
4107 ctxt->inputNr = 0;
4108 ctxt->inputMax = 0;
4109 ctxt->input = NULL;
4110 return;
4111 }
4112 ctxt->nameNr = 0;
4113 ctxt->nameMax = 10;
4114 ctxt->name = NULL;
4115
Daniel Veillard092643b2003-09-25 14:29:29 +00004116 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004117 else {
4118 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004119 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004120 }
4121 ctxt->userData = ctxt;
4122 ctxt->myDoc = NULL;
4123 ctxt->wellFormed = 1;
4124 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004125 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004126 ctxt->html = 1;
4127 ctxt->record_info = 0;
4128 ctxt->validate = 0;
4129 ctxt->nbChars = 0;
4130 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004131 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004132 xmlInitNodeInfoSeq(&ctxt->node_seq);
4133}
4134
4135/**
4136 * htmlFreeParserCtxt:
4137 * @ctxt: an HTML parser context
4138 *
4139 * Free all the memory used by a parser context. However the parsed
4140 * document in ctxt->myDoc is not freed.
4141 */
4142
4143void
4144htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4145{
4146 xmlFreeParserCtxt(ctxt);
4147}
4148
4149/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004150 * htmlNewParserCtxt:
4151 *
4152 * Allocate and initialize a new parser context.
4153 *
4154 * Returns the xmlParserCtxtPtr or NULL
4155 */
4156
4157static htmlParserCtxtPtr
4158htmlNewParserCtxt(void)
4159{
4160 xmlParserCtxtPtr ctxt;
4161
4162 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4163 if (ctxt == NULL) {
4164 xmlGenericError(xmlGenericErrorContext,
4165 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004166 return(NULL);
4167 }
4168 memset(ctxt, 0, sizeof(xmlParserCtxt));
4169 htmlInitParserCtxt(ctxt);
4170 return(ctxt);
4171}
4172
4173/**
4174 * htmlCreateMemoryParserCtxt:
4175 * @buffer: a pointer to a char array
4176 * @size: the size of the array
4177 *
4178 * Create a parser context for an HTML in-memory document.
4179 *
4180 * Returns the new parser context or NULL
4181 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004182htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004183htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4184 xmlParserCtxtPtr ctxt;
4185 xmlParserInputPtr input;
4186 xmlParserInputBufferPtr buf;
4187
4188 if (buffer == NULL)
4189 return(NULL);
4190 if (size <= 0)
4191 return(NULL);
4192
4193 ctxt = htmlNewParserCtxt();
4194 if (ctxt == NULL)
4195 return(NULL);
4196
4197 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4198 if (buf == NULL) return(NULL);
4199
4200 input = xmlNewInputStream(ctxt);
4201 if (input == NULL) {
4202 xmlFreeParserCtxt(ctxt);
4203 return(NULL);
4204 }
4205
4206 input->filename = NULL;
4207 input->buf = buf;
4208 input->base = input->buf->buffer->content;
4209 input->cur = input->buf->buffer->content;
4210 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4211
4212 inputPush(ctxt, input);
4213 return(ctxt);
4214}
4215
4216/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004217 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004218 * @cur: a pointer to an array of xmlChar
4219 * @encoding: a free form C string describing the HTML document encoding, or NULL
4220 *
4221 * Create a parser context for an HTML document.
4222 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004223 * TODO: check the need to add encoding handling there
4224 *
Owen Taylor3473f882001-02-23 17:55:21 +00004225 * Returns the new parser context or NULL
4226 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004227static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004228htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004229 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004230 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004231
Daniel Veillard1d995272002-07-22 16:43:32 +00004232 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004233 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004234 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004235 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4236
4237 if (encoding != NULL) {
4238 xmlCharEncoding enc;
4239 xmlCharEncodingHandlerPtr handler;
4240
4241 if (ctxt->input->encoding != NULL)
4242 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004243 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004244
4245 enc = xmlParseCharEncoding(encoding);
4246 /*
4247 * registered set of known encodings
4248 */
4249 if (enc != XML_CHAR_ENCODING_ERROR) {
4250 xmlSwitchEncoding(ctxt, enc);
4251 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4252 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4253 ctxt->sax->error(ctxt->userData,
4254 "Unsupported encoding %s\n", encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004255 }
4256 } else {
4257 /*
4258 * fallback for unknown encodings
4259 */
4260 handler = xmlFindCharEncodingHandler((const char *) encoding);
4261 if (handler != NULL) {
4262 xmlSwitchToEncoding(ctxt, handler);
4263 } else {
4264 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
4265 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4266 ctxt->sax->error(ctxt->userData,
4267 "Unsupported encoding %s\n", encoding);
4268 }
4269 }
4270 }
4271 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004272}
4273
Daniel Veillard73b013f2003-09-30 12:36:01 +00004274#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004275/************************************************************************
4276 * *
4277 * Progressive parsing interfaces *
4278 * *
4279 ************************************************************************/
4280
4281/**
4282 * htmlParseLookupSequence:
4283 * @ctxt: an HTML parser context
4284 * @first: the first char to lookup
4285 * @next: the next char to lookup or zero
4286 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004287 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004288 *
4289 * Try to find if a sequence (first, next, third) or just (first next) or
4290 * (first) is available in the input stream.
4291 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4292 * to avoid rescanning sequences of bytes, it DOES change the state of the
4293 * parser, do not use liberally.
4294 * This is basically similar to xmlParseLookupSequence()
4295 *
4296 * Returns the index to the current parsing point if the full sequence
4297 * is available, -1 otherwise.
4298 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004299static int
Owen Taylor3473f882001-02-23 17:55:21 +00004300htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004301 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004302 int base, len;
4303 htmlParserInputPtr in;
4304 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004305 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004306
4307 in = ctxt->input;
4308 if (in == NULL) return(-1);
4309 base = in->cur - in->base;
4310 if (base < 0) return(-1);
4311 if (ctxt->checkIndex > base)
4312 base = ctxt->checkIndex;
4313 if (in->buf == NULL) {
4314 buf = in->base;
4315 len = in->length;
4316 } else {
4317 buf = in->buf->buffer->content;
4318 len = in->buf->buffer->use;
4319 }
4320 /* take into account the sequence length */
4321 if (third) len -= 2;
4322 else if (next) len --;
4323 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004324 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004325 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4326 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4327 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004328 /* do not increment past <! - some people use <!--> */
4329 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004330 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004331 }
4332 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004333 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004334 return(-1);
4335 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4336 (buf[base + 2] == '>')) {
4337 incomment = 0;
4338 base += 2;
4339 }
4340 continue;
4341 }
Owen Taylor3473f882001-02-23 17:55:21 +00004342 if (buf[base] == first) {
4343 if (third != 0) {
4344 if ((buf[base + 1] != next) ||
4345 (buf[base + 2] != third)) continue;
4346 } else if (next != 0) {
4347 if (buf[base + 1] != next) continue;
4348 }
4349 ctxt->checkIndex = 0;
4350#ifdef DEBUG_PUSH
4351 if (next == 0)
4352 xmlGenericError(xmlGenericErrorContext,
4353 "HPP: lookup '%c' found at %d\n",
4354 first, base);
4355 else if (third == 0)
4356 xmlGenericError(xmlGenericErrorContext,
4357 "HPP: lookup '%c%c' found at %d\n",
4358 first, next, base);
4359 else
4360 xmlGenericError(xmlGenericErrorContext,
4361 "HPP: lookup '%c%c%c' found at %d\n",
4362 first, next, third, base);
4363#endif
4364 return(base - (in->cur - in->base));
4365 }
4366 }
4367 ctxt->checkIndex = base;
4368#ifdef DEBUG_PUSH
4369 if (next == 0)
4370 xmlGenericError(xmlGenericErrorContext,
4371 "HPP: lookup '%c' failed\n", first);
4372 else if (third == 0)
4373 xmlGenericError(xmlGenericErrorContext,
4374 "HPP: lookup '%c%c' failed\n", first, next);
4375 else
4376 xmlGenericError(xmlGenericErrorContext,
4377 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4378#endif
4379 return(-1);
4380}
4381
4382/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.
 *
 * Runs a loop over the state machine keyed on ctxt->instate, working on
 * the bytes buffered in the current input. Unless @terminate is set, a
 * construct is only handed to its parsing routine once
 * htmlParseLookupSequence() has found its end marker in the buffered
 * data; otherwise the function leaves through the "done" label.
 * At "done", end-of-document processing is performed when the input is
 * exhausted on the last chunk, and a default internal subset is created
 * for ctxt->myDoc if it has none.
 *
 * Returns the local ret, which is initialised to zero and never
 * modified in this function.
 * NOTE(review): this means zero is returned even when input was
 * consumed - confirm that no caller relies on the return value.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004391static int
Owen Taylor3473f882001-02-23 17:55:21 +00004392htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4393 int ret = 0;
4394 htmlParserInputPtr in;
4395 int avail = 0;
4396 xmlChar cur, next;
4397
    /* With DEBUG_PUSH defined, trace the state the parser is in on entry. */
4398#ifdef DEBUG_PUSH
4399 switch (ctxt->instate) {
4400 case XML_PARSER_EOF:
4401 xmlGenericError(xmlGenericErrorContext,
4402 "HPP: try EOF\n"); break;
4403 case XML_PARSER_START:
4404 xmlGenericError(xmlGenericErrorContext,
4405 "HPP: try START\n"); break;
4406 case XML_PARSER_MISC:
4407 xmlGenericError(xmlGenericErrorContext,
4408 "HPP: try MISC\n");break;
4409 case XML_PARSER_COMMENT:
4410 xmlGenericError(xmlGenericErrorContext,
4411 "HPP: try COMMENT\n");break;
4412 case XML_PARSER_PROLOG:
4413 xmlGenericError(xmlGenericErrorContext,
4414 "HPP: try PROLOG\n");break;
4415 case XML_PARSER_START_TAG:
4416 xmlGenericError(xmlGenericErrorContext,
4417 "HPP: try START_TAG\n");break;
4418 case XML_PARSER_CONTENT:
4419 xmlGenericError(xmlGenericErrorContext,
4420 "HPP: try CONTENT\n");break;
4421 case XML_PARSER_CDATA_SECTION:
4422 xmlGenericError(xmlGenericErrorContext,
4423 "HPP: try CDATA_SECTION\n");break;
4424 case XML_PARSER_END_TAG:
4425 xmlGenericError(xmlGenericErrorContext,
4426 "HPP: try END_TAG\n");break;
4427 case XML_PARSER_ENTITY_DECL:
4428 xmlGenericError(xmlGenericErrorContext,
4429 "HPP: try ENTITY_DECL\n");break;
4430 case XML_PARSER_ENTITY_VALUE:
4431 xmlGenericError(xmlGenericErrorContext,
4432 "HPP: try ENTITY_VALUE\n");break;
4433 case XML_PARSER_ATTRIBUTE_VALUE:
4434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: try ATTRIBUTE_VALUE\n");break;
4436 case XML_PARSER_DTD:
4437 xmlGenericError(xmlGenericErrorContext,
4438 "HPP: try DTD\n");break;
4439 case XML_PARSER_EPILOG:
4440 xmlGenericError(xmlGenericErrorContext,
4441 "HPP: try EPILOG\n");break;
4442 case XML_PARSER_PI:
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: try PI\n");break;
4445 case XML_PARSER_SYSTEM_LITERAL:
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: try SYSTEM_LITERAL\n");break;
4448 }
4449#endif
4450
4451 while (1) {
4452
    /*
     * Each pass re-reads the current input and computes avail, the
     * number of bytes between the read cursor and the end of the
     * buffered data.
     */
4453 in = ctxt->input;
4454 if (in == NULL) break;
4455 if (in->buf == NULL)
4456 avail = in->length - (in->cur - in->base);
4457 else
4458 avail = in->buf->buffer->use - (in->cur - in->base);
    /*
     * Last chunk and nothing left to read: call htmlAutoCloseOnEnd()
     * and, if the name stack is empty and EOF was not reached yet,
     * switch to EOF and report endDocument.
     */
4459 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004460 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004461 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4462 /*
4463 * SAX: end of the document processing.
4464 */
4465 ctxt->instate = XML_PARSER_EOF;
4466 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4467 ctxt->sax->endDocument(ctxt->userData);
4468 }
4469 }
4470 if (avail < 1)
4471 goto done;
    /*
     * A zero byte is not dispatched to the state machine: SKIP(1) is
     * applied and the loop starts over.
     */
Daniel Veillard45269b82003-04-22 13:21:57 +00004472 cur = in->cur[0];
4473 if (cur == 0) {
4474 SKIP(1);
4475 continue;
4476 }
4477
Owen Taylor3473f882001-02-23 17:55:21 +00004478 switch (ctxt->instate) {
4479 case XML_PARSER_EOF:
4480 /*
4481 * Document parsing is done !
4482 */
4483 goto done;
4484 case XML_PARSER_START:
4485 /*
4486 * Very first chars read from the document flow.
4487 */
4488 cur = in->cur[0];
4489 if (IS_BLANK(cur)) {
4490 SKIP_BLANKS;
4491 if (in->buf == NULL)
4492 avail = in->length - (in->cur - in->base);
4493 else
4494 avail = in->buf->buffer->use - (in->cur - in->base);
4495 }
    /* Announce the locator and the start of the document to the SAX handler. */
4496 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4497 ctxt->sax->setDocumentLocator(ctxt->userData,
4498 &xmlDefaultSAXLocator);
4499 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4500 (!ctxt->disableSAX))
4501 ctxt->sax->startDocument(ctxt->userData);
4502
    /*
     * A leading "<!DOCTYPE" (letters compared through UPP) is parsed
     * here and leads to PROLOG; anything else leads to MISC.
     */
4503 cur = in->cur[0];
4504 next = in->cur[1];
4505 if ((cur == '<') && (next == '!') &&
4506 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4507 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4508 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4509 (UPP(8) == 'E')) {
4510 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004511 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004512 goto done;
4513#ifdef DEBUG_PUSH
4514 xmlGenericError(xmlGenericErrorContext,
4515 "HPP: Parsing internal subset\n");
4516#endif
4517 htmlParseDocTypeDecl(ctxt);
4518 ctxt->instate = XML_PARSER_PROLOG;
4519#ifdef DEBUG_PUSH
4520 xmlGenericError(xmlGenericErrorContext,
4521 "HPP: entering PROLOG\n");
4522#endif
4523 } else {
4524 ctxt->instate = XML_PARSER_MISC;
4525 }
    /*
     * NOTE(review): the trace below is emitted after both branches
     * above, including the one that set XML_PARSER_PROLOG.
     */
4526#ifdef DEBUG_PUSH
4527 xmlGenericError(xmlGenericErrorContext,
4528 "HPP: entering MISC\n");
4529#endif
4530 break;
4531 case XML_PARSER_MISC:
    /*
     * MISC: a comment keeps the state, a DOCTYPE declaration moves to
     * PROLOG, anything else moves to START_TAG.
     */
4532 SKIP_BLANKS;
4533 if (in->buf == NULL)
4534 avail = in->length - (in->cur - in->base);
4535 else
4536 avail = in->buf->buffer->use - (in->cur - in->base);
4537 if (avail < 2)
4538 goto done;
4539 cur = in->cur[0];
4540 next = in->cur[1];
4541 if ((cur == '<') && (next == '!') &&
4542 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4543 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004544 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004545 goto done;
4546#ifdef DEBUG_PUSH
4547 xmlGenericError(xmlGenericErrorContext,
4548 "HPP: Parsing Comment\n");
4549#endif
4550 htmlParseComment(ctxt);
4551 ctxt->instate = XML_PARSER_MISC;
4552 } else if ((cur == '<') && (next == '!') &&
4553 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4554 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4555 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4556 (UPP(8) == 'E')) {
4557 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004558 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004559 goto done;
4560#ifdef DEBUG_PUSH
4561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: Parsing internal subset\n");
4563#endif
4564 htmlParseDocTypeDecl(ctxt);
4565 ctxt->instate = XML_PARSER_PROLOG;
4566#ifdef DEBUG_PUSH
4567 xmlGenericError(xmlGenericErrorContext,
4568 "HPP: entering PROLOG\n");
4569#endif
    /*
     * "<!" with fewer than 9 bytes buffered: too short to hold a full
     * "<!DOCTYPE", so wait for more data.
     */
4570 } else if ((cur == '<') && (next == '!') &&
4571 (avail < 9)) {
4572 goto done;
4573 } else {
4574 ctxt->instate = XML_PARSER_START_TAG;
4575#ifdef DEBUG_PUSH
4576 xmlGenericError(xmlGenericErrorContext,
4577 "HPP: entering START_TAG\n");
4578#endif
4579 }
4580 break;
4581 case XML_PARSER_PROLOG:
    /*
     * PROLOG: a comment keeps the state, anything else moves to
     * START_TAG.
     */
4582 SKIP_BLANKS;
4583 if (in->buf == NULL)
4584 avail = in->length - (in->cur - in->base);
4585 else
4586 avail = in->buf->buffer->use - (in->cur - in->base);
4587 if (avail < 2)
4588 goto done;
4589 cur = in->cur[0];
4590 next = in->cur[1];
4591 if ((cur == '<') && (next == '!') &&
4592 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4593 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004594 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004595 goto done;
4596#ifdef DEBUG_PUSH
4597 xmlGenericError(xmlGenericErrorContext,
4598 "HPP: Parsing Comment\n");
4599#endif
4600 htmlParseComment(ctxt);
4601 ctxt->instate = XML_PARSER_PROLOG;
4602 } else if ((cur == '<') && (next == '!') &&
4603 (avail < 4)) {
4604 goto done;
4605 } else {
4606 ctxt->instate = XML_PARSER_START_TAG;
4607#ifdef DEBUG_PUSH
4608 xmlGenericError(xmlGenericErrorContext,
4609 "HPP: entering START_TAG\n");
4610#endif
4611 }
4612 break;
4613 case XML_PARSER_EPILOG:
    /*
     * EPILOG (entered from END_TAG when the name stack is empty):
     * blanks are given to htmlParseCharData(), comments are parsed,
     * any other content sets XML_ERR_DOCUMENT_END, clears wellFormed
     * and ends the document.
     */
4614 if (in->buf == NULL)
4615 avail = in->length - (in->cur - in->base);
4616 else
4617 avail = in->buf->buffer->use - (in->cur - in->base);
4618 if (avail < 1)
4619 goto done;
4620 cur = in->cur[0];
4621 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004622 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004623 goto done;
4624 }
4625 if (avail < 2)
4626 goto done;
4627 next = in->cur[1];
4628 if ((cur == '<') && (next == '!') &&
4629 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4630 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004631 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004632 goto done;
4633#ifdef DEBUG_PUSH
4634 xmlGenericError(xmlGenericErrorContext,
4635 "HPP: Parsing Comment\n");
4636#endif
4637 htmlParseComment(ctxt);
4638 ctxt->instate = XML_PARSER_EPILOG;
4639 } else if ((cur == '<') && (next == '!') &&
4640 (avail < 4)) {
4641 goto done;
4642 } else {
4643 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004644 ctxt->wellFormed = 0;
4645 ctxt->instate = XML_PARSER_EOF;
4646#ifdef DEBUG_PUSH
4647 xmlGenericError(xmlGenericErrorContext,
4648 "HPP: entering EOF\n");
4649#endif
4650 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4651 ctxt->sax->endDocument(ctxt->userData);
4652 goto done;
4653 }
4654 break;
4655 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004656 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004657 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004658 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004659
4660 if (avail < 2)
4661 goto done;
    /* Not a '<': there is no tag here, let the CONTENT state handle it. */
4662 cur = in->cur[0];
4663 if (cur != '<') {
4664 ctxt->instate = XML_PARSER_CONTENT;
4665#ifdef DEBUG_PUSH
4666 xmlGenericError(xmlGenericErrorContext,
4667 "HPP: entering CONTENT\n");
4668#endif
4669 break;
4670 }
    /* "</" is an end tag: hand over to the END_TAG state. */
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004671 if (in->cur[1] == '/') {
4672 ctxt->instate = XML_PARSER_END_TAG;
4673 ctxt->checkIndex = 0;
4674#ifdef DEBUG_PUSH
4675 xmlGenericError(xmlGenericErrorContext,
4676 "HPP: entering END_TAG\n");
4677#endif
4678 break;
4679 }
    /* Wait until a '>' is buffered, unless this is the last chunk. */
Owen Taylor3473f882001-02-23 17:55:21 +00004680 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004681 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004682 goto done;
4683
    /* Sample the top of the name stack before and after parsing the tag. */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004684 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004685 htmlParseStartTag(ctxt);
4686 name = ctxt->name;
4687#ifdef DEBUG
4688 if (oldname == NULL)
4689 xmlGenericError(xmlGenericErrorContext,
4690 "Start of element %s\n", name);
4691 else if (name == NULL)
4692 xmlGenericError(xmlGenericErrorContext,
4693 "Start of element failed, was %s\n",
4694 oldname);
4695 else
4696 xmlGenericError(xmlGenericErrorContext,
4697 "Start of element %s, was %s\n",
4698 name, oldname);
4699#endif
    /*
     * The name stack is unchanged (same depth and same top name) or
     * there is no current name: step over a pending '>' and stay in
     * this state.
     */
4700 if (((depth == ctxt->nameNr) &&
4701 (xmlStrEqual(oldname, ctxt->name))) ||
4702 (name == NULL)) {
4703 if (CUR == '>')
4704 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004705 break;
4706 }
Owen Taylor3473f882001-02-23 17:55:21 +00004707
4708 /*
4709 * Lookup the info for that element.
4710 */
4711 info = htmlTagLookup(name);
4712 if (info == NULL) {
4713 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4714 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4715 name);
4716 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004717 }
4718
4719 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004720 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004721 */
4722 if ((CUR == '/') && (NXT(1) == '>')) {
4723 SKIP(2);
4724 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4725 ctxt->sax->endElement(ctxt->userData, name);
4726 oldname = htmlnamePop(ctxt);
4727#ifdef DEBUG
4728 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4729 oldname);
4730#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004731 ctxt->instate = XML_PARSER_CONTENT;
4732#ifdef DEBUG_PUSH
4733 xmlGenericError(xmlGenericErrorContext,
4734 "HPP: entering CONTENT\n");
4735#endif
4736 break;
4737 }
4738
    /*
     * The tag must now end with '>'; otherwise report the error, pop
     * the element if it is still the current one, and go to CONTENT.
     */
4739 if (CUR == '>') {
4740 NEXT;
4741 } else {
4742 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4743 ctxt->sax->error(ctxt->userData,
4744 "Couldn't find end of Start Tag %s\n",
4745 name);
4746 ctxt->wellFormed = 0;
4747
4748 /*
4749 * end of parsing of this node.
4750 */
4751 if (xmlStrEqual(name, ctxt->name)) {
4752 nodePop(ctxt);
4753 oldname = htmlnamePop(ctxt);
4754#ifdef DEBUG
4755 xmlGenericError(xmlGenericErrorContext,
4756 "End of start tag problem: popping out %s\n", oldname);
4757#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004758 }
4759
4760 ctxt->instate = XML_PARSER_CONTENT;
4761#ifdef DEBUG_PUSH
4762 xmlGenericError(xmlGenericErrorContext,
4763 "HPP: entering CONTENT\n");
4764#endif
4765 break;
4766 }
4767
4768 /*
4769 * Check for an Empty Element from DTD definition
4770 */
4771 if ((info != NULL) && (info->empty)) {
4772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4773 ctxt->sax->endElement(ctxt->userData, name);
4774 oldname = htmlnamePop(ctxt);
4775#ifdef DEBUG
4776 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4777#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004778 }
4779 ctxt->instate = XML_PARSER_CONTENT;
4780#ifdef DEBUG_PUSH
4781 xmlGenericError(xmlGenericErrorContext,
4782 "HPP: entering CONTENT\n");
4783#endif
4784 break;
4785 }
4786 case XML_PARSER_CONTENT: {
4787 long cons;
4788 /*
4789 * Handle preparsed entities and charRef
4790 */
4791 if (ctxt->token != 0) {
4792 xmlChar chr[2] = { 0 , 0 } ;
4793
4794 chr[0] = (xmlChar) ctxt->token;
4795 htmlCheckParagraph(ctxt);
4796 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4797 ctxt->sax->characters(ctxt->userData, chr, 1);
4798 ctxt->token = 0;
4799 ctxt->checkIndex = 0;
4800 }
    /*
     * Exactly one byte left on the last chunk: unless it is '<' or '&',
     * deliver it directly (ignorableWhitespace for a blank, characters
     * otherwise) and advance the cursor past it.
     */
4801 if ((avail == 1) && (terminate)) {
4802 cur = in->cur[0];
4803 if ((cur != '<') && (cur != '&')) {
4804 if (ctxt->sax != NULL) {
4805 if (IS_BLANK(cur)) {
4806 if (ctxt->sax->ignorableWhitespace != NULL)
4807 ctxt->sax->ignorableWhitespace(
4808 ctxt->userData, &cur, 1);
4809 } else {
4810 htmlCheckParagraph(ctxt);
4811 if (ctxt->sax->characters != NULL)
4812 ctxt->sax->characters(
4813 ctxt->userData, &cur, 1);
4814 }
4815 }
4816 ctxt->token = 0;
4817 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004818 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004819 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004820 }
Owen Taylor3473f882001-02-23 17:55:21 +00004821 }
4822 if (avail < 2)
4823 goto done;
    /*
     * cons samples ctxt->nbChars so that the check after the branches
     * below can tell whether the counter moved.
     */
4824 cur = in->cur[0];
4825 next = in->cur[1];
4826 cons = ctxt->nbChars;
4827 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4828 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4829 /*
4830 * Handle SCRIPT/STYLE separately
4831 */
4832 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004833 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004834 goto done;
4835 htmlParseScript(ctxt);
    /* cur and next still hold the bytes sampled before htmlParseScript() ran. */
4836 if ((cur == '<') && (next == '/')) {
4837 ctxt->instate = XML_PARSER_END_TAG;
4838 ctxt->checkIndex = 0;
4839#ifdef DEBUG_PUSH
4840 xmlGenericError(xmlGenericErrorContext,
4841 "HPP: entering END_TAG\n");
4842#endif
4843 break;
4844 }
4845 } else {
4846 /*
4847 * Sometimes DOCTYPE arrives in the middle of the document
4848 */
4849 if ((cur == '<') && (next == '!') &&
4850 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4851 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4852 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4853 (UPP(8) == 'E')) {
4854 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004855 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004856 goto done;
4857 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4858 ctxt->sax->error(ctxt->userData,
4859 "Misplaced DOCTYPE declaration\n");
4860 ctxt->wellFormed = 0;
4861 htmlParseDocTypeDecl(ctxt);
4862 } else if ((cur == '<') && (next == '!') &&
4863 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4864 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004865 (htmlParseLookupSequence(
4866 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004867 goto done;
4868#ifdef DEBUG_PUSH
4869 xmlGenericError(xmlGenericErrorContext,
4870 "HPP: Parsing Comment\n");
4871#endif
4872 htmlParseComment(ctxt);
4873 ctxt->instate = XML_PARSER_CONTENT;
4874 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4875 goto done;
4876 } else if ((cur == '<') && (next == '/')) {
4877 ctxt->instate = XML_PARSER_END_TAG;
4878 ctxt->checkIndex = 0;
4879#ifdef DEBUG_PUSH
4880 xmlGenericError(xmlGenericErrorContext,
4881 "HPP: entering END_TAG\n");
4882#endif
4883 break;
4884 } else if (cur == '<') {
4885 ctxt->instate = XML_PARSER_START_TAG;
4886 ctxt->checkIndex = 0;
4887#ifdef DEBUG_PUSH
4888 xmlGenericError(xmlGenericErrorContext,
4889 "HPP: entering START_TAG\n");
4890#endif
4891 break;
4892 } else if (cur == '&') {
4893 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004894 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004895 goto done;
4896#ifdef DEBUG_PUSH
4897 xmlGenericError(xmlGenericErrorContext,
4898 "HPP: Parsing Reference\n");
4899#endif
4900 /* TODO: check generation of subtrees if noent !!! */
4901 htmlParseReference(ctxt);
4902 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004903 /*
4904 * check that the text sequence is complete
4905 * before handing out the data to the parser
4906 * to avoid problems with erroneous end of
4907 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004908 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004909 if ((!terminate) &&
4910 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4911 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004912 ctxt->checkIndex = 0;
4913#ifdef DEBUG_PUSH
4914 xmlGenericError(xmlGenericErrorContext,
4915 "HPP: Parsing char data\n");
4916#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004917 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004918 }
4919 }
    /*
     * ctxt->nbChars is unchanged since it was sampled into cons: report
     * an error when inside a node, then apply NEXT before looping.
     */
4920 if (cons == ctxt->nbChars) {
4921 if (ctxt->node != NULL) {
4922 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4923 ctxt->sax->error(ctxt->userData,
4924 "detected an error in element content\n");
4925 ctxt->wellFormed = 0;
4926 }
4927 NEXT;
4928 break;
4929 }
4930
4931 break;
4932 }
4933 case XML_PARSER_END_TAG:
    /*
     * END_TAG: once a '>' is buffered (or on the last chunk) parse the
     * end tag; an empty name stack afterwards leads to EPILOG,
     * otherwise back to CONTENT.
     */
4934 if (avail < 2)
4935 goto done;
4936 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004937 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004938 goto done;
4939 htmlParseEndTag(ctxt);
4940 if (ctxt->nameNr == 0) {
4941 ctxt->instate = XML_PARSER_EPILOG;
4942 } else {
4943 ctxt->instate = XML_PARSER_CONTENT;
4944 }
4945 ctxt->checkIndex = 0;
    /*
     * NOTE(review): the trace below says CONTENT even when the state
     * was just set to XML_PARSER_EPILOG.
     */
4946#ifdef DEBUG_PUSH
4947 xmlGenericError(xmlGenericErrorContext,
4948 "HPP: entering CONTENT\n");
4949#endif
4950 break;
    /*
     * The remaining states are not expected in this function: each one
     * reports an internal error, falls back to CONTENT (START_TAG for
     * ATTRIBUTE_VALUE) and clears checkIndex.
     */
4951 case XML_PARSER_CDATA_SECTION:
4952 xmlGenericError(xmlGenericErrorContext,
4953 "HPP: internal error, state == CDATA\n");
4954 ctxt->instate = XML_PARSER_CONTENT;
4955 ctxt->checkIndex = 0;
4956#ifdef DEBUG_PUSH
4957 xmlGenericError(xmlGenericErrorContext,
4958 "HPP: entering CONTENT\n");
4959#endif
4960 break;
4961 case XML_PARSER_DTD:
4962 xmlGenericError(xmlGenericErrorContext,
4963 "HPP: internal error, state == DTD\n");
4964 ctxt->instate = XML_PARSER_CONTENT;
4965 ctxt->checkIndex = 0;
4966#ifdef DEBUG_PUSH
4967 xmlGenericError(xmlGenericErrorContext,
4968 "HPP: entering CONTENT\n");
4969#endif
4970 break;
4971 case XML_PARSER_COMMENT:
4972 xmlGenericError(xmlGenericErrorContext,
4973 "HPP: internal error, state == COMMENT\n");
4974 ctxt->instate = XML_PARSER_CONTENT;
4975 ctxt->checkIndex = 0;
4976#ifdef DEBUG_PUSH
4977 xmlGenericError(xmlGenericErrorContext,
4978 "HPP: entering CONTENT\n");
4979#endif
4980 break;
4981 case XML_PARSER_PI:
4982 xmlGenericError(xmlGenericErrorContext,
4983 "HPP: internal error, state == PI\n");
4984 ctxt->instate = XML_PARSER_CONTENT;
4985 ctxt->checkIndex = 0;
4986#ifdef DEBUG_PUSH
4987 xmlGenericError(xmlGenericErrorContext,
4988 "HPP: entering CONTENT\n");
4989#endif
4990 break;
4991 case XML_PARSER_ENTITY_DECL:
4992 xmlGenericError(xmlGenericErrorContext,
4993 "HPP: internal error, state == ENTITY_DECL\n");
4994 ctxt->instate = XML_PARSER_CONTENT;
4995 ctxt->checkIndex = 0;
4996#ifdef DEBUG_PUSH
4997 xmlGenericError(xmlGenericErrorContext,
4998 "HPP: entering CONTENT\n");
4999#endif
5000 break;
5001 case XML_PARSER_ENTITY_VALUE:
5002 xmlGenericError(xmlGenericErrorContext,
5003 "HPP: internal error, state == ENTITY_VALUE\n");
5004 ctxt->instate = XML_PARSER_CONTENT;
5005 ctxt->checkIndex = 0;
    /*
     * NOTE(review): the trace below says DTD but the state set above is
     * XML_PARSER_CONTENT.
     */
5006#ifdef DEBUG_PUSH
5007 xmlGenericError(xmlGenericErrorContext,
5008 "HPP: entering DTD\n");
5009#endif
5010 break;
5011 case XML_PARSER_ATTRIBUTE_VALUE:
5012 xmlGenericError(xmlGenericErrorContext,
5013 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5014 ctxt->instate = XML_PARSER_START_TAG;
5015 ctxt->checkIndex = 0;
5016#ifdef DEBUG_PUSH
5017 xmlGenericError(xmlGenericErrorContext,
5018 "HPP: entering START_TAG\n");
5019#endif
5020 break;
5021 case XML_PARSER_SYSTEM_LITERAL:
5022 xmlGenericError(xmlGenericErrorContext,
5023 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5024 ctxt->instate = XML_PARSER_CONTENT;
5025 ctxt->checkIndex = 0;
5026#ifdef DEBUG_PUSH
5027 xmlGenericError(xmlGenericErrorContext,
5028 "HPP: entering CONTENT\n");
5029#endif
5030 break;
5031 case XML_PARSER_IGNORE:
5032 xmlGenericError(xmlGenericErrorContext,
5033 "HPP: internal error, state == XML_PARSER_IGNORE\n");
5034 ctxt->instate = XML_PARSER_CONTENT;
5035 ctxt->checkIndex = 0;
5036#ifdef DEBUG_PUSH
5037 xmlGenericError(xmlGenericErrorContext,
5038 "HPP: entering CONTENT\n");
5039#endif
5040 break;
    /*
     * NOTE(review): the message below names XML_PARSER_LITERAL while
     * the case handled is XML_PARSER_PUBLIC_LITERAL.
     */
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005041 case XML_PARSER_PUBLIC_LITERAL:
5042 xmlGenericError(xmlGenericErrorContext,
5043 "HPP: internal error, state == XML_PARSER_LITERAL\n");
5044 ctxt->instate = XML_PARSER_CONTENT;
5045 ctxt->checkIndex = 0;
5046#ifdef DEBUG_PUSH
5047 xmlGenericError(xmlGenericErrorContext,
5048 "HPP: entering CONTENT\n");
5049#endif
5050 break;
5051
Owen Taylor3473f882001-02-23 17:55:21 +00005052 }
5053 }
5054done:
    /*
     * Same end-of-input handling as at the top of the loop, for the
     * paths that reach this label with the input exhausted on the last
     * chunk.
     */
5055 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005056 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005057 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5058 /*
5059 * SAX: end of the document processing.
5060 */
5061 ctxt->instate = XML_PARSER_EOF;
5062 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5063 ctxt->sax->endDocument(ctxt->userData);
5064 }
5065 }
    /*
     * When finishing (terminate, EOF or EPILOG) and the document has no
     * internal subset, create the HTML 4.0 Transitional one for it.
     */
5066 if ((ctxt->myDoc != NULL) &&
5067 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5068 (ctxt->instate == XML_PARSER_EPILOG))) {
5069 xmlDtdPtr dtd;
5070 dtd = xmlGetIntSubset(ctxt->myDoc);
5071 if (dtd == NULL)
5072 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005073 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005074 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5075 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5076 }
5077#ifdef DEBUG_PUSH
5078 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5079#endif
5080 return(ret);
5081}
5082
5083/**
Owen Taylor3473f882001-02-23 17:55:21 +00005084 * htmlParseChunk:
5085 * @ctxt: an XML parser context
5086 * @chunk: an char array
5087 * @size: the size in byte of the chunk
5088 * @terminate: last chunk indicator
5089 *
5090 * Parse a Chunk of memory
5091 *
5092 * Returns zero if no error, the xmlParserErrors otherwise.
5093 */
5094int
5095htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5096 int terminate) {
5097 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5098 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5099 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5100 int cur = ctxt->input->cur - ctxt->input->base;
5101
5102 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5103 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5104 ctxt->input->cur = ctxt->input->base + cur;
5105#ifdef DEBUG_PUSH
5106 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5107#endif
5108
Daniel Veillard14f752c2003-08-09 11:44:50 +00005109#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005110 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5111 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005112#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005113 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005114 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5115 xmlParserInputBufferPtr in = ctxt->input->buf;
5116 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5117 (in->raw != NULL)) {
5118 int nbchars;
5119
5120 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5121 if (nbchars < 0) {
5122 xmlGenericError(xmlGenericErrorContext,
5123 "htmlParseChunk: encoder error\n");
5124 return(XML_ERR_INVALID_ENCODING);
5125 }
5126 }
5127 }
Owen Taylor3473f882001-02-23 17:55:21 +00005128 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005129 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005130 if (terminate) {
5131 if ((ctxt->instate != XML_PARSER_EOF) &&
5132 (ctxt->instate != XML_PARSER_EPILOG) &&
5133 (ctxt->instate != XML_PARSER_MISC)) {
5134 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005135 ctxt->wellFormed = 0;
5136 }
5137 if (ctxt->instate != XML_PARSER_EOF) {
5138 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5139 ctxt->sax->endDocument(ctxt->userData);
5140 }
5141 ctxt->instate = XML_PARSER_EOF;
5142 }
5143 return((xmlParserErrors) ctxt->errNo);
5144}
Daniel Veillard73b013f2003-09-30 12:36:01 +00005145#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005146
5147/************************************************************************
5148 * *
5149 * User entry points *
5150 * *
5151 ************************************************************************/
5152
5153/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005154 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005155 * @sax: a SAX handler
5156 * @user_data: The user data returned on SAX callbacks
5157 * @chunk: a pointer to an array of chars
5158 * @size: number of chars in the array
5159 * @filename: an optional file name or URI
5160 * @enc: an optional encoding
5161 *
5162 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005163 * The value of @filename is used for fetching external entities
5164 * and error/warning reports.
5165 *
5166 * Returns the new parser context or NULL
5167 */
5168htmlParserCtxtPtr
5169htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5170 const char *chunk, int size, const char *filename,
5171 xmlCharEncoding enc) {
5172 htmlParserCtxtPtr ctxt;
5173 htmlParserInputPtr inputStream;
5174 xmlParserInputBufferPtr buf;
5175
Daniel Veillardd0463562001-10-13 09:15:48 +00005176 xmlInitParser();
5177
Owen Taylor3473f882001-02-23 17:55:21 +00005178 buf = xmlAllocParserInputBuffer(enc);
5179 if (buf == NULL) return(NULL);
5180
5181 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5182 if (ctxt == NULL) {
5183 xmlFree(buf);
5184 return(NULL);
5185 }
5186 memset(ctxt, 0, sizeof(htmlParserCtxt));
5187 htmlInitParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005188 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5189 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005190 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005191 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005192 xmlFree(ctxt->sax);
5193 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5194 if (ctxt->sax == NULL) {
5195 xmlFree(buf);
5196 xmlFree(ctxt);
5197 return(NULL);
5198 }
5199 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5200 if (user_data != NULL)
5201 ctxt->userData = user_data;
5202 }
5203 if (filename == NULL) {
5204 ctxt->directory = NULL;
5205 } else {
5206 ctxt->directory = xmlParserGetDirectory(filename);
5207 }
5208
5209 inputStream = htmlNewInputStream(ctxt);
5210 if (inputStream == NULL) {
5211 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005212 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005213 return(NULL);
5214 }
5215
5216 if (filename == NULL)
5217 inputStream->filename = NULL;
5218 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005219 inputStream->filename = (char *)
5220 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005221 inputStream->buf = buf;
5222 inputStream->base = inputStream->buf->buffer->content;
5223 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005224 inputStream->end =
5225 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005226
5227 inputPush(ctxt, inputStream);
5228
5229 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5230 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005231 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5232 int cur = ctxt->input->cur - ctxt->input->base;
5233
Owen Taylor3473f882001-02-23 17:55:21 +00005234 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005235
5236 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5237 ctxt->input->cur = ctxt->input->base + cur;
5238 ctxt->input->end =
5239 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005240#ifdef DEBUG_PUSH
5241 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5242#endif
5243 }
5244
5245 return(ctxt);
5246}
5247
5248/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005249 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005250 * @cur: a pointer to an array of xmlChar
5251 * @encoding: a free form C string describing the HTML document encoding, or NULL
5252 * @sax: the SAX handler block
5253 * @userData: if using SAX, this pointer will be provided on callbacks.
5254 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005255 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5256 * to handle parse events. If sax is NULL, fallback to the default DOM
5257 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005258 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005259 * Returns the resulting document tree unless SAX is NULL or the document is
5260 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005261 */
5262
5263htmlDocPtr
5264htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5265 htmlDocPtr ret;
5266 htmlParserCtxtPtr ctxt;
5267
Daniel Veillardd0463562001-10-13 09:15:48 +00005268 xmlInitParser();
5269
Owen Taylor3473f882001-02-23 17:55:21 +00005270 if (cur == NULL) return(NULL);
5271
5272
5273 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5274 if (ctxt == NULL) return(NULL);
5275 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005276 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005277 ctxt->sax = sax;
5278 ctxt->userData = userData;
5279 }
5280
5281 htmlParseDocument(ctxt);
5282 ret = ctxt->myDoc;
5283 if (sax != NULL) {
5284 ctxt->sax = NULL;
5285 ctxt->userData = NULL;
5286 }
5287 htmlFreeParserCtxt(ctxt);
5288
5289 return(ret);
5290}
5291
5292/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005293 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005294 * @cur: a pointer to an array of xmlChar
5295 * @encoding: a free form C string describing the HTML document encoding, or NULL
5296 *
5297 * parse an HTML in-memory document and build a tree.
5298 *
5299 * Returns the resulting document tree
5300 */
5301
5302htmlDocPtr
5303htmlParseDoc(xmlChar *cur, const char *encoding) {
5304 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5305}
5306
5307
5308/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005309 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005310 * @filename: the filename
5311 * @encoding: a free form C string describing the HTML document encoding, or NULL
5312 *
5313 * Create a parser context for a file content.
5314 * Automatic support for ZLIB/Compress compressed document is provided
5315 * by default if found at compile-time.
5316 *
5317 * Returns the new parser context or NULL
5318 */
5319htmlParserCtxtPtr
5320htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5321{
5322 htmlParserCtxtPtr ctxt;
5323 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005324 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005325 /* htmlCharEncoding enc; */
5326 xmlChar *content, *content_line = (xmlChar *) "charset=";
5327
Owen Taylor3473f882001-02-23 17:55:21 +00005328 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5329 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005330 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005331 return(NULL);
5332 }
5333 memset(ctxt, 0, sizeof(htmlParserCtxt));
5334 htmlInitParserCtxt(ctxt);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005335 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5336 if (canonicFilename == NULL) {
5337 if (xmlDefaultSAXHandler.error != NULL) {
5338 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5339 }
Daniel Veillard104caa32003-05-13 22:54:05 +00005340 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005341 return(NULL);
5342 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005343
5344 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5345 xmlFree(canonicFilename);
5346 if (inputStream == NULL) {
5347 xmlFreeParserCtxt(ctxt);
5348 return(NULL);
5349 }
Owen Taylor3473f882001-02-23 17:55:21 +00005350
5351 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005352
Owen Taylor3473f882001-02-23 17:55:21 +00005353 /* set encoding */
5354 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005355 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005356 if (content) {
5357 strcpy ((char *)content, (char *)content_line);
5358 strcat ((char *)content, (char *)encoding);
5359 htmlCheckEncoding (ctxt, content);
5360 xmlFree (content);
5361 }
5362 }
5363
5364 return(ctxt);
5365}
5366
5367/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005368 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005369 * @filename: the filename
5370 * @encoding: a free form C string describing the HTML document encoding, or NULL
5371 * @sax: the SAX handler block
5372 * @userData: if using SAX, this pointer will be provided on callbacks.
5373 *
5374 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5375 * compressed document is provided by default if found at compile-time.
5376 * It use the given SAX function block to handle the parsing callback.
5377 * If sax is NULL, fallback to the default DOM tree building routines.
5378 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005379 * Returns the resulting document tree unless SAX is NULL or the document is
5380 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005381 */
5382
5383htmlDocPtr
5384htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5385 void *userData) {
5386 htmlDocPtr ret;
5387 htmlParserCtxtPtr ctxt;
5388 htmlSAXHandlerPtr oldsax = NULL;
5389
Daniel Veillardd0463562001-10-13 09:15:48 +00005390 xmlInitParser();
5391
Owen Taylor3473f882001-02-23 17:55:21 +00005392 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5393 if (ctxt == NULL) return(NULL);
5394 if (sax != NULL) {
5395 oldsax = ctxt->sax;
5396 ctxt->sax = sax;
5397 ctxt->userData = userData;
5398 }
5399
5400 htmlParseDocument(ctxt);
5401
5402 ret = ctxt->myDoc;
5403 if (sax != NULL) {
5404 ctxt->sax = oldsax;
5405 ctxt->userData = NULL;
5406 }
5407 htmlFreeParserCtxt(ctxt);
5408
5409 return(ret);
5410}
5411
5412/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005413 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005414 * @filename: the filename
5415 * @encoding: a free form C string describing the HTML document encoding, or NULL
5416 *
5417 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5418 * compressed document is provided by default if found at compile-time.
5419 *
5420 * Returns the resulting document tree
5421 */
5422
5423htmlDocPtr
5424htmlParseFile(const char *filename, const char *encoding) {
5425 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5426}
5427
5428/**
5429 * htmlHandleOmittedElem:
5430 * @val: int 0 or 1
5431 *
5432 * Set and return the previous value for handling HTML omitted tags.
5433 *
5434 * Returns the last value for 0 for no handling, 1 for auto insertion.
5435 */
5436
5437int
5438htmlHandleOmittedElem(int val) {
5439 int old = htmlOmittedDefaultValue;
5440
5441 htmlOmittedDefaultValue = val;
5442 return(old);
5443}
5444
Daniel Veillard930dfb62003-02-05 10:17:38 +00005445/**
5446 * htmlElementAllowedHere:
5447 * @parent: HTML parent element
5448 * @elt: HTML element
5449 *
5450 * Checks whether an HTML element may be a direct child of a parent element.
5451 * Note - doesn't check for deprecated elements
5452 *
5453 * Returns 1 if allowed; 0 otherwise.
5454 */
5455int
5456htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5457 const char** p ;
5458
5459 if ( ! elt || ! parent || ! parent->subelts )
5460 return 0 ;
5461
5462 for ( p = parent->subelts; *p; ++p )
5463 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5464 return 1 ;
5465
5466 return 0 ;
5467}
5468/**
5469 * htmlElementStatusHere:
5470 * @parent: HTML parent element
5471 * @elt: HTML element
5472 *
5473 * Checks whether an HTML element may be a direct child of a parent element.
5474 * and if so whether it is valid or deprecated.
5475 *
5476 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5477 */
5478htmlStatus
5479htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5480 if ( ! parent || ! elt )
5481 return HTML_INVALID ;
5482 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5483 return HTML_INVALID ;
5484
5485 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5486}
5487/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005488 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005489 * @elt: HTML element
5490 * @attr: HTML attribute
5491 * @legacy: whether to allow deprecated attributes
5492 *
5493 * Checks whether an attribute is valid for an element
5494 * Has full knowledge of Required and Deprecated attributes
5495 *
5496 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5497 */
5498htmlStatus
5499htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5500 const char** p ;
5501
5502 if ( !elt || ! attr )
5503 return HTML_INVALID ;
5504
5505 if ( elt->attrs_req )
5506 for ( p = elt->attrs_req; *p; ++p)
5507 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5508 return HTML_REQUIRED ;
5509
5510 if ( elt->attrs_opt )
5511 for ( p = elt->attrs_opt; *p; ++p)
5512 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5513 return HTML_VALID ;
5514
5515 if ( legacy && elt->attrs_depr )
5516 for ( p = elt->attrs_depr; *p; ++p)
5517 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5518 return HTML_DEPRECATED ;
5519
5520 return HTML_INVALID ;
5521}
5522/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005523 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005524 * @node: an htmlNodePtr in a tree
5525 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005526 * for Element nodes)
5527 *
5528 * Checks whether the tree node is valid. Experimental (the author
5529 * only uses the HTML enhancements in a SAX parser)
5530 *
5531 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5532 * legacy allowed) or htmlElementStatusHere (otherwise).
5533 * for Attribute nodes, a return from htmlAttrAllowed
5534 * for other nodes, HTML_NA (no checks performed)
5535 */
5536htmlStatus
5537htmlNodeStatus(const htmlNodePtr node, int legacy) {
5538 if ( ! node )
5539 return HTML_INVALID ;
5540
5541 switch ( node->type ) {
5542 case XML_ELEMENT_NODE:
5543 return legacy
5544 ? ( htmlElementAllowedHere (
5545 htmlTagLookup(node->parent->name) , node->name
5546 ) ? HTML_VALID : HTML_INVALID )
5547 : htmlElementStatusHere(
5548 htmlTagLookup(node->parent->name) ,
5549 htmlTagLookup(node->name) )
5550 ;
5551 case XML_ATTRIBUTE_NODE:
5552 return htmlAttrAllowed(
5553 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5554 default: return HTML_NA ;
5555 }
5556}
Daniel Veillard9475a352003-09-26 12:47:50 +00005557/************************************************************************
5558 * *
5559 * New set (2.6.0) of simpler and more flexible APIs *
5560 * *
5561 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement (safe in unbraced if/else); the caller supplies
 * the terminating semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5573
5574/**
5575 * htmlCtxtReset:
5576 * @ctxt: an XML parser context
5577 *
5578 * Reset a parser context
5579 */
5580void
5581htmlCtxtReset(htmlParserCtxtPtr ctxt)
5582{
5583 xmlParserInputPtr input;
5584 xmlDictPtr dict = ctxt->dict;
5585
5586 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5587 xmlFreeInputStream(input);
5588 }
5589 ctxt->inputNr = 0;
5590 ctxt->input = NULL;
5591
5592 ctxt->spaceNr = 0;
5593 ctxt->spaceTab[0] = -1;
5594 ctxt->space = &ctxt->spaceTab[0];
5595
5596
5597 ctxt->nodeNr = 0;
5598 ctxt->node = NULL;
5599
5600 ctxt->nameNr = 0;
5601 ctxt->name = NULL;
5602
5603 DICT_FREE(ctxt->version);
5604 ctxt->version = NULL;
5605 DICT_FREE(ctxt->encoding);
5606 ctxt->encoding = NULL;
5607 DICT_FREE(ctxt->directory);
5608 ctxt->directory = NULL;
5609 DICT_FREE(ctxt->extSubURI);
5610 ctxt->extSubURI = NULL;
5611 DICT_FREE(ctxt->extSubSystem);
5612 ctxt->extSubSystem = NULL;
5613 if (ctxt->myDoc != NULL)
5614 xmlFreeDoc(ctxt->myDoc);
5615 ctxt->myDoc = NULL;
5616
5617 ctxt->standalone = -1;
5618 ctxt->hasExternalSubset = 0;
5619 ctxt->hasPErefs = 0;
5620 ctxt->html = 1;
5621 ctxt->external = 0;
5622 ctxt->instate = XML_PARSER_START;
5623 ctxt->token = 0;
5624
5625 ctxt->wellFormed = 1;
5626 ctxt->nsWellFormed = 1;
5627 ctxt->valid = 1;
5628 ctxt->vctxt.userData = ctxt;
5629 ctxt->vctxt.error = xmlParserValidityError;
5630 ctxt->vctxt.warning = xmlParserValidityWarning;
5631 ctxt->record_info = 0;
5632 ctxt->nbChars = 0;
5633 ctxt->checkIndex = 0;
5634 ctxt->inSubset = 0;
5635 ctxt->errNo = XML_ERR_OK;
5636 ctxt->depth = 0;
5637 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5638 ctxt->catalogs = NULL;
5639 xmlInitNodeInfoSeq(&ctxt->node_seq);
5640
5641 if (ctxt->attsDefault != NULL) {
5642 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5643 ctxt->attsDefault = NULL;
5644 }
5645 if (ctxt->attsSpecial != NULL) {
5646 xmlHashFree(ctxt->attsSpecial, NULL);
5647 ctxt->attsSpecial = NULL;
5648 }
5649}
5650
5651/**
5652 * htmlCtxtUseOptions:
5653 * @ctxt: an HTML parser context
5654 * @options: a combination of htmlParserOption(s)
5655 *
5656 * Applies the options to the parser context
5657 *
5658 * Returns 0 in case of success, the set of unknown or unimplemented options
5659 * in case of error.
5660 */
5661int
5662htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5663{
5664 if (options & HTML_PARSE_NOWARNING) {
5665 ctxt->sax->warning = NULL;
5666 options -= XML_PARSE_NOWARNING;
5667 }
5668 if (options & HTML_PARSE_NOERROR) {
5669 ctxt->sax->error = NULL;
5670 ctxt->sax->fatalError = NULL;
5671 options -= XML_PARSE_NOERROR;
5672 }
5673 if (options & HTML_PARSE_PEDANTIC) {
5674 ctxt->pedantic = 1;
5675 options -= XML_PARSE_PEDANTIC;
5676 } else
5677 ctxt->pedantic = 0;
5678 if (options & XML_PARSE_NOBLANKS) {
5679 ctxt->keepBlanks = 0;
5680 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5681 options -= XML_PARSE_NOBLANKS;
5682 } else
5683 ctxt->keepBlanks = 1;
5684 ctxt->dictNames = 0;
5685 return (options);
5686}
5687
5688/**
5689 * htmlDoRead:
5690 * @ctxt: an HTML parser context
5691 * @URL: the base URL to use for the document
5692 * @encoding: the document encoding, or NULL
5693 * @options: a combination of htmlParserOption(s)
5694 * @reuse: keep the context for reuse
5695 *
5696 * Common front-end for the htmlRead functions
5697 *
5698 * Returns the resulting document tree or NULL
5699 */
5700static htmlDocPtr
5701htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5702 int options, int reuse)
5703{
5704 htmlDocPtr ret;
5705
5706 htmlCtxtUseOptions(ctxt, options);
5707 ctxt->html = 1;
5708 if (encoding != NULL) {
5709 xmlCharEncodingHandlerPtr hdlr;
5710
5711 hdlr = xmlFindCharEncodingHandler(encoding);
5712 if (hdlr != NULL)
5713 xmlSwitchToEncoding(ctxt, hdlr);
5714 }
5715 if ((URL != NULL) && (ctxt->input != NULL) &&
5716 (ctxt->input->filename == NULL))
5717 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5718 htmlParseDocument(ctxt);
5719 ret = ctxt->myDoc;
5720 ctxt->myDoc = NULL;
5721 if (!reuse) {
5722 if ((ctxt->dictNames) &&
5723 (ret != NULL) &&
5724 (ret->dict == ctxt->dict))
5725 ctxt->dict = NULL;
5726 xmlFreeParserCtxt(ctxt);
5727 } else {
5728 /* Must duplicate the reference to the dictionary */
5729 if ((ctxt->dictNames) &&
5730 (ret != NULL) &&
5731 (ret->dict == ctxt->dict))
5732 xmlDictReference(ctxt->dict);
5733 }
5734 return (ret);
5735}
5736
5737/**
5738 * htmlReadDoc:
5739 * @cur: a pointer to a zero terminated string
5740 * @URL: the base URL to use for the document
5741 * @encoding: the document encoding, or NULL
5742 * @options: a combination of htmlParserOption(s)
5743 *
5744 * parse an XML in-memory document and build a tree.
5745 *
5746 * Returns the resulting document tree
5747 */
5748htmlDocPtr
5749htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5750{
5751 htmlParserCtxtPtr ctxt;
5752
5753 if (cur == NULL)
5754 return (NULL);
5755
5756 ctxt = xmlCreateDocParserCtxt(cur);
5757 if (ctxt == NULL)
5758 return (NULL);
5759 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5760}
5761
5762/**
5763 * htmlReadFile:
5764 * @filename: a file or URL
5765 * @encoding: the document encoding, or NULL
5766 * @options: a combination of htmlParserOption(s)
5767 *
5768 * parse an XML file from the filesystem or the network.
5769 *
5770 * Returns the resulting document tree
5771 */
5772htmlDocPtr
5773htmlReadFile(const char *filename, const char *encoding, int options)
5774{
5775 htmlParserCtxtPtr ctxt;
5776
5777 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5778 if (ctxt == NULL)
5779 return (NULL);
5780 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5781}
5782
5783/**
5784 * htmlReadMemory:
5785 * @buffer: a pointer to a char array
5786 * @size: the size of the array
5787 * @URL: the base URL to use for the document
5788 * @encoding: the document encoding, or NULL
5789 * @options: a combination of htmlParserOption(s)
5790 *
5791 * parse an XML in-memory document and build a tree.
5792 *
5793 * Returns the resulting document tree
5794 */
5795htmlDocPtr
5796htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5797{
5798 htmlParserCtxtPtr ctxt;
5799
5800 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5801 if (ctxt == NULL)
5802 return (NULL);
5803 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5804}
5805
5806/**
5807 * htmlReadFd:
5808 * @fd: an open file descriptor
5809 * @URL: the base URL to use for the document
5810 * @encoding: the document encoding, or NULL
5811 * @options: a combination of htmlParserOption(s)
5812 *
5813 * parse an XML from a file descriptor and build a tree.
5814 *
5815 * Returns the resulting document tree
5816 */
5817htmlDocPtr
5818htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5819{
5820 htmlParserCtxtPtr ctxt;
5821 xmlParserInputBufferPtr input;
5822 xmlParserInputPtr stream;
5823
5824 if (fd < 0)
5825 return (NULL);
5826
5827 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5828 if (input == NULL)
5829 return (NULL);
5830 ctxt = xmlNewParserCtxt();
5831 if (ctxt == NULL) {
5832 xmlFreeParserInputBuffer(input);
5833 return (NULL);
5834 }
5835 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5836 if (stream == NULL) {
5837 xmlFreeParserInputBuffer(input);
5838 xmlFreeParserCtxt(ctxt);
5839 return (NULL);
5840 }
5841 inputPush(ctxt, stream);
5842 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5843}
5844
5845/**
5846 * htmlReadIO:
5847 * @ioread: an I/O read function
5848 * @ioclose: an I/O close function
5849 * @ioctx: an I/O handler
5850 * @URL: the base URL to use for the document
5851 * @encoding: the document encoding, or NULL
5852 * @options: a combination of htmlParserOption(s)
5853 *
5854 * parse an HTML document from I/O functions and source and build a tree.
5855 *
5856 * Returns the resulting document tree
5857 */
5858htmlDocPtr
5859htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5860 void *ioctx, const char *URL, const char *encoding, int options)
5861{
5862 htmlParserCtxtPtr ctxt;
5863 xmlParserInputBufferPtr input;
5864 xmlParserInputPtr stream;
5865
5866 if (ioread == NULL)
5867 return (NULL);
5868
5869 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5870 XML_CHAR_ENCODING_NONE);
5871 if (input == NULL)
5872 return (NULL);
5873 ctxt = xmlNewParserCtxt();
5874 if (ctxt == NULL) {
5875 xmlFreeParserInputBuffer(input);
5876 return (NULL);
5877 }
5878 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5879 if (stream == NULL) {
5880 xmlFreeParserInputBuffer(input);
5881 xmlFreeParserCtxt(ctxt);
5882 return (NULL);
5883 }
5884 inputPush(ctxt, stream);
5885 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5886}
5887
5888/**
5889 * htmlCtxtReadDoc:
5890 * @ctxt: an HTML parser context
5891 * @cur: a pointer to a zero terminated string
5892 * @URL: the base URL to use for the document
5893 * @encoding: the document encoding, or NULL
5894 * @options: a combination of htmlParserOption(s)
5895 *
5896 * parse an XML in-memory document and build a tree.
5897 * This reuses the existing @ctxt parser context
5898 *
5899 * Returns the resulting document tree
5900 */
5901htmlDocPtr
5902htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5903 const char *URL, const char *encoding, int options)
5904{
5905 xmlParserInputPtr stream;
5906
5907 if (cur == NULL)
5908 return (NULL);
5909 if (ctxt == NULL)
5910 return (NULL);
5911
5912 htmlCtxtReset(ctxt);
5913
5914 stream = xmlNewStringInputStream(ctxt, cur);
5915 if (stream == NULL) {
5916 return (NULL);
5917 }
5918 inputPush(ctxt, stream);
5919 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5920}
5921
5922/**
5923 * htmlCtxtReadFile:
5924 * @ctxt: an HTML parser context
5925 * @filename: a file or URL
5926 * @encoding: the document encoding, or NULL
5927 * @options: a combination of htmlParserOption(s)
5928 *
5929 * parse an XML file from the filesystem or the network.
5930 * This reuses the existing @ctxt parser context
5931 *
5932 * Returns the resulting document tree
5933 */
5934htmlDocPtr
5935htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5936 const char *encoding, int options)
5937{
5938 xmlParserInputPtr stream;
5939
5940 if (filename == NULL)
5941 return (NULL);
5942 if (ctxt == NULL)
5943 return (NULL);
5944
5945 htmlCtxtReset(ctxt);
5946
5947 stream = xmlNewInputFromFile(ctxt, filename);
5948 if (stream == NULL) {
5949 return (NULL);
5950 }
5951 inputPush(ctxt, stream);
5952 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5953}
5954
5955/**
5956 * htmlCtxtReadMemory:
5957 * @ctxt: an HTML parser context
5958 * @buffer: a pointer to a char array
5959 * @size: the size of the array
5960 * @URL: the base URL to use for the document
5961 * @encoding: the document encoding, or NULL
5962 * @options: a combination of htmlParserOption(s)
5963 *
5964 * parse an XML in-memory document and build a tree.
5965 * This reuses the existing @ctxt parser context
5966 *
5967 * Returns the resulting document tree
5968 */
5969htmlDocPtr
5970htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5971 const char *URL, const char *encoding, int options)
5972{
5973 xmlParserInputBufferPtr input;
5974 xmlParserInputPtr stream;
5975
5976 if (ctxt == NULL)
5977 return (NULL);
5978 if (buffer == NULL)
5979 return (NULL);
5980
5981 htmlCtxtReset(ctxt);
5982
5983 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5984 if (input == NULL) {
5985 return(NULL);
5986 }
5987
5988 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5989 if (stream == NULL) {
5990 xmlFreeParserInputBuffer(input);
5991 return(NULL);
5992 }
5993
5994 inputPush(ctxt, stream);
5995 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5996}
5997
5998/**
5999 * htmlCtxtReadFd:
6000 * @ctxt: an HTML parser context
6001 * @fd: an open file descriptor
6002 * @URL: the base URL to use for the document
6003 * @encoding: the document encoding, or NULL
6004 * @options: a combination of htmlParserOption(s)
6005 *
6006 * parse an XML from a file descriptor and build a tree.
6007 * This reuses the existing @ctxt parser context
6008 *
6009 * Returns the resulting document tree
6010 */
6011htmlDocPtr
6012htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6013 const char *URL, const char *encoding, int options)
6014{
6015 xmlParserInputBufferPtr input;
6016 xmlParserInputPtr stream;
6017
6018 if (fd < 0)
6019 return (NULL);
6020 if (ctxt == NULL)
6021 return (NULL);
6022
6023 htmlCtxtReset(ctxt);
6024
6025
6026 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6027 if (input == NULL)
6028 return (NULL);
6029 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6030 if (stream == NULL) {
6031 xmlFreeParserInputBuffer(input);
6032 return (NULL);
6033 }
6034 inputPush(ctxt, stream);
6035 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6036}
6037
6038/**
6039 * htmlCtxtReadIO:
6040 * @ctxt: an HTML parser context
6041 * @ioread: an I/O read function
6042 * @ioclose: an I/O close function
6043 * @ioctx: an I/O handler
6044 * @URL: the base URL to use for the document
6045 * @encoding: the document encoding, or NULL
6046 * @options: a combination of htmlParserOption(s)
6047 *
6048 * parse an HTML document from I/O functions and source and build a tree.
6049 * This reuses the existing @ctxt parser context
6050 *
6051 * Returns the resulting document tree
6052 */
6053htmlDocPtr
6054htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6055 xmlInputCloseCallback ioclose, void *ioctx,
6056 const char *URL,
6057 const char *encoding, int options)
6058{
6059 xmlParserInputBufferPtr input;
6060 xmlParserInputPtr stream;
6061
6062 if (ioread == NULL)
6063 return (NULL);
6064 if (ctxt == NULL)
6065 return (NULL);
6066
6067 htmlCtxtReset(ctxt);
6068
6069 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6070 XML_CHAR_ENCODING_NONE);
6071 if (input == NULL)
6072 return (NULL);
6073 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6074 if (stream == NULL) {
6075 xmlFreeParserInputBuffer(input);
6076 return (NULL);
6077 }
6078 inputPush(ctxt, stream);
6079 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6080}
6081
Owen Taylor3473f882001-02-23 17:55:21 +00006082#endif /* LIBXML_HTML_ENABLED */