blob: 90e2460e07d14379fcf5a37a5afce9aa8377a3fc [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
/*
 * File-local flag, initialised to 1.
 * NOTE(review): presumably selects how omitted/implied elements are
 * handled by default - confirm against the code that reads it.
 */
static int htmlOmittedDefaultValue = 1;

/*
 * Prototypes declared ahead of the parser code that follows.
 * htmlDecodeEntities has external linkage; htmlParseComment is private
 * to this file.
 */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Owen Taylor3473f882001-02-23 17:55:21 +000062 * Parser stacks related functions and macros *
63 * *
64 ************************************************************************/
65
Daniel Veillard1c732d22002-11-30 11:22:59 +000066/**
67 * htmlnamePush:
68 * @ctxt: an HTML parser context
69 * @value: the element name
70 *
71 * Pushes a new element name on top of the name stack
72 *
73 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000074 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000075static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +000076htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +000077{
78 if (ctxt->nameNr >= ctxt->nameMax) {
79 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +000080 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +000081 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +000082 ctxt->nameMax *
83 sizeof(ctxt->nameTab[0]));
84 if (ctxt->nameTab == NULL) {
85 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
86 return (0);
87 }
88 }
89 ctxt->nameTab[ctxt->nameNr] = value;
90 ctxt->name = value;
91 return (ctxt->nameNr++);
92}
93/**
94 * htmlnamePop:
95 * @ctxt: an HTML parser context
96 *
97 * Pops the top element name from the name stack
98 *
99 * Returns the name just removed
100 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000101static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000102htmlnamePop(htmlParserCtxtPtr ctxt)
103{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000104 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000105
Daniel Veillard1c732d22002-11-30 11:22:59 +0000106 if (ctxt->nameNr <= 0)
107 return (0);
108 ctxt->nameNr--;
109 if (ctxt->nameNr < 0)
110 return (0);
111 if (ctxt->nameNr > 0)
112 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
113 else
114 ctxt->name = NULL;
115 ret = ctxt->nameTab[ctxt->nameNr];
116 ctxt->nameTab[ctxt->nameNr] = 0;
117 return (ret);
118}
Owen Taylor3473f882001-02-23 17:55:21 +0000119
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named "ctxt" (the parser context) to be
 * in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to
 * use them:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value as an int. It should be used
 *            only to compare to ASCII values, otherwise it would break
 *            when running with UTF-8 encoding.
 *   RAW      same as CUR, but yields -1 while ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare against ASCII based substrings.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare against ASCII based
 *            substrings.
 *   SKIP(n)  skip n xmlChars, and must also be used only to skip ASCII
 *            defined strings without newlines within the parser (the
 *            line counter is not updated, only the column).
 *   SHRINK / GROW  shrink or refill the input buffer.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT     the current char value, as an int.
 *   NEXT        skip to the next character through xmlNextChar().
 *   NEXTL(l)    skip the current character, which is l xmlChars long,
 *               keeping line and column up to date and clearing
 *               ctxt->token.
 *   CUR_CHAR(l) decode the current character with htmlCurrentChar() and
 *               store its length in l.
 *   CUR_SCHAR(s, l)  same, decoding from the string s.
 *   COPY_BUF(l,b,i,v)  append the character v, encoded on l xmlChars,
 *               to the buffer b at index i and advance i.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Parenthesised so the comma expression behaves as a single operand. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Wrapped in do/while(0) so that the macro is a single statement and
 * cannot capture an "else" belonging to the caller.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
197
198/**
199 * htmlCurrentChar:
200 * @ctxt: the HTML parser context
201 * @len: pointer to the length of the char read
202 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * bytes in the input buffer. Implement the end of line normalization:
205 * 2.11 End-of-Line Handling
206 * If the encoding is unspecified, in the case we find an ISO-Latin-1
207 * char, then the encoding converter is plugged in automatically.
208 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000209 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000210 */
211
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000212static int
Owen Taylor3473f882001-02-23 17:55:21 +0000213htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
214 if (ctxt->instate == XML_PARSER_EOF)
215 return(0);
216
217 if (ctxt->token != 0) {
218 *len = 0;
219 return(ctxt->token);
220 }
221 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
222 /*
223 * We are supposed to handle UTF8, check it's valid
224 * From rfc2044: encoding of the Unicode values on UTF-8:
225 *
226 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
227 * 0000 0000-0000 007F 0xxxxxxx
228 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
229 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
230 *
231 * Check for the 0x110000 limit too
232 */
233 const unsigned char *cur = ctxt->input->cur;
234 unsigned char c;
235 unsigned int val;
236
237 c = *cur;
238 if (c & 0x80) {
239 if (cur[1] == 0)
240 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
241 if ((cur[1] & 0xc0) != 0x80)
242 goto encoding_error;
243 if ((c & 0xe0) == 0xe0) {
244
245 if (cur[2] == 0)
246 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
247 if ((cur[2] & 0xc0) != 0x80)
248 goto encoding_error;
249 if ((c & 0xf0) == 0xf0) {
250 if (cur[3] == 0)
251 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
252 if (((c & 0xf8) != 0xf0) ||
253 ((cur[3] & 0xc0) != 0x80))
254 goto encoding_error;
255 /* 4-byte code */
256 *len = 4;
257 val = (cur[0] & 0x7) << 18;
258 val |= (cur[1] & 0x3f) << 12;
259 val |= (cur[2] & 0x3f) << 6;
260 val |= cur[3] & 0x3f;
261 } else {
262 /* 3-byte code */
263 *len = 3;
264 val = (cur[0] & 0xf) << 12;
265 val |= (cur[1] & 0x3f) << 6;
266 val |= cur[2] & 0x3f;
267 }
268 } else {
269 /* 2-byte code */
270 *len = 2;
271 val = (cur[0] & 0x1f) << 6;
272 val |= cur[1] & 0x3f;
273 }
274 if (!IS_CHAR(val)) {
275 ctxt->errNo = XML_ERR_INVALID_ENCODING;
276 if ((ctxt->sax != NULL) &&
277 (ctxt->sax->error != NULL))
278 ctxt->sax->error(ctxt->userData,
279 "Char 0x%X out of allowed range\n", val);
280 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000281 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000282 }
283 return(val);
284 } else {
285 /* 1-byte code */
286 *len = 1;
287 return((int) *ctxt->input->cur);
288 }
289 }
290 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000291 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000292 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000293 * XML constructs only use < 128 chars
294 */
295 *len = 1;
296 if ((int) *ctxt->input->cur < 0x80)
297 return((int) *ctxt->input->cur);
298
299 /*
300 * Humm this is bad, do an automatic flow conversion
301 */
302 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
303 ctxt->charset = XML_CHAR_ENCODING_UTF8;
304 return(xmlCurrentChar(ctxt, len));
305
306encoding_error:
307 /*
308 * If we detect an UTF8 error that probably mean that the
309 * input encoding didn't get properly advertized in the
310 * declaration header. Report the error and switch the encoding
311 * to ISO-Latin-1 (if you don't like this policy, just declare the
312 * encoding !)
313 */
314 ctxt->errNo = XML_ERR_INVALID_ENCODING;
315 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
316 ctxt->sax->error(ctxt->userData,
317 "Input is not proper UTF-8, indicate encoding !\n");
318 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
319 ctxt->input->cur[0], ctxt->input->cur[1],
320 ctxt->input->cur[2], ctxt->input->cur[3]);
321 }
322
323 ctxt->charset = XML_CHAR_ENCODING_8859_1;
324 *len = 1;
325 return((int) *ctxt->input->cur);
326}
327
328/**
Owen Taylor3473f882001-02-23 17:55:21 +0000329 * htmlSkipBlankChars:
330 * @ctxt: the HTML parser context
331 *
332 * skip all blanks character found at that point in the input streams.
333 *
334 * Returns the number of space chars skipped
335 */
336
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000337static int
Owen Taylor3473f882001-02-23 17:55:21 +0000338htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
339 int res = 0;
340
341 while (IS_BLANK(*(ctxt->input->cur))) {
342 if ((*ctxt->input->cur == 0) &&
343 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
344 xmlPopInput(ctxt);
345 } else {
346 if (*(ctxt->input->cur) == '\n') {
347 ctxt->input->line++; ctxt->input->col = 1;
348 } else ctxt->input->col++;
349 ctxt->input->cur++;
350 ctxt->nbChars++;
351 if (*ctxt->input->cur == 0)
352 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
353 }
354 res++;
355 }
356 return(res);
357}
358
359
360
361/************************************************************************
362 * *
363 * The list of HTML elements and their properties *
364 * *
365 ************************************************************************/
366
367/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
370 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000371 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000372 * Depr: this element is deprecated
373 * DTD: 1 means that this element is valid only in the Loose DTD
374 * 2 means that this element is valid only in the Frameset DTD
375 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000376 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000377 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000378 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000379
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each list macro expands to a comma separated run of string literals
 * WITHOUT a trailing comma. Whenever two lists are combined a comma has
 * to be written between them: adjacent string literals are otherwise
 * concatenated by the compiler ("small" "em" would become "smallem").
 * PCDATA and MODIFIER expand to nothing and are only markers.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


/* All the arrays below are NULL terminated lists of names. */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;

/* Cast applied to the lists above where they are referenced as const char **. */
#define DECL (const char**)
512
Daniel Veillard22090732001-07-16 00:06:07 +0000513static const htmlElemDesc
514html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000515{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
516 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
517},
518{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
519 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
520},
521{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
522 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
523},
524{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
525 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
526},
527{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
528 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
529},
530{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
531 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
532},
533{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
534 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
535},
536{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
537 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
538},
539{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
540 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
541},
542{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
543 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
544},
545{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
546 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
547},
548{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
549 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
550},
551{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
552 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
553},
554{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
555 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
556},
557{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
558 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
559},
560{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
561 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
562},
563{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
564 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
565},
566{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
567 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
568},
569{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
570 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
571},
572{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
573 EMPTY , NULL , DECL col_attrs , NULL, NULL
574},
575{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
576 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
577},
578{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
579 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
580},
581{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
582 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
583},
584{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
585 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
586},
587{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
588 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
589},
590{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
591 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
592},
593{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
594 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
595},
596{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
597 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
598},
599{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
600 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
601},
602{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
603 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
604},
605{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
606 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
607},
608{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
609 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
610},
611{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
612 EMPTY, NULL, NULL, DECL frame_attrs, NULL
613},
614{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
615 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
616},
617{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
618 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
619},
620{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
621 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
622},
623{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
624 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
625},
626{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
627 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
628},
629{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
630 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
631},
632{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
633 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
634},
635{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
636 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
637},
638{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
639 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
640},
641{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
642 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
643},
644{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
645 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
646},
647{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
648 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
649},
650{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
651 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
652},
653{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
654 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
655},
656{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
657 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
658},
659{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
660 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
661},
662{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
663 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
664},
665{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
666 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
667},
668{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
669 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
670},
671{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
672 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
673},
674{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
675 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
676},
677{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
678 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
679},
680{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
681 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
682},
683{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
684 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
685},
686{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
687 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
688},
689{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
690 DECL html_flow, "div", DECL html_attrs, NULL, NULL
691},
692{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
693 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
694},
695{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
696 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
697},
698{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
699 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
700},
701{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
702 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
703},
704{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
705 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
706},
707{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
708 EMPTY, NULL, DECL param_attrs, NULL, name_attr
709},
710{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
711 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
712},
713{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
714 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
715},
716{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
717 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
718},
719{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
720 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
721},
722{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
723 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
724},
725{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
726 DECL select_content, NULL, DECL select_attrs, NULL, NULL
727},
728{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
729 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
730},
731{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733},
734{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
735 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
736},
737{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
738 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
739},
740{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
741 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
742},
743{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
744 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
745},
746{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
747 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
748},
749{ "table", 0, 0, 0, 0, 0, 0, 0, "",
750 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
751},
752{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
753 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
754},
755{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
756 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
757},
758{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
759 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
760},
761{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
762 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
763},
764{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
765 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
766},
767{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
768 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
769},
770{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
771 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
772},
773{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
774 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
775},
776{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
777 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
778},
779{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
780 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
781},
782{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
783 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
784},
785{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
786 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
787}
Owen Taylor3473f882001-02-23 17:55:21 +0000788};
789
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups.  The first
 * string of a group is the tag being opened; the strings that follow
 * are the open elements which that start tag closes.  A final lone
 * NULL marks the end of the whole table.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
849
/*
 * HTML elements which are not supposed to hold CDATA content directly
 * and inside which a p element will be implied.  The list is
 * NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
863
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
889
/*
 * This table is used by the HTML parser to decide what to do with
 * broken HTML pages.  Each element gets a priority, and the parser
 * uses it to handle extra end tags: an end tag is only allowed to
 * close elements whose priority is lower than or equal to its own.
 */

/* One (element name, end-tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Known priorities.  The entry with a NULL name terminates the table
 * and carries the priority used for every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000917
/*
 * Lookup index over htmlStartClose, filled in by htmlInitAutoClose():
 * each used slot points at the first string of one group of that table.
 */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
920
921/************************************************************************
922 * *
923 * functions to handle HTML specific data *
924 * *
925 ************************************************************************/
926
927/**
928 * htmlInitAutoClose:
929 *
930 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
931 * This is not reentrant. Call xmlInitParser() once before processing in
932 * case of use in multithreaded programs.
933 */
934void
935htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000936 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000937
938 if (htmlStartCloseIndexinitialized) return;
939
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000940 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
941 indx = 0;
942 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
943 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000944 while (htmlStartClose[i] != NULL) i++;
945 i++;
946 }
947 htmlStartCloseIndexinitialized = 1;
948}
949
950/**
951 * htmlTagLookup:
952 * @tag: The tag name in lowercase
953 *
954 * Lookup the HTML tag in the ElementTable
955 *
956 * Returns the related htmlElemDescPtr or NULL if not found.
957 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000958const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000959htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000960 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000961
962 for (i = 0; i < (sizeof(html40ElementTable) /
963 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000964 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +0000965 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000966 }
967 return(NULL);
968}
969
970/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000971 * htmlGetEndPriority:
972 * @name: The name of the element to look up the priority for.
973 *
974 * Return value: The "endtag" priority.
975 **/
976static int
977htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000978 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000979
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000980 while ((htmlEndPriority[i].name != NULL) &&
981 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
982 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000983
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000984 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000985}
986
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000987
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000988/**
Owen Taylor3473f882001-02-23 17:55:21 +0000989 * htmlCheckAutoClose:
990 * @newtag: The new tag name
991 * @oldtag: The old tag name
992 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000993 * Checks whether the new tag is one of the registered valid tags for
994 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000995 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
996 *
997 * Returns 0 if no, 1 if yes.
998 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000999static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001000htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1001{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001002 int i, indx;
1003 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001004
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001005 if (htmlStartCloseIndexinitialized == 0)
1006 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001007
1008 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001009 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001010 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001011 if (closed == NULL)
1012 return (0);
1013 if (xmlStrEqual(BAD_CAST * closed, newtag))
1014 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001015 }
1016
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001017 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001018 i++;
1019 while (htmlStartClose[i] != NULL) {
1020 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001021 return (1);
1022 }
1023 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001024 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001025 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001026}
1027
1028/**
1029 * htmlAutoCloseOnClose:
1030 * @ctxt: an HTML parser context
1031 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001032 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001033 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001034 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001035 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001036static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001037htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1038{
1039 const htmlElemDesc *info;
1040 const xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001041 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001042
1043#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001044 xmlGenericError(xmlGenericErrorContext,
1045 "Close of %s stack: %d elements\n", newtag,
1046 ctxt->nameNr);
1047 for (i = 0; i < ctxt->nameNr; i++)
1048 xmlGenericError(xmlGenericErrorContext, "%d : %s\n", i,
1049 ctxt->nameTab[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001050#endif
1051
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001052 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001053
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001054 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001055
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001056 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1057 break;
1058 /*
1059 * A missplaced endtag can only close elements with lower
1060 * or equal priority, so if we find an element with higher
1061 * priority before we find an element with
1062 * matching name, we just ignore this endtag
1063 */
1064 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1065 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001066 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001067 if (i < 0)
1068 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001069
1070 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001071 info = htmlTagLookup(ctxt->name);
1072 if ((info == NULL) || (info->endTag == 1)) {
Owen Taylor3473f882001-02-23 17:55:21 +00001073#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001074 xmlGenericError(xmlGenericErrorContext,
1075 "htmlAutoCloseOnClose: %s closes %s\n", newtag,
1076 ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001077#endif
Daniel Veillard56098d42001-04-24 12:51:09 +00001078 } else if (info->endTag == 3) {
1079#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001080 xmlGenericError(xmlGenericErrorContext,
1081 "End of tag %s: expecting %s\n", newtag,
1082 ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +00001083
Daniel Veillard56098d42001-04-24 12:51:09 +00001084#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1086 ctxt->sax->error(ctxt->userData,
1087 "Opening and ending tag mismatch: %s and %s\n",
1088 newtag, ctxt->name);
1089 ctxt->wellFormed = 0;
1090 }
1091 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1092 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1093 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001094#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 if (oldname != NULL) {
1096 xmlGenericError(xmlGenericErrorContext,
1097 "htmlAutoCloseOnClose: popped %s\n", oldname);
1098 }
Owen Taylor3473f882001-02-23 17:55:21 +00001099#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001100 }
1101}
1102
1103/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001104 * htmlAutoCloseOnEnd:
1105 * @ctxt: an HTML parser context
1106 *
1107 * Close all remaining tags at the end of the stream
1108 */
1109static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001110htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1111{
1112 const xmlChar *oldname;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001113 int i;
1114
1115 if (ctxt->nameNr == 0)
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 return;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001117#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 xmlGenericError(xmlGenericErrorContext,
1119 "Close of stack: %d elements\n", ctxt->nameNr);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001120#endif
1121
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001122 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001123#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001124 xmlGenericError(xmlGenericErrorContext, "%d : %s\n", i,
1125 ctxt->nameTab[i]);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001126#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001127 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1128 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1129 oldname = htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001130#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001131 if (oldname != NULL) {
1132 xmlGenericError(xmlGenericErrorContext,
1133 "htmlAutoCloseOnEnd: popped %s\n", oldname);
1134 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001135#endif
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001136 }
1137}
1138
1139/**
Owen Taylor3473f882001-02-23 17:55:21 +00001140 * htmlAutoClose:
1141 * @ctxt: an HTML parser context
1142 * @newtag: The new tag name or NULL
1143 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001144 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001145 * The list is kept in htmlStartClose array. This function is
1146 * called when a new tag has been detected and generates the
1147 * appropriates closes if possible/needed.
1148 * If newtag is NULL this mean we are at the end of the resource
1149 * and we should check
1150 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001151static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001152htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153{
1154 const xmlChar *oldname;
1155
1156 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001157 (htmlCheckAutoClose(newtag, ctxt->name))) {
1158#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001159 xmlGenericError(xmlGenericErrorContext,
1160 "htmlAutoClose: %s closes %s\n", newtag,
1161 ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001162#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001163 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1164 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1165 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001166#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001167 if (oldname != NULL) {
1168 xmlGenericError(xmlGenericErrorContext,
1169 "htmlAutoClose: popped %s\n", oldname);
Owen Taylor3473f882001-02-23 17:55:21 +00001170 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001172 }
1173 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001174 htmlAutoCloseOnEnd(ctxt);
1175 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001176 }
1177 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001178 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1179 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1180 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00001181#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001182 xmlGenericError(xmlGenericErrorContext,
1183 "htmlAutoClose: EOF closes %s\n", ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001184#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001185 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1186 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1187 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001188#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001189 if (oldname != NULL) {
1190 xmlGenericError(xmlGenericErrorContext,
1191 "htmlAutoClose: popped %s\n", oldname);
Owen Taylor3473f882001-02-23 17:55:21 +00001192 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193#endif
1194 }
Owen Taylor3473f882001-02-23 17:55:21 +00001195
1196}
1197
1198/**
1199 * htmlAutoCloseTag:
1200 * @doc: the HTML document
1201 * @name: The tag name
1202 * @elem: the HTML element
1203 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001204 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001205 * The list is kept in htmlStartClose array. This function checks
1206 * if the element or one of it's children would autoclose the
1207 * given tag.
1208 *
1209 * Returns 1 if autoclose, 0 otherwise
1210 */
1211int
1212htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1213 htmlNodePtr child;
1214
1215 if (elem == NULL) return(1);
1216 if (xmlStrEqual(name, elem->name)) return(0);
1217 if (htmlCheckAutoClose(elem->name, name)) return(1);
1218 child = elem->children;
1219 while (child != NULL) {
1220 if (htmlAutoCloseTag(doc, name, child)) return(1);
1221 child = child->next;
1222 }
1223 return(0);
1224}
1225
1226/**
1227 * htmlIsAutoClosed:
1228 * @doc: the HTML document
1229 * @elem: the HTML element
1230 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001231 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001232 * The list is kept in htmlStartClose array. This function checks
1233 * if a tag is autoclosed by one of it's child
1234 *
1235 * Returns 1 if autoclosed, 0 otherwise
1236 */
1237int
1238htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1239 htmlNodePtr child;
1240
1241 if (elem == NULL) return(1);
1242 child = elem->children;
1243 while (child != NULL) {
1244 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1245 child = child->next;
1246 }
1247 return(0);
1248}
1249
1250/**
1251 * htmlCheckImplied:
1252 * @ctxt: an HTML parser context
1253 * @newtag: The new tag name
1254 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001255 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001256 * called when a new tag has been detected and generates the
1257 * appropriates implicit tags if missing
1258 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001259static void
Owen Taylor3473f882001-02-23 17:55:21 +00001260htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1261 if (!htmlOmittedDefaultValue)
1262 return;
1263 if (xmlStrEqual(newtag, BAD_CAST"html"))
1264 return;
1265 if (ctxt->nameNr <= 0) {
1266#ifdef DEBUG
1267 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
1268#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001269 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001270 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1271 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1272 }
1273 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1274 return;
1275 if ((ctxt->nameNr <= 1) &&
1276 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1277 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1278 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1279 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1280 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1281 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1282 /*
1283 * dropped OBJECT ... i you put it first BODY will be
1284 * assumed !
1285 */
1286#ifdef DEBUG
1287 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
1288#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001289 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001290 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1291 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1292 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1293 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1294 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1295 int i;
1296 for (i = 0;i < ctxt->nameNr;i++) {
1297 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1298 return;
1299 }
1300 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1301 return;
1302 }
1303 }
1304
1305#ifdef DEBUG
1306 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
1307#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001308 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001309 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1310 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1311 }
1312}
1313
1314/**
1315 * htmlCheckParagraph
1316 * @ctxt: an HTML parser context
1317 *
1318 * Check whether a p element need to be implied before inserting
1319 * characters in the current element.
1320 *
1321 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1322 * in case of error.
1323 */
1324
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001325static int
Owen Taylor3473f882001-02-23 17:55:21 +00001326htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1327 const xmlChar *tag;
1328 int i;
1329
1330 if (ctxt == NULL)
1331 return(-1);
1332 tag = ctxt->name;
1333 if (tag == NULL) {
1334 htmlAutoClose(ctxt, BAD_CAST"p");
1335 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001336 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1339 return(1);
1340 }
1341 if (!htmlOmittedDefaultValue)
1342 return(0);
1343 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1344 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1345#ifdef DEBUG
1346 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1347#endif
1348 htmlAutoClose(ctxt, BAD_CAST"p");
1349 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001350 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001351 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1352 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1353 return(1);
1354 }
1355 }
1356 return(0);
1357}
1358
1359/**
1360 * htmlIsScriptAttribute:
1361 * @name: an attribute name
1362 *
1363 * Check if an attribute is of content type Script
1364 *
1365 * Returns 1 is the attribute is a script 0 otherwise
1366 */
1367int
1368htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001369 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001370
1371 if (name == NULL)
1372 return(0);
1373 /*
1374 * all script attributes start with 'on'
1375 */
1376 if ((name[0] != 'o') || (name[1] != 'n'))
1377 return(0);
1378 for (i = 0;
1379 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1380 i++) {
1381 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1382 return(1);
1383 }
1384 return(0);
1385}
1386
1387/************************************************************************
1388 * *
1389 * The list of HTML predefined entities *
1390 * *
1391 ************************************************************************/
1392
1393
Daniel Veillard22090732001-07-16 00:06:07 +00001394static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001395/*
1396 * the 4 absolute ones, plus apostrophe.
1397 */
1398{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1399{ 38, "amp", "ampersand, U+0026 ISOnum" },
1400{ 39, "apos", "single quote" },
1401{ 60, "lt", "less-than sign, U+003C ISOnum" },
1402{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1403
1404/*
1405 * A bunch still in the 128-255 range
1406 * Replacing them depend really on the charset used.
1407 */
1408{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1409{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1410{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1411{ 163, "pound","pound sign, U+00A3 ISOnum" },
1412{ 164, "curren","currency sign, U+00A4 ISOnum" },
1413{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1414{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1415{ 167, "sect", "section sign, U+00A7 ISOnum" },
1416{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1417{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1418{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1419{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1420{ 172, "not", "not sign, U+00AC ISOnum" },
1421{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1422{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1423{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1424{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1425{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1426{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1427{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1428{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1429{ 181, "micro","micro sign, U+00B5 ISOnum" },
1430{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1431{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1432{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1433{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1434{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1435{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1436{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1437{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1438{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1439{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1440{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1441{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1442{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1443{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1444{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1445{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1446{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1447{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1448{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1449{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1450{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1451{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1452{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1453{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1454{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1455{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1456{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1457{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1458{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1459{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1460{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1461{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1462{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1463{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1464{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1465{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1466{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1467{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1468{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1469{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1470{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1471{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1472{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1473{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1474{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1475{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1476{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1477{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1478{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1479{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1480{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1481{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1482{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1483{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1484{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1485{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1486{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1487{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1488{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1489{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1490{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1491{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1492{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1493{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1494{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1495{ 247, "divide","division sign, U+00F7 ISOnum" },
1496{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1497{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1498{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1499{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1500{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1501{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1502{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1503{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1504
1505{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1506{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1507{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1508{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1509{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1510
1511/*
1512 * Anything below should really be kept as entities references
1513 */
1514{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1515
1516{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1517{ 732, "tilde","small tilde, U+02DC ISOdia" },
1518
1519{ 913, "Alpha","greek capital letter alpha, U+0391" },
1520{ 914, "Beta", "greek capital letter beta, U+0392" },
1521{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1522{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1523{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1524{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1525{ 919, "Eta", "greek capital letter eta, U+0397" },
1526{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1527{ 921, "Iota", "greek capital letter iota, U+0399" },
1528{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001529{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001530{ 924, "Mu", "greek capital letter mu, U+039C" },
1531{ 925, "Nu", "greek capital letter nu, U+039D" },
1532{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1533{ 927, "Omicron","greek capital letter omicron, U+039F" },
1534{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1535{ 929, "Rho", "greek capital letter rho, U+03A1" },
1536{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1537{ 932, "Tau", "greek capital letter tau, U+03A4" },
1538{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1539{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1540{ 935, "Chi", "greek capital letter chi, U+03A7" },
1541{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1542{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1543
1544{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1545{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1546{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1547{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1548{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1549{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1550{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1551{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1552{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1553{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1554{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1555{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1556{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1557{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1558{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1559{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1560{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1561{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1562{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1563{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1564{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1565{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1566{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1567{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1568{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1569{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1570{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1571{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1572
1573{ 8194, "ensp", "en space, U+2002 ISOpub" },
1574{ 8195, "emsp", "em space, U+2003 ISOpub" },
1575{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1576{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1577{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1578{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1579{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1580{ 8211, "ndash","en dash, U+2013 ISOpub" },
1581{ 8212, "mdash","em dash, U+2014 ISOpub" },
1582{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1583{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1584{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1585{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1586{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1587{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1588{ 8224, "dagger","dagger, U+2020 ISOpub" },
1589{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1590
1591{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1592{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1593
1594{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1595
1596{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1597{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1598
1599{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1600{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1601
1602{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1603{ 8260, "frasl","fraction slash, U+2044 NEW" },
1604
1605{ 8364, "euro", "euro sign, U+20AC NEW" },
1606
1607{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1608{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1609{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1610{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1611{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1612{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1613{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1614{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1615{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1616{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1617{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1618{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1619{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1620{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1621{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1622{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1623
1624{ 8704, "forall","for all, U+2200 ISOtech" },
1625{ 8706, "part", "partial differential, U+2202 ISOtech" },
1626{ 8707, "exist","there exists, U+2203 ISOtech" },
1627{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1628{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1629{ 8712, "isin", "element of, U+2208 ISOtech" },
1630{ 8713, "notin","not an element of, U+2209 ISOtech" },
1631{ 8715, "ni", "contains as member, U+220B ISOtech" },
1632{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001633{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001634{ 8722, "minus","minus sign, U+2212 ISOtech" },
1635{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1636{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1637{ 8733, "prop", "proportional to, U+221D ISOtech" },
1638{ 8734, "infin","infinity, U+221E ISOtech" },
1639{ 8736, "ang", "angle, U+2220 ISOamso" },
1640{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1641{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1642{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1643{ 8746, "cup", "union = cup, U+222A ISOtech" },
1644{ 8747, "int", "integral, U+222B ISOtech" },
1645{ 8756, "there4","therefore, U+2234 ISOtech" },
1646{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1647{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1648{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1649{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1650{ 8801, "equiv","identical to, U+2261 ISOtech" },
1651{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1652{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1653{ 8834, "sub", "subset of, U+2282 ISOtech" },
1654{ 8835, "sup", "superset of, U+2283 ISOtech" },
1655{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1656{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1657{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1658{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1659{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1660{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1661{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1662{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1663{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1664{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1665{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1666{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1667{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1668{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1669
1670{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1671{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1672{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1673{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1674
1675};
1676
1677/************************************************************************
1678 * *
1679 * Commodity functions to handle entities *
1680 * *
1681 ************************************************************************/
1682
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates `buffer` to that many xmlChar.
 * The result of xmlRealloc() is kept in a temporary first: assigning it
 * straight to `buffer` would overwrite the only pointer to the old block
 * when the reallocation fails, leaking it.  On failure the old block is
 * released and the *enclosing function* returns NULL, so the macro can
 * only be used in functions returning a pointer and with a variable
 * named <buffer>_size in scope.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                                   buffer##_size * sizeof(xmlChar));    \
    if (growBuffer_tmp == NULL) {                                       \
        xmlGenericError(xmlGenericErrorContext, "realloc failed\n");    \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1694
1695/**
1696 * htmlEntityLookup:
1697 * @name: the entity name
1698 *
1699 * Lookup the given entity in EntitiesTable
1700 *
1701 * TODO: the linear scan is really ugly, an hash table is really needed.
1702 *
1703 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1704 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001705const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001706htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001707 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001708
1709 for (i = 0;i < (sizeof(html40EntitiesTable)/
1710 sizeof(html40EntitiesTable[0]));i++) {
1711 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1712#ifdef DEBUG
1713 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1714#endif
William M. Brack78637da2003-07-31 14:47:38 +00001715 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001716 }
1717 }
1718 return(NULL);
1719}
1720
1721/**
1722 * htmlEntityValueLookup:
1723 * @value: the entity's unicode value
1724 *
1725 * Lookup the given entity in EntitiesTable
1726 *
1727 * TODO: the linear scan is really ugly, an hash table is really needed.
1728 *
1729 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1730 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001731const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001732htmlEntityValueLookup(unsigned int value) {
1733 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001734#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001735 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001736#endif
1737
1738 for (i = 0;i < (sizeof(html40EntitiesTable)/
1739 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001740 if (html40EntitiesTable[i].value >= value) {
1741 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001742 break;
1743#ifdef DEBUG
1744 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1745#endif
William M. Brack78637da2003-07-31 14:47:38 +00001746 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001747 }
1748#ifdef DEBUG
1749 if (lv > html40EntitiesTable[i].value) {
1750 xmlGenericError(xmlGenericErrorContext,
1751 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1752 lv, html40EntitiesTable[i].value);
1753 }
1754 lv = html40EntitiesTable[i].value;
1755#endif
1756 }
1757 return(NULL);
1758}
1759
1760/**
1761 * UTF8ToHtml:
1762 * @out: a pointer to an array of bytes to store the result
1763 * @outlen: the length of @out
1764 * @in: a pointer to an array of UTF-8 chars
1765 * @inlen: the length of @in
1766 *
1767 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1768 * plus HTML entities block of chars out.
1769 *
1770 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1771 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001772 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001773 * The value of @outlen after return is the number of octets consumed.
1774 */
1775int
1776UTF8ToHtml(unsigned char* out, int *outlen,
1777 const unsigned char* in, int *inlen) {
1778 const unsigned char* processed = in;
1779 const unsigned char* outend;
1780 const unsigned char* outstart = out;
1781 const unsigned char* instart = in;
1782 const unsigned char* inend;
1783 unsigned int c, d;
1784 int trailing;
1785
1786 if (in == NULL) {
1787 /*
1788 * initialization nothing to do
1789 */
1790 *outlen = 0;
1791 *inlen = 0;
1792 return(0);
1793 }
1794 inend = in + (*inlen);
1795 outend = out + (*outlen);
1796 while (in < inend) {
1797 d = *in++;
1798 if (d < 0x80) { c= d; trailing= 0; }
1799 else if (d < 0xC0) {
1800 /* trailing byte in leading position */
1801 *outlen = out - outstart;
1802 *inlen = processed - instart;
1803 return(-2);
1804 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1805 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1806 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1807 else {
1808 /* no chance for this in Ascii */
1809 *outlen = out - outstart;
1810 *inlen = processed - instart;
1811 return(-2);
1812 }
1813
1814 if (inend - in < trailing) {
1815 break;
1816 }
1817
1818 for ( ; trailing; trailing--) {
1819 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1820 break;
1821 c <<= 6;
1822 c |= d & 0x3F;
1823 }
1824
1825 /* assertion: c is a single UTF-4 value */
1826 if (c < 0x80) {
1827 if (out + 1 >= outend)
1828 break;
1829 *out++ = c;
1830 } else {
1831 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001832 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001833
1834 /*
1835 * Try to lookup a predefined HTML entity for it
1836 */
1837
1838 ent = htmlEntityValueLookup(c);
1839 if (ent == NULL) {
1840 /* no chance for this in Ascii */
1841 *outlen = out - outstart;
1842 *inlen = processed - instart;
1843 return(-2);
1844 }
1845 len = strlen(ent->name);
1846 if (out + 2 + len >= outend)
1847 break;
1848 *out++ = '&';
1849 memcpy(out, ent->name, len);
1850 out += len;
1851 *out++ = ';';
1852 }
1853 processed = in;
1854 }
1855 *outlen = out - outstart;
1856 *inlen = processed - instart;
1857 return(0);
1858}
1859
1860/**
1861 * htmlEncodeEntities:
1862 * @out: a pointer to an array of bytes to store the result
1863 * @outlen: the length of @out
1864 * @in: a pointer to an array of UTF-8 chars
1865 * @inlen: the length of @in
1866 * @quoteChar: the quote character to escape (' or ") or zero.
1867 *
1868 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1869 * plus HTML entities block of chars out.
1870 *
1871 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1872 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001873 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001874 * The value of @outlen after return is the number of octets consumed.
1875 */
1876int
1877htmlEncodeEntities(unsigned char* out, int *outlen,
1878 const unsigned char* in, int *inlen, int quoteChar) {
1879 const unsigned char* processed = in;
1880 const unsigned char* outend = out + (*outlen);
1881 const unsigned char* outstart = out;
1882 const unsigned char* instart = in;
1883 const unsigned char* inend = in + (*inlen);
1884 unsigned int c, d;
1885 int trailing;
1886
1887 while (in < inend) {
1888 d = *in++;
1889 if (d < 0x80) { c= d; trailing= 0; }
1890 else if (d < 0xC0) {
1891 /* trailing byte in leading position */
1892 *outlen = out - outstart;
1893 *inlen = processed - instart;
1894 return(-2);
1895 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1896 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1897 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1898 else {
1899 /* no chance for this in Ascii */
1900 *outlen = out - outstart;
1901 *inlen = processed - instart;
1902 return(-2);
1903 }
1904
1905 if (inend - in < trailing)
1906 break;
1907
1908 while (trailing--) {
1909 if (((d= *in++) & 0xC0) != 0x80) {
1910 *outlen = out - outstart;
1911 *inlen = processed - instart;
1912 return(-2);
1913 }
1914 c <<= 6;
1915 c |= d & 0x3F;
1916 }
1917
1918 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001919 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1920 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001921 if (out >= outend)
1922 break;
1923 *out++ = c;
1924 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001925 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001926 const char *cp;
1927 char nbuf[16];
1928 int len;
1929
1930 /*
1931 * Try to lookup a predefined HTML entity for it
1932 */
1933 ent = htmlEntityValueLookup(c);
1934 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001935 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001936 cp = nbuf;
1937 }
1938 else
1939 cp = ent->name;
1940 len = strlen(cp);
1941 if (out + 2 + len > outend)
1942 break;
1943 *out++ = '&';
1944 memcpy(out, cp, len);
1945 out += len;
1946 *out++ = ';';
1947 }
1948 processed = in;
1949 }
1950 *outlen = out - outstart;
1951 *inlen = processed - instart;
1952 return(0);
1953}
1954
1955/**
1956 * htmlDecodeEntities:
1957 * @ctxt: the parser context
1958 * @len: the len to decode (in bytes !), -1 for no size limit
1959 * @end: an end marker xmlChar, 0 if none
1960 * @end2: an end marker xmlChar, 0 if none
1961 * @end3: an end marker xmlChar, 0 if none
1962 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001963 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001964 *
1965 * DEPRECATED !!!!
1966 *
1967 * Returns A newly allocated string with the substitution done. The caller
1968 * must deallocate it !
1969 */
1970xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001971htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1972 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001973 static int deprecated = 0;
1974 if (!deprecated) {
1975 xmlGenericError(xmlGenericErrorContext,
1976 "htmlDecodeEntities() deprecated function reached\n");
1977 deprecated = 1;
1978 }
1979 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001980}
1981
1982/************************************************************************
1983 * *
1984 * Commodity functions to handle streams *
1985 * *
1986 ************************************************************************/
1987
1988/**
Owen Taylor3473f882001-02-23 17:55:21 +00001989 * htmlNewInputStream:
1990 * @ctxt: an HTML parser context
1991 *
1992 * Create a new input stream structure
1993 * Returns the new input stream or NULL
1994 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001995static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001996htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1997 htmlParserInputPtr input;
1998
1999 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2000 if (input == NULL) {
2001 ctxt->errNo = XML_ERR_NO_MEMORY;
2002 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2003 ctxt->sax->error(ctxt->userData,
2004 "malloc: couldn't allocate a new input stream\n");
2005 return(NULL);
2006 }
2007 memset(input, 0, sizeof(htmlParserInput));
2008 input->filename = NULL;
2009 input->directory = NULL;
2010 input->base = NULL;
2011 input->cur = NULL;
2012 input->buf = NULL;
2013 input->line = 1;
2014 input->col = 1;
2015 input->buf = NULL;
2016 input->free = NULL;
2017 input->version = NULL;
2018 input->consumed = 0;
2019 input->length = 0;
2020 return(input);
2021}
2022
2023
2024/************************************************************************
2025 * *
2026 * Commodity functions, cleanup needed ? *
2027 * *
2028 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The table is only ever read, so both the strings and the pointers to
 * them are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002043
2044/**
2045 * areBlanks:
2046 * @ctxt: an HTML parser context
2047 * @str: a xmlChar *
2048 * @len: the size of @str
2049 *
2050 * Is this a sequence of blank chars that one can ignore ?
2051 *
2052 * Returns 1 if ignorable 0 otherwise.
2053 */
2054
2055static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002056 unsigned int i;
2057 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002058 xmlNodePtr lastChild;
2059
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002060 for (j = 0;j < len;j++)
2061 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002062
2063 if (CUR == 0) return(1);
2064 if (CUR != '<') return(0);
2065 if (ctxt->name == NULL)
2066 return(1);
2067 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2068 return(1);
2069 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2070 return(1);
2071 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2072 return(1);
2073 if (ctxt->node == NULL) return(0);
2074 lastChild = xmlGetLastChild(ctxt->node);
2075 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002076 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2077 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002078 /* keep ws in constructs like ...<b> </b>...
2079 for all tags "b" allowing PCDATA */
2080 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2081 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2082 return(0);
2083 }
2084 }
Owen Taylor3473f882001-02-23 17:55:21 +00002085 } else if (xmlNodeIsText(lastChild)) {
2086 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002087 } else {
2088 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2089 for all tags "p" allowing PCDATA */
2090 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2091 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2092 return(0);
2093 }
2094 }
Owen Taylor3473f882001-02-23 17:55:21 +00002095 }
2096 return(1);
2097}
2098
2099/**
Owen Taylor3473f882001-02-23 17:55:21 +00002100 * htmlNewDocNoDtD:
2101 * @URI: URI for the dtd, or NULL
2102 * @ExternalID: the external ID of the DTD, or NULL
2103 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002104 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2105 * are NULL
2106 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002107 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002108 */
2109htmlDocPtr
2110htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2111 xmlDocPtr cur;
2112
2113 /*
2114 * Allocate a new document and fill the fields.
2115 */
2116 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2117 if (cur == NULL) {
2118 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002119 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002120 return(NULL);
2121 }
2122 memset(cur, 0, sizeof(xmlDoc));
2123
2124 cur->type = XML_HTML_DOCUMENT_NODE;
2125 cur->version = NULL;
2126 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002127 cur->doc = cur;
2128 cur->name = NULL;
2129 cur->children = NULL;
2130 cur->extSubset = NULL;
2131 cur->oldNs = NULL;
2132 cur->encoding = NULL;
2133 cur->standalone = 1;
2134 cur->compression = 0;
2135 cur->ids = NULL;
2136 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002137 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002138 if ((ExternalID != NULL) ||
2139 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002140 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002141 return(cur);
2142}
2143
2144/**
2145 * htmlNewDoc:
2146 * @URI: URI for the dtd, or NULL
2147 * @ExternalID: the external ID of the DTD, or NULL
2148 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002149 * Creates a new HTML document
2150 *
Owen Taylor3473f882001-02-23 17:55:21 +00002151 * Returns a new document
2152 */
2153htmlDocPtr
2154htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2155 if ((URI == NULL) && (ExternalID == NULL))
2156 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002157 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2158 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002159
2160 return(htmlNewDocNoDtD(URI, ExternalID));
2161}
2162
2163
2164/************************************************************************
2165 * *
2166 * The parser itself *
2167 * Relates to http://www.w3.org/TR/html40 *
2168 * *
2169 ************************************************************************/
2170
2171/************************************************************************
2172 * *
2173 * The parser itself *
2174 * *
2175 ************************************************************************/
2176
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002177static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002178
Owen Taylor3473f882001-02-23 17:55:21 +00002179/**
2180 * htmlParseHTMLName:
2181 * @ctxt: an HTML parser context
2182 *
2183 * parse an HTML tag or attribute name, note that we convert it to lowercase
2184 * since HTML names are not case-sensitive.
2185 *
2186 * Returns the Tag Name parsed or NULL
2187 */
2188
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002189static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002190htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002191 int i = 0;
2192 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2193
2194 if (!IS_LETTER(CUR) && (CUR != '_') &&
2195 (CUR != ':')) return(NULL);
2196
2197 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2198 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2199 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2200 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2201 else loc[i] = CUR;
2202 i++;
2203
2204 NEXT;
2205 }
2206
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002207 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002208}
2209
2210/**
2211 * htmlParseName:
2212 * @ctxt: an HTML parser context
2213 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002214 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002215 *
2216 * Returns the Name parsed or NULL
2217 */
2218
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002219static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002220htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002221 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002222 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002223 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002224
2225 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002226
2227 /*
2228 * Accelerator for simple ASCII names
2229 */
2230 in = ctxt->input->cur;
2231 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2232 ((*in >= 0x41) && (*in <= 0x5A)) ||
2233 (*in == '_') || (*in == ':')) {
2234 in++;
2235 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2236 ((*in >= 0x41) && (*in <= 0x5A)) ||
2237 ((*in >= 0x30) && (*in <= 0x39)) ||
2238 (*in == '_') || (*in == '-') ||
2239 (*in == ':') || (*in == '.'))
2240 in++;
2241 if ((*in > 0) && (*in < 0x80)) {
2242 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002243 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002244 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002245 ctxt->nbChars += count;
2246 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002247 return(ret);
2248 }
2249 }
2250 return(htmlParseNameComplex(ctxt));
2251}
2252
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002253static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002254htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002255 int len = 0, l;
2256 int c;
2257 int count = 0;
2258
2259 /*
2260 * Handler for more complex cases
2261 */
2262 GROW;
2263 c = CUR_CHAR(l);
2264 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2265 (!IS_LETTER(c) && (c != '_') &&
2266 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002267 return(NULL);
2268 }
2269
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002270 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2271 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2272 (c == '.') || (c == '-') ||
2273 (c == '_') || (c == ':') ||
2274 (IS_COMBINING(c)) ||
2275 (IS_EXTENDER(c)))) {
2276 if (count++ > 100) {
2277 count = 0;
2278 GROW;
2279 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002280 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002281 NEXTL(l);
2282 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002283 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002284 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002285}
2286
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002287
Owen Taylor3473f882001-02-23 17:55:21 +00002288/**
2289 * htmlParseHTMLAttribute:
2290 * @ctxt: an HTML parser context
2291 * @stop: a char stop value
2292 *
2293 * parse an HTML attribute value till the stop (quote), if
2294 * stop is 0 then it stops at the first space
2295 *
2296 * Returns the attribute parsed or NULL
2297 */
2298
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002299static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002300htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2301 xmlChar *buffer = NULL;
2302 int buffer_size = 0;
2303 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002304 const xmlChar *name = NULL;
2305 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002306 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002307
2308 /*
2309 * allocate a translation buffer.
2310 */
2311 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002312 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002313 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002314 xmlGenericError(xmlGenericErrorContext,
2315 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002316 return(NULL);
2317 }
2318 out = buffer;
2319
2320 /*
2321 * Ok loop until we reach one of the ending chars
2322 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002323 while ((CUR != 0) && (CUR != stop)) {
2324 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002325 if ((stop == 0) && (IS_BLANK(CUR))) break;
2326 if (CUR == '&') {
2327 if (NXT(1) == '#') {
2328 unsigned int c;
2329 int bits;
2330
2331 c = htmlParseCharRef(ctxt);
2332 if (c < 0x80)
2333 { *out++ = c; bits= -6; }
2334 else if (c < 0x800)
2335 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2336 else if (c < 0x10000)
2337 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2338 else
2339 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2340
2341 for ( ; bits >= 0; bits-= 6) {
2342 *out++ = ((c >> bits) & 0x3F) | 0x80;
2343 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002344
2345 if (out - buffer > buffer_size - 100) {
2346 int indx = out - buffer;
2347
2348 growBuffer(buffer);
2349 out = &buffer[indx];
2350 }
Owen Taylor3473f882001-02-23 17:55:21 +00002351 } else {
2352 ent = htmlParseEntityRef(ctxt, &name);
2353 if (name == NULL) {
2354 *out++ = '&';
2355 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002356 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002357
2358 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002359 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002360 }
2361 } else if (ent == NULL) {
2362 *out++ = '&';
2363 cur = name;
2364 while (*cur != 0) {
2365 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002366 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002367
2368 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002369 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002370 }
2371 *out++ = *cur++;
2372 }
Owen Taylor3473f882001-02-23 17:55:21 +00002373 } else {
2374 unsigned int c;
2375 int bits;
2376
2377 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002378 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002379
2380 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002381 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002382 }
2383 c = (xmlChar)ent->value;
2384 if (c < 0x80)
2385 { *out++ = c; bits= -6; }
2386 else if (c < 0x800)
2387 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2388 else if (c < 0x10000)
2389 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2390 else
2391 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2392
2393 for ( ; bits >= 0; bits-= 6) {
2394 *out++ = ((c >> bits) & 0x3F) | 0x80;
2395 }
Owen Taylor3473f882001-02-23 17:55:21 +00002396 }
2397 }
2398 } else {
2399 unsigned int c;
2400 int bits, l;
2401
2402 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002403 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002404
2405 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002406 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002407 }
2408 c = CUR_CHAR(l);
2409 if (c < 0x80)
2410 { *out++ = c; bits= -6; }
2411 else if (c < 0x800)
2412 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2413 else if (c < 0x10000)
2414 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2415 else
2416 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2417
2418 for ( ; bits >= 0; bits-= 6) {
2419 *out++ = ((c >> bits) & 0x3F) | 0x80;
2420 }
2421 NEXT;
2422 }
2423 }
2424 *out++ = 0;
2425 return(buffer);
2426}
2427
2428/**
Owen Taylor3473f882001-02-23 17:55:21 +00002429 * htmlParseEntityRef:
2430 * @ctxt: an HTML parser context
2431 * @str: location to store the entity name
2432 *
2433 * parse an HTML ENTITY references
2434 *
2435 * [68] EntityRef ::= '&' Name ';'
2436 *
2437 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2438 * if non-NULL *str will have to be freed by the caller.
2439 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002440const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002441htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2442 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002443 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002444 *str = NULL;
2445
2446 if (CUR == '&') {
2447 NEXT;
2448 name = htmlParseName(ctxt);
2449 if (name == NULL) {
2450 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2451 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2452 ctxt->wellFormed = 0;
2453 } else {
2454 GROW;
2455 if (CUR == ';') {
2456 *str = name;
2457
2458 /*
2459 * Lookup the entity in the table.
2460 */
2461 ent = htmlEntityLookup(name);
2462 if (ent != NULL) /* OK that's ugly !!! */
2463 NEXT;
2464 } else {
2465 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2466 ctxt->sax->error(ctxt->userData,
2467 "htmlParseEntityRef: expecting ';'\n");
2468 *str = name;
2469 }
2470 }
2471 }
2472 return(ent);
2473}
2474
2475/**
2476 * htmlParseAttValue:
2477 * @ctxt: an HTML parser context
2478 *
2479 * parse a value for an attribute
2480 * Note: the parser won't do substitution of entities here, this
2481 * will be handled later in xmlStringGetNodeList, unless it was
2482 * asked for ctxt->replaceEntities != 0
2483 *
2484 * Returns the AttValue parsed or NULL.
2485 */
2486
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002487static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002488htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2489 xmlChar *ret = NULL;
2490
2491 if (CUR == '"') {
2492 NEXT;
2493 ret = htmlParseHTMLAttribute(ctxt, '"');
2494 if (CUR != '"') {
2495 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2496 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2497 ctxt->wellFormed = 0;
2498 } else
2499 NEXT;
2500 } else if (CUR == '\'') {
2501 NEXT;
2502 ret = htmlParseHTMLAttribute(ctxt, '\'');
2503 if (CUR != '\'') {
2504 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2505 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2506 ctxt->wellFormed = 0;
2507 } else
2508 NEXT;
2509 } else {
2510 /*
2511 * That's an HTMLism, the attribute value may not be quoted
2512 */
2513 ret = htmlParseHTMLAttribute(ctxt, 0);
2514 if (ret == NULL) {
2515 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2516 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2517 ctxt->wellFormed = 0;
2518 }
2519 }
2520 return(ret);
2521}
2522
2523/**
2524 * htmlParseSystemLiteral:
2525 * @ctxt: an HTML parser context
2526 *
2527 * parse an HTML Literal
2528 *
2529 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2530 *
2531 * Returns the SystemLiteral parsed or NULL
2532 */
2533
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002534static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002535htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2536 const xmlChar *q;
2537 xmlChar *ret = NULL;
2538
2539 if (CUR == '"') {
2540 NEXT;
2541 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002542 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002543 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002544 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002545 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2546 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2547 ctxt->wellFormed = 0;
2548 } else {
2549 ret = xmlStrndup(q, CUR_PTR - q);
2550 NEXT;
2551 }
2552 } else if (CUR == '\'') {
2553 NEXT;
2554 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002555 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002556 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002557 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002558 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2559 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2560 ctxt->wellFormed = 0;
2561 } else {
2562 ret = xmlStrndup(q, CUR_PTR - q);
2563 NEXT;
2564 }
2565 } else {
2566 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2567 ctxt->sax->error(ctxt->userData,
2568 "SystemLiteral \" or ' expected\n");
2569 ctxt->wellFormed = 0;
2570 }
2571
2572 return(ret);
2573}
2574
2575/**
2576 * htmlParsePubidLiteral:
2577 * @ctxt: an HTML parser context
2578 *
2579 * parse an HTML public literal
2580 *
2581 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2582 *
2583 * Returns the PubidLiteral parsed or NULL.
2584 */
2585
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002586static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002587htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2588 const xmlChar *q;
2589 xmlChar *ret = NULL;
2590 /*
2591 * Name ::= (Letter | '_') (NameChar)*
2592 */
2593 if (CUR == '"') {
2594 NEXT;
2595 q = CUR_PTR;
2596 while (IS_PUBIDCHAR(CUR)) NEXT;
2597 if (CUR != '"') {
2598 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2599 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2600 ctxt->wellFormed = 0;
2601 } else {
2602 ret = xmlStrndup(q, CUR_PTR - q);
2603 NEXT;
2604 }
2605 } else if (CUR == '\'') {
2606 NEXT;
2607 q = CUR_PTR;
Daniel Veillard6560a422003-03-27 21:25:38 +00002608 while ((IS_PUBIDCHAR(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002609 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002610 if (CUR != '\'') {
Owen Taylor3473f882001-02-23 17:55:21 +00002611 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2612 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2613 ctxt->wellFormed = 0;
2614 } else {
2615 ret = xmlStrndup(q, CUR_PTR - q);
2616 NEXT;
2617 }
2618 } else {
2619 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2620 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2621 ctxt->wellFormed = 0;
2622 }
2623
2624 return(ret);
2625}
2626
2627/**
2628 * htmlParseScript:
2629 * @ctxt: an HTML parser context
2630 *
2631 * parse the content of an HTML SCRIPT or STYLE element
2632 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2633 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2634 * http://www.w3.org/TR/html4/types.html#type-script
2635 * http://www.w3.org/TR/html4/types.html#h-6.15
2636 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2637 *
2638 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2639 * element and the value of intrinsic event attributes. User agents must
2640 * not evaluate script data as HTML markup but instead must pass it on as
2641 * data to a script engine.
2642 * NOTES:
2643 * - The content is passed like CDATA
2644 * - the attributes for style and scripting "onXXX" are also described
2645 * as CDATA but SGML allows entities references in attributes so their
2646 * processing is identical as other attributes
2647 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002648static void
Owen Taylor3473f882001-02-23 17:55:21 +00002649htmlParseScript(htmlParserCtxtPtr ctxt) {
2650 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2651 int nbchar = 0;
2652 xmlChar cur;
2653
2654 SHRINK;
2655 cur = CUR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002656 while (IS_CHAR((unsigned int) cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002657 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2658 (NXT(3) == '-')) {
2659 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2660 if (ctxt->sax->cdataBlock!= NULL) {
2661 /*
2662 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2663 */
2664 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002665 } else if (ctxt->sax->characters != NULL) {
2666 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002667 }
2668 }
2669 nbchar = 0;
2670 htmlParseComment(ctxt);
2671 cur = CUR;
2672 continue;
2673 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002674 /*
2675 * One should break here, the specification is clear:
2676 * Authors should therefore escape "</" within the content.
2677 * Escape mechanisms are specific to each scripting or
2678 * style sheet language.
2679 */
2680 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2681 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2682 break; /* while */
2683 }
2684 buf[nbchar++] = cur;
2685 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2686 if (ctxt->sax->cdataBlock!= NULL) {
2687 /*
2688 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2689 */
2690 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002691 } else if (ctxt->sax->characters != NULL) {
2692 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002693 }
2694 nbchar = 0;
2695 }
2696 NEXT;
2697 cur = CUR;
2698 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00002699 if (!(IS_CHAR((unsigned int) cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002700 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2701 ctxt->sax->error(ctxt->userData,
2702 "Invalid char in CDATA 0x%X\n", cur);
2703 ctxt->wellFormed = 0;
2704 NEXT;
2705 }
2706
2707 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2708 if (ctxt->sax->cdataBlock!= NULL) {
2709 /*
2710 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2711 */
2712 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002713 } else if (ctxt->sax->characters != NULL) {
2714 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002715 }
2716 }
2717}
2718
2719
2720/**
2721 * htmlParseCharData:
2722 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002723 *
2724 * parse a CharData section.
2725 * if we are within a CDATA section ']]>' marks an end of section.
2726 *
2727 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2728 */
2729
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002730static void
2731htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002732 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2733 int nbchar = 0;
2734 int cur, l;
2735
2736 SHRINK;
2737 cur = CUR_CHAR(l);
2738 while (((cur != '<') || (ctxt->token == '<')) &&
2739 ((cur != '&') || (ctxt->token == '&')) &&
2740 (IS_CHAR(cur))) {
2741 COPY_BUF(l,buf,nbchar,cur);
2742 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2743 /*
2744 * Ok the segment is to be consumed as chars.
2745 */
2746 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2747 if (areBlanks(ctxt, buf, nbchar)) {
2748 if (ctxt->sax->ignorableWhitespace != NULL)
2749 ctxt->sax->ignorableWhitespace(ctxt->userData,
2750 buf, nbchar);
2751 } else {
2752 htmlCheckParagraph(ctxt);
2753 if (ctxt->sax->characters != NULL)
2754 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2755 }
2756 }
2757 nbchar = 0;
2758 }
2759 NEXTL(l);
2760 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002761 if (cur == 0) {
2762 SHRINK;
2763 GROW;
2764 cur = CUR_CHAR(l);
2765 }
Owen Taylor3473f882001-02-23 17:55:21 +00002766 }
2767 if (nbchar != 0) {
2768 /*
2769 * Ok the segment is to be consumed as chars.
2770 */
2771 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2772 if (areBlanks(ctxt, buf, nbchar)) {
2773 if (ctxt->sax->ignorableWhitespace != NULL)
2774 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2775 } else {
2776 htmlCheckParagraph(ctxt);
2777 if (ctxt->sax->characters != NULL)
2778 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2779 }
2780 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002781 } else {
2782 /*
2783 * Loop detection
2784 */
2785 if (cur == 0)
2786 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002787 }
2788}
2789
2790/**
2791 * htmlParseExternalID:
2792 * @ctxt: an HTML parser context
2793 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002794 *
2795 * Parse an External ID or a Public ID
2796 *
Owen Taylor3473f882001-02-23 17:55:21 +00002797 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2798 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2799 *
2800 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2801 *
2802 * Returns the function returns SystemLiteral and in the second
2803 * case publicID receives PubidLiteral, is strict is off
2804 * it is possible to return NULL and have publicID set.
2805 */
2806
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002807static xmlChar *
2808htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002809 xmlChar *URI = NULL;
2810
2811 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2812 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2813 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2814 SKIP(6);
2815 if (!IS_BLANK(CUR)) {
2816 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2817 ctxt->sax->error(ctxt->userData,
2818 "Space required after 'SYSTEM'\n");
2819 ctxt->wellFormed = 0;
2820 }
2821 SKIP_BLANKS;
2822 URI = htmlParseSystemLiteral(ctxt);
2823 if (URI == NULL) {
2824 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2825 ctxt->sax->error(ctxt->userData,
2826 "htmlParseExternalID: SYSTEM, no URI\n");
2827 ctxt->wellFormed = 0;
2828 }
2829 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2830 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2831 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2832 SKIP(6);
2833 if (!IS_BLANK(CUR)) {
2834 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2835 ctxt->sax->error(ctxt->userData,
2836 "Space required after 'PUBLIC'\n");
2837 ctxt->wellFormed = 0;
2838 }
2839 SKIP_BLANKS;
2840 *publicID = htmlParsePubidLiteral(ctxt);
2841 if (*publicID == NULL) {
2842 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2843 ctxt->sax->error(ctxt->userData,
2844 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2845 ctxt->wellFormed = 0;
2846 }
2847 SKIP_BLANKS;
2848 if ((CUR == '"') || (CUR == '\'')) {
2849 URI = htmlParseSystemLiteral(ctxt);
2850 }
2851 }
2852 return(URI);
2853}
2854
2855/**
2856 * htmlParseComment:
2857 * @ctxt: an HTML parser context
2858 *
2859 * Parse an XML (SGML) comment <!-- .... -->
2860 *
2861 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2862 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002863static void
Owen Taylor3473f882001-02-23 17:55:21 +00002864htmlParseComment(htmlParserCtxtPtr ctxt) {
2865 xmlChar *buf = NULL;
2866 int len;
2867 int size = HTML_PARSER_BUFFER_SIZE;
2868 int q, ql;
2869 int r, rl;
2870 int cur, l;
2871 xmlParserInputState state;
2872
2873 /*
2874 * Check that there is a comment right here.
2875 */
2876 if ((RAW != '<') || (NXT(1) != '!') ||
2877 (NXT(2) != '-') || (NXT(3) != '-')) return;
2878
2879 state = ctxt->instate;
2880 ctxt->instate = XML_PARSER_COMMENT;
2881 SHRINK;
2882 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002883 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002884 if (buf == NULL) {
2885 xmlGenericError(xmlGenericErrorContext,
2886 "malloc of %d byte failed\n", size);
2887 ctxt->instate = state;
2888 return;
2889 }
2890 q = CUR_CHAR(ql);
2891 NEXTL(ql);
2892 r = CUR_CHAR(rl);
2893 NEXTL(rl);
2894 cur = CUR_CHAR(l);
2895 len = 0;
2896 while (IS_CHAR(cur) &&
2897 ((cur != '>') ||
2898 (r != '-') || (q != '-'))) {
2899 if (len + 5 >= size) {
2900 size *= 2;
2901 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2902 if (buf == NULL) {
2903 xmlGenericError(xmlGenericErrorContext,
2904 "realloc of %d byte failed\n", size);
2905 ctxt->instate = state;
2906 return;
2907 }
2908 }
2909 COPY_BUF(ql,buf,len,q);
2910 q = r;
2911 ql = rl;
2912 r = cur;
2913 rl = l;
2914 NEXTL(l);
2915 cur = CUR_CHAR(l);
2916 if (cur == 0) {
2917 SHRINK;
2918 GROW;
2919 cur = CUR_CHAR(l);
2920 }
2921 }
2922 buf[len] = 0;
2923 if (!IS_CHAR(cur)) {
2924 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2925 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2926 ctxt->sax->error(ctxt->userData,
2927 "Comment not terminated \n<!--%.50s\n", buf);
2928 ctxt->wellFormed = 0;
2929 xmlFree(buf);
2930 } else {
2931 NEXT;
2932 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2933 (!ctxt->disableSAX))
2934 ctxt->sax->comment(ctxt->userData, buf);
2935 xmlFree(buf);
2936 }
2937 ctxt->instate = state;
2938}
2939
2940/**
2941 * htmlParseCharRef:
2942 * @ctxt: an HTML parser context
2943 *
2944 * parse Reference declarations
2945 *
2946 * [66] CharRef ::= '&#' [0-9]+ ';' |
2947 * '&#x' [0-9a-fA-F]+ ';'
2948 *
2949 * Returns the value parsed (as an int)
2950 */
2951int
2952htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2953 int val = 0;
2954
2955 if ((CUR == '&') && (NXT(1) == '#') &&
2956 (NXT(2) == 'x')) {
2957 SKIP(3);
2958 while (CUR != ';') {
2959 if ((CUR >= '0') && (CUR <= '9'))
2960 val = val * 16 + (CUR - '0');
2961 else if ((CUR >= 'a') && (CUR <= 'f'))
2962 val = val * 16 + (CUR - 'a') + 10;
2963 else if ((CUR >= 'A') && (CUR <= 'F'))
2964 val = val * 16 + (CUR - 'A') + 10;
2965 else {
2966 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2967 ctxt->sax->error(ctxt->userData,
2968 "htmlParseCharRef: invalid hexadecimal value\n");
2969 ctxt->wellFormed = 0;
2970 return(0);
2971 }
2972 NEXT;
2973 }
2974 if (CUR == ';')
2975 NEXT;
2976 } else if ((CUR == '&') && (NXT(1) == '#')) {
2977 SKIP(2);
2978 while (CUR != ';') {
2979 if ((CUR >= '0') && (CUR <= '9'))
2980 val = val * 10 + (CUR - '0');
2981 else {
2982 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2983 ctxt->sax->error(ctxt->userData,
2984 "htmlParseCharRef: invalid decimal value\n");
2985 ctxt->wellFormed = 0;
2986 return(0);
2987 }
2988 NEXT;
2989 }
2990 if (CUR == ';')
2991 NEXT;
2992 } else {
2993 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2994 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2995 ctxt->wellFormed = 0;
2996 }
2997 /*
2998 * Check the value IS_CHAR ...
2999 */
3000 if (IS_CHAR(val)) {
3001 return(val);
3002 } else {
3003 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3004 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
3005 val);
3006 ctxt->wellFormed = 0;
3007 }
3008 return(0);
3009}
3010
3011
3012/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003013 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003014 * @ctxt: an HTML parser context
3015 *
3016 * parse a DOCTYPE declaration
3017 *
3018 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3019 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3020 */
3021
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003022static void
Owen Taylor3473f882001-02-23 17:55:21 +00003023htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003024 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003025 xmlChar *ExternalID = NULL;
3026 xmlChar *URI = NULL;
3027
3028 /*
3029 * We know that '<!DOCTYPE' has been detected.
3030 */
3031 SKIP(9);
3032
3033 SKIP_BLANKS;
3034
3035 /*
3036 * Parse the DOCTYPE name.
3037 */
3038 name = htmlParseName(ctxt);
3039 if (name == NULL) {
3040 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3041 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
3042 ctxt->wellFormed = 0;
3043 }
3044 /*
3045 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3046 */
3047
3048 SKIP_BLANKS;
3049
3050 /*
3051 * Check for SystemID and ExternalID
3052 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003053 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003054 SKIP_BLANKS;
3055
3056 /*
3057 * We should be at the end of the DOCTYPE declaration.
3058 */
3059 if (CUR != '>') {
3060 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00003061 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003062 ctxt->wellFormed = 0;
3063 /* We shouldn't try to resynchronize ... */
3064 }
3065 NEXT;
3066
3067 /*
3068 * Create or update the document accordingly to the DOCTYPE
3069 */
3070 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3071 (!ctxt->disableSAX))
3072 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3073
3074 /*
3075 * Cleanup, since we don't use all those identifiers
3076 */
3077 if (URI != NULL) xmlFree(URI);
3078 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003079}
3080
3081/**
3082 * htmlParseAttribute:
3083 * @ctxt: an HTML parser context
3084 * @value: a xmlChar ** used to store the value of the attribute
3085 *
3086 * parse an attribute
3087 *
3088 * [41] Attribute ::= Name Eq AttValue
3089 *
3090 * [25] Eq ::= S? '=' S?
3091 *
3092 * With namespace:
3093 *
3094 * [NS 11] Attribute ::= QName Eq AttValue
3095 *
3096 * Also the case QName == xmlns:??? is handled independently as a namespace
3097 * definition.
3098 *
3099 * Returns the attribute name, and the value in *value.
3100 */
3101
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003102static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003103htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003104 const xmlChar *name;
3105 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003106
3107 *value = NULL;
3108 name = htmlParseHTMLName(ctxt);
3109 if (name == NULL) {
3110 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3111 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3112 ctxt->wellFormed = 0;
3113 return(NULL);
3114 }
3115
3116 /*
3117 * read the value
3118 */
3119 SKIP_BLANKS;
3120 if (CUR == '=') {
3121 NEXT;
3122 SKIP_BLANKS;
3123 val = htmlParseAttValue(ctxt);
3124 /******
3125 } else {
3126 * TODO : some attribute must have values, some may not
3127 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3128 ctxt->sax->warning(ctxt->userData,
3129 "No value for attribute %s\n", name); */
3130 }
3131
3132 *value = val;
3133 return(name);
3134}
3135
3136/**
3137 * htmlCheckEncoding:
3138 * @ctxt: an HTML parser context
3139 * @attvalue: the attribute value
3140 *
3141 * Checks an http-equiv attribute from a Meta tag to detect
3142 * the encoding
3143 * If a new encoding is detected the parser is switched to decode
3144 * it and pass UTF8
3145 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003146static void
Owen Taylor3473f882001-02-23 17:55:21 +00003147htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3148 const xmlChar *encoding;
3149
3150 if ((ctxt == NULL) || (attvalue == NULL))
3151 return;
3152
3153 /* do not change encoding */
3154 if (ctxt->input->encoding != NULL)
3155 return;
3156
3157 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3158 if (encoding != NULL) {
3159 encoding += 8;
3160 } else {
3161 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3162 if (encoding != NULL)
3163 encoding += 9;
3164 }
3165 if (encoding != NULL) {
3166 xmlCharEncoding enc;
3167 xmlCharEncodingHandlerPtr handler;
3168
3169 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3170
3171 if (ctxt->input->encoding != NULL)
3172 xmlFree((xmlChar *) ctxt->input->encoding);
3173 ctxt->input->encoding = xmlStrdup(encoding);
3174
3175 enc = xmlParseCharEncoding((const char *) encoding);
3176 /*
3177 * registered set of known encodings
3178 */
3179 if (enc != XML_CHAR_ENCODING_ERROR) {
3180 xmlSwitchEncoding(ctxt, enc);
3181 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3182 } else {
3183 /*
3184 * fallback for unknown encodings
3185 */
3186 handler = xmlFindCharEncodingHandler((const char *) encoding);
3187 if (handler != NULL) {
3188 xmlSwitchToEncoding(ctxt, handler);
3189 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3190 } else {
3191 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3192 }
3193 }
3194
3195 if ((ctxt->input->buf != NULL) &&
3196 (ctxt->input->buf->encoder != NULL) &&
3197 (ctxt->input->buf->raw != NULL) &&
3198 (ctxt->input->buf->buffer != NULL)) {
3199 int nbchars;
3200 int processed;
3201
3202 /*
3203 * convert as much as possible to the parser reading buffer.
3204 */
3205 processed = ctxt->input->cur - ctxt->input->base;
3206 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3207 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3208 ctxt->input->buf->buffer,
3209 ctxt->input->buf->raw);
3210 if (nbchars < 0) {
3211 ctxt->errNo = XML_ERR_INVALID_ENCODING;
3212 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3213 ctxt->sax->error(ctxt->userData,
3214 "htmlCheckEncoding: encoder error\n");
3215 }
3216 ctxt->input->base =
3217 ctxt->input->cur = ctxt->input->buf->buffer->content;
3218 }
3219 }
3220}
3221
3222/**
3223 * htmlCheckMeta:
3224 * @ctxt: an HTML parser context
3225 * @atts: the attributes values
3226 *
3227 * Checks an attributes from a Meta tag
3228 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003229static void
Owen Taylor3473f882001-02-23 17:55:21 +00003230htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3231 int i;
3232 const xmlChar *att, *value;
3233 int http = 0;
3234 const xmlChar *content = NULL;
3235
3236 if ((ctxt == NULL) || (atts == NULL))
3237 return;
3238
3239 i = 0;
3240 att = atts[i++];
3241 while (att != NULL) {
3242 value = atts[i++];
3243 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3244 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3245 http = 1;
3246 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3247 content = value;
3248 att = atts[i++];
3249 }
3250 if ((http) && (content != NULL))
3251 htmlCheckEncoding(ctxt, content);
3252
3253}
3254
3255/**
3256 * htmlParseStartTag:
3257 * @ctxt: an HTML parser context
3258 *
3259 * parse a start of tag either for rule element or
3260 * EmptyElement. In both case we don't parse the tag closing chars.
3261 *
3262 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3263 *
3264 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3265 *
3266 * With namespace:
3267 *
3268 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3269 *
3270 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3271 *
3272 */
3273
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003274static void
Owen Taylor3473f882001-02-23 17:55:21 +00003275htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003276 const xmlChar *name;
3277 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003278 xmlChar *attvalue;
3279 const xmlChar **atts = NULL;
3280 int nbatts = 0;
3281 int maxatts = 0;
3282 int meta = 0;
3283 int i;
3284
3285 if (CUR != '<') return;
3286 NEXT;
3287
3288 GROW;
3289 name = htmlParseHTMLName(ctxt);
3290 if (name == NULL) {
3291 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3292 ctxt->sax->error(ctxt->userData,
3293 "htmlParseStartTag: invalid element name\n");
3294 ctxt->wellFormed = 0;
3295 /* Dump the bogus tag like browsers do */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003296 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003297 NEXT;
3298 return;
3299 }
3300 if (xmlStrEqual(name, BAD_CAST"meta"))
3301 meta = 1;
3302
3303 /*
3304 * Check for auto-closure of HTML elements.
3305 */
3306 htmlAutoClose(ctxt, name);
3307
3308 /*
3309 * Check for implied HTML elements.
3310 */
3311 htmlCheckImplied(ctxt, name);
3312
3313 /*
3314 * Avoid html at any level > 0, head at any level != 1
3315 * or any attempt to recurse body
3316 */
3317 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3318 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3319 ctxt->sax->error(ctxt->userData,
3320 "htmlParseStartTag: misplaced <html> tag\n");
3321 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003322 return;
3323 }
3324 if ((ctxt->nameNr != 1) &&
3325 (xmlStrEqual(name, BAD_CAST"head"))) {
3326 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3327 ctxt->sax->error(ctxt->userData,
3328 "htmlParseStartTag: misplaced <head> tag\n");
3329 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003330 return;
3331 }
3332 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003333 int indx;
3334 for (indx = 0;indx < ctxt->nameNr;indx++) {
3335 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003336 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3337 ctxt->sax->error(ctxt->userData,
3338 "htmlParseStartTag: misplaced <body> tag\n");
3339 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003340 return;
3341 }
3342 }
3343 }
3344
3345 /*
3346 * Now parse the attributes, it ends up with the ending
3347 *
3348 * (S Attribute)* S?
3349 */
3350 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003351 while ((IS_CHAR((unsigned int) CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003352 (CUR != '>') &&
3353 ((CUR != '/') || (NXT(1) != '>'))) {
3354 long cons = ctxt->nbChars;
3355
3356 GROW;
3357 attname = htmlParseAttribute(ctxt, &attvalue);
3358 if (attname != NULL) {
3359
3360 /*
3361 * Well formedness requires at most one declaration of an attribute
3362 */
3363 for (i = 0; i < nbatts;i += 2) {
3364 if (xmlStrEqual(atts[i], attname)) {
3365 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3366 ctxt->sax->error(ctxt->userData,
3367 "Attribute %s redefined\n",
3368 attname);
3369 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003370 if (attvalue != NULL)
3371 xmlFree(attvalue);
3372 goto failed;
3373 }
3374 }
3375
3376 /*
3377 * Add the pair to atts
3378 */
3379 if (atts == NULL) {
3380 maxatts = 10;
3381 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3382 if (atts == NULL) {
3383 xmlGenericError(xmlGenericErrorContext,
3384 "malloc of %ld byte failed\n",
3385 maxatts * (long)sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003386 return;
3387 }
3388 } else if (nbatts + 4 > maxatts) {
3389 maxatts *= 2;
3390 atts = (const xmlChar **) xmlRealloc((void *) atts,
3391 maxatts * sizeof(xmlChar *));
3392 if (atts == NULL) {
3393 xmlGenericError(xmlGenericErrorContext,
3394 "realloc of %ld byte failed\n",
3395 maxatts * (long)sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003396 return;
3397 }
3398 }
3399 atts[nbatts++] = attname;
3400 atts[nbatts++] = attvalue;
3401 atts[nbatts] = NULL;
3402 atts[nbatts + 1] = NULL;
3403 }
3404 else {
3405 /* Dump the bogus attribute string up to the next blank or
3406 * the end of the tag. */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003407 while ((IS_CHAR((unsigned int) CUR)) &&
3408 !(IS_BLANK(CUR)) && (CUR != '>') &&
3409 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003410 NEXT;
3411 }
3412
3413failed:
3414 SKIP_BLANKS;
3415 if (cons == ctxt->nbChars) {
3416 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3417 ctxt->sax->error(ctxt->userData,
3418 "htmlParseStartTag: problem parsing attributes\n");
3419 ctxt->wellFormed = 0;
3420 break;
3421 }
3422 }
3423
3424 /*
3425 * Handle specific association to the META tag
3426 */
3427 if (meta)
3428 htmlCheckMeta(ctxt, atts);
3429
3430 /*
3431 * SAX: Start of Element !
3432 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003433 htmlnamePush(ctxt, name);
Owen Taylor3473f882001-02-23 17:55:21 +00003434#ifdef DEBUG
3435 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3436#endif
3437 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3438 ctxt->sax->startElement(ctxt->userData, name, atts);
3439
3440 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003441 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003442 if (atts[i] != NULL)
3443 xmlFree((xmlChar *) atts[i]);
3444 }
3445 xmlFree((void *) atts);
3446 }
Owen Taylor3473f882001-02-23 17:55:21 +00003447}
3448
3449/**
3450 * htmlParseEndTag:
3451 * @ctxt: an HTML parser context
3452 *
3453 * parse an end of tag
3454 *
3455 * [42] ETag ::= '</' Name S? '>'
3456 *
3457 * With namespace
3458 *
3459 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003460 *
3461 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003462 */
3463
Daniel Veillardf420ac52001-07-04 16:04:09 +00003464static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003465htmlParseEndTag(htmlParserCtxtPtr ctxt)
3466{
3467 const xmlChar *name;
3468 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003469 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003470
3471 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003472 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3473 ctxt->sax->error(ctxt->userData,
3474 "htmlParseEndTag: '</' not found\n");
3475 ctxt->wellFormed = 0;
3476 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003477 }
3478 SKIP(2);
3479
3480 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003481 if (name == NULL)
3482 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003483
3484 /*
3485 * We should definitely be at the ending "S? '>'" part
3486 */
3487 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003488 if ((!IS_CHAR((unsigned int) CUR)) || (CUR != '>')) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003489 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3490 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3491 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003492 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003493 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003494
3495 /*
3496 * If the name read is not one of the element in the parsing stack
3497 * then return, it's just an error.
3498 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003499 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3500 if (xmlStrEqual(name, ctxt->nameTab[i]))
3501 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003502 }
3503 if (i < 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003504 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3505 ctxt->sax->error(ctxt->userData,
3506 "Unexpected end tag : %s\n", name);
3507 ctxt->wellFormed = 0;
3508 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003509 }
3510
3511
3512 /*
3513 * Check for auto-closure of HTML elements.
3514 */
3515
3516 htmlAutoCloseOnClose(ctxt, name);
3517
3518 /*
3519 * Well formedness constraints, opening and closing must match.
3520 * With the exception that the autoclose may have popped stuff out
3521 * of the stack.
3522 */
3523 if (!xmlStrEqual(name, ctxt->name)) {
3524#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003525 xmlGenericError(xmlGenericErrorContext,
3526 "End of tag %s: expecting %s\n", name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003527#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003528 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3529 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3530 ctxt->sax->error(ctxt->userData,
3531 "Opening and ending tag mismatch: %s and %s\n",
3532 name, ctxt->name);
3533 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003534 }
3535 }
3536
3537 /*
3538 * SAX: End of Tag
3539 */
3540 oldname = ctxt->name;
3541 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003542 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3543 ctxt->sax->endElement(ctxt->userData, name);
3544 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003545#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003546 if (oldname != NULL) {
3547 xmlGenericError(xmlGenericErrorContext,
3548 "End of tag %s: popping out %s\n", name,
3549 oldname);
3550 } else {
3551 xmlGenericError(xmlGenericErrorContext,
3552 "End of tag %s: stack empty !!!\n", name);
3553 }
Owen Taylor3473f882001-02-23 17:55:21 +00003554#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003555 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003556 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003557 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003558 }
3559
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003560 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003561}
3562
3563
3564/**
3565 * htmlParseReference:
3566 * @ctxt: an HTML parser context
3567 *
3568 * parse and handle entity references in content,
3569 * this will end-up in a call to character() since this is either a
3570 * CharRef, or a predefined entity.
3571 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003572static void
Owen Taylor3473f882001-02-23 17:55:21 +00003573htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003574 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003575 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003576 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003577 if (CUR != '&') return;
3578
3579 if (NXT(1) == '#') {
3580 unsigned int c;
3581 int bits, i = 0;
3582
3583 c = htmlParseCharRef(ctxt);
3584 if (c == 0)
3585 return;
3586
3587 if (c < 0x80) { out[i++]= c; bits= -6; }
3588 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3589 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3590 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3591
3592 for ( ; bits >= 0; bits-= 6) {
3593 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3594 }
3595 out[i] = 0;
3596
3597 htmlCheckParagraph(ctxt);
3598 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3599 ctxt->sax->characters(ctxt->userData, out, i);
3600 } else {
3601 ent = htmlParseEntityRef(ctxt, &name);
3602 if (name == NULL) {
3603 htmlCheckParagraph(ctxt);
3604 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3605 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3606 return;
3607 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003608 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003609 htmlCheckParagraph(ctxt);
3610 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3611 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3612 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3613 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3614 }
3615 } else {
3616 unsigned int c;
3617 int bits, i = 0;
3618
3619 c = ent->value;
3620 if (c < 0x80)
3621 { out[i++]= c; bits= -6; }
3622 else if (c < 0x800)
3623 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3624 else if (c < 0x10000)
3625 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3626 else
3627 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3628
3629 for ( ; bits >= 0; bits-= 6) {
3630 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3631 }
3632 out[i] = 0;
3633
3634 htmlCheckParagraph(ctxt);
3635 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3636 ctxt->sax->characters(ctxt->userData, out, i);
3637 }
Owen Taylor3473f882001-02-23 17:55:21 +00003638 }
3639}
3640
3641/**
3642 * htmlParseContent:
3643 * @ctxt: an HTML parser context
3644 * @name: the node name
3645 *
3646 * Parse a content: comment, sub-element, reference or text.
3647 *
3648 */
3649
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003650static void
Owen Taylor3473f882001-02-23 17:55:21 +00003651htmlParseContent(htmlParserCtxtPtr ctxt) {
3652 xmlChar *currentNode;
3653 int depth;
3654
3655 currentNode = xmlStrdup(ctxt->name);
3656 depth = ctxt->nameNr;
3657 while (1) {
3658 long cons = ctxt->nbChars;
3659
3660 GROW;
3661 /*
3662 * Our tag or one of it's parent or children is ending.
3663 */
3664 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003665 if (htmlParseEndTag(ctxt) &&
3666 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3667 if (currentNode != NULL)
3668 xmlFree(currentNode);
3669 return;
3670 }
3671 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003672 }
3673
3674 /*
3675 * Has this node been popped out during parsing of
3676 * the next element
3677 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003678 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3679 (!xmlStrEqual(currentNode, ctxt->name)))
3680 {
Owen Taylor3473f882001-02-23 17:55:21 +00003681 if (currentNode != NULL) xmlFree(currentNode);
3682 return;
3683 }
3684
Daniel Veillardf9533d12001-03-03 10:04:57 +00003685 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3686 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003687 /*
3688 * Handle SCRIPT/STYLE separately
3689 */
3690 htmlParseScript(ctxt);
3691 } else {
3692 /*
3693 * Sometimes DOCTYPE arrives in the middle of the document
3694 */
3695 if ((CUR == '<') && (NXT(1) == '!') &&
3696 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3697 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3698 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3699 (UPP(8) == 'E')) {
3700 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3701 ctxt->sax->error(ctxt->userData,
3702 "Misplaced DOCTYPE declaration\n");
3703 ctxt->wellFormed = 0;
3704 htmlParseDocTypeDecl(ctxt);
3705 }
3706
3707 /*
3708 * First case : a comment
3709 */
3710 if ((CUR == '<') && (NXT(1) == '!') &&
3711 (NXT(2) == '-') && (NXT(3) == '-')) {
3712 htmlParseComment(ctxt);
3713 }
3714
3715 /*
3716 * Second case : a sub-element.
3717 */
3718 else if (CUR == '<') {
3719 htmlParseElement(ctxt);
3720 }
3721
3722 /*
3723 * Third case : a reference. If if has not been resolved,
3724 * parsing returns it's Name, create the node
3725 */
3726 else if (CUR == '&') {
3727 htmlParseReference(ctxt);
3728 }
3729
3730 /*
3731 * Fourth : end of the resource
3732 */
3733 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003734 htmlAutoCloseOnEnd(ctxt);
3735 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003736 }
3737
3738 /*
3739 * Last case, text. Note that References are handled directly.
3740 */
3741 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003742 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003743 }
3744
3745 if (cons == ctxt->nbChars) {
3746 if (ctxt->node != NULL) {
3747 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3748 ctxt->sax->error(ctxt->userData,
3749 "detected an error in element content\n");
3750 ctxt->wellFormed = 0;
3751 }
3752 break;
3753 }
3754 }
3755 GROW;
3756 }
3757 if (currentNode != NULL) xmlFree(currentNode);
3758}
3759
3760/**
3761 * htmlParseElement:
3762 * @ctxt: an HTML parser context
3763 *
3764 * parse an HTML element, this is highly recursive
3765 *
3766 * [39] element ::= EmptyElemTag | STag content ETag
3767 *
3768 * [41] Attribute ::= Name Eq AttValue
3769 */
3770
3771void
3772htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003773 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003774 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003775 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003776 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003777 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003778 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003779 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003780
3781 /* Capture start position */
3782 if (ctxt->record_info) {
3783 node_info.begin_pos = ctxt->input->consumed +
3784 (CUR_PTR - ctxt->input->base);
3785 node_info.begin_line = ctxt->input->line;
3786 }
3787
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003788 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003789 htmlParseStartTag(ctxt);
3790 name = ctxt->name;
3791#ifdef DEBUG
3792 if (oldname == NULL)
3793 xmlGenericError(xmlGenericErrorContext,
3794 "Start of element %s\n", name);
3795 else if (name == NULL)
3796 xmlGenericError(xmlGenericErrorContext,
3797 "Start of element failed, was %s\n", oldname);
3798 else
3799 xmlGenericError(xmlGenericErrorContext,
3800 "Start of element %s, was %s\n", name, oldname);
3801#endif
3802 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3803 (name == NULL)) {
3804 if (CUR == '>')
3805 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003806 return;
3807 }
Owen Taylor3473f882001-02-23 17:55:21 +00003808
3809 /*
3810 * Lookup the info for that element.
3811 */
3812 info = htmlTagLookup(name);
3813 if (info == NULL) {
3814 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3815 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3816 name);
3817 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003818 }
3819
3820 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003821 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003822 */
3823 if ((CUR == '/') && (NXT(1) == '>')) {
3824 SKIP(2);
3825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3826 ctxt->sax->endElement(ctxt->userData, name);
3827 oldname = htmlnamePop(ctxt);
3828#ifdef DEBUG
3829 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3830#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003831 return;
3832 }
3833
3834 if (CUR == '>') {
3835 NEXT;
3836 } else {
3837 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3838 ctxt->sax->error(ctxt->userData,
3839 "Couldn't find end of Start Tag %s\n",
3840 name);
3841 ctxt->wellFormed = 0;
3842
3843 /*
3844 * end of parsing of this node.
3845 */
3846 if (xmlStrEqual(name, ctxt->name)) {
3847 nodePop(ctxt);
3848 oldname = htmlnamePop(ctxt);
3849#ifdef DEBUG
3850 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3851#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003852 }
3853
3854 /*
3855 * Capture end position and add node
3856 */
3857 if ( currentNode != NULL && ctxt->record_info ) {
3858 node_info.end_pos = ctxt->input->consumed +
3859 (CUR_PTR - ctxt->input->base);
3860 node_info.end_line = ctxt->input->line;
3861 node_info.node = ctxt->node;
3862 xmlParserAddNodeInfo(ctxt, &node_info);
3863 }
3864 return;
3865 }
3866
3867 /*
3868 * Check for an Empty Element from DTD definition
3869 */
3870 if ((info != NULL) && (info->empty)) {
3871 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3872 ctxt->sax->endElement(ctxt->userData, name);
3873 oldname = htmlnamePop(ctxt);
3874#ifdef DEBUG
3875 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3876#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003877 return;
3878 }
3879
3880 /*
3881 * Parse the content of the element:
3882 */
3883 currentNode = xmlStrdup(ctxt->name);
3884 depth = ctxt->nameNr;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003885 while (IS_CHAR((unsigned int) CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003886 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003887 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003888 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003889 if (ctxt->nameNr < depth) break;
3890 }
3891
Owen Taylor3473f882001-02-23 17:55:21 +00003892 /*
3893 * Capture end position and add node
3894 */
3895 if ( currentNode != NULL && ctxt->record_info ) {
3896 node_info.end_pos = ctxt->input->consumed +
3897 (CUR_PTR - ctxt->input->base);
3898 node_info.end_line = ctxt->input->line;
3899 node_info.node = ctxt->node;
3900 xmlParserAddNodeInfo(ctxt, &node_info);
3901 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00003902 if (!IS_CHAR((unsigned int) CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003903 htmlAutoCloseOnEnd(ctxt);
3904 }
3905
Owen Taylor3473f882001-02-23 17:55:21 +00003906 if (currentNode != NULL)
3907 xmlFree(currentNode);
3908}
3909
3910/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003911 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003912 * @ctxt: an HTML parser context
3913 *
3914 * parse an HTML document (and build a tree if using the standard SAX
3915 * interface).
3916 *
3917 * Returns 0, -1 in case of error. the parser context is augmented
3918 * as a result of the parsing.
3919 */
3920
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003921int
Owen Taylor3473f882001-02-23 17:55:21 +00003922htmlParseDocument(htmlParserCtxtPtr ctxt) {
3923 xmlDtdPtr dtd;
3924
Daniel Veillardd0463562001-10-13 09:15:48 +00003925 xmlInitParser();
3926
Owen Taylor3473f882001-02-23 17:55:21 +00003927 htmlDefaultSAXHandlerInit();
3928 ctxt->html = 1;
3929
3930 GROW;
3931 /*
3932 * SAX: beginning of the document processing.
3933 */
3934 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3935 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3936
3937 /*
3938 * Wipe out everything which is before the first '<'
3939 */
3940 SKIP_BLANKS;
3941 if (CUR == 0) {
3942 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3943 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3944 ctxt->wellFormed = 0;
3945 }
3946
3947 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3948 ctxt->sax->startDocument(ctxt->userData);
3949
3950
3951 /*
3952 * Parse possible comments before any content
3953 */
3954 while ((CUR == '<') && (NXT(1) == '!') &&
3955 (NXT(2) == '-') && (NXT(3) == '-')) {
3956 htmlParseComment(ctxt);
3957 SKIP_BLANKS;
3958 }
3959
3960
3961 /*
3962 * Then possibly doc type declaration(s) and more Misc
3963 * (doctypedecl Misc*)?
3964 */
3965 if ((CUR == '<') && (NXT(1) == '!') &&
3966 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3967 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3968 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3969 (UPP(8) == 'E')) {
3970 htmlParseDocTypeDecl(ctxt);
3971 }
3972 SKIP_BLANKS;
3973
3974 /*
3975 * Parse possible comments before any content
3976 */
3977 while ((CUR == '<') && (NXT(1) == '!') &&
3978 (NXT(2) == '-') && (NXT(3) == '-')) {
3979 htmlParseComment(ctxt);
3980 SKIP_BLANKS;
3981 }
3982
3983 /*
3984 * Time to start parsing the tree itself
3985 */
3986 htmlParseContent(ctxt);
3987
3988 /*
3989 * autoclose
3990 */
3991 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003992 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003993
3994
3995 /*
3996 * SAX: end of the document processing.
3997 */
3998 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3999 ctxt->sax->endDocument(ctxt->userData);
4000
4001 if (ctxt->myDoc != NULL) {
4002 dtd = xmlGetIntSubset(ctxt->myDoc);
4003 if (dtd == NULL)
4004 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004005 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004006 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4007 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4008 }
4009 if (! ctxt->wellFormed) return(-1);
4010 return(0);
4011}
4012
4013
4014/************************************************************************
4015 * *
4016 * Parser contexts handling *
4017 * *
4018 ************************************************************************/
4019
4020/**
4021 * xmlInitParserCtxt:
4022 * @ctxt: an HTML parser context
4023 *
4024 * Initialize a parser context
4025 */
4026
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004027static void
Owen Taylor3473f882001-02-23 17:55:21 +00004028htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4029{
4030 htmlSAXHandler *sax;
4031
4032 if (ctxt == NULL) return;
4033 memset(ctxt, 0, sizeof(htmlParserCtxt));
4034
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004035 ctxt->dict = xmlDictCreate();
4036 if (ctxt->dict == NULL) {
4037 xmlGenericError(xmlGenericErrorContext,
4038 "xmlInitParserCtxt: out of memory\n");
4039 return;
4040 }
Owen Taylor3473f882001-02-23 17:55:21 +00004041 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4042 if (sax == NULL) {
4043 xmlGenericError(xmlGenericErrorContext,
4044 "htmlInitParserCtxt: out of memory\n");
4045 }
4046 else
4047 memset(sax, 0, sizeof(htmlSAXHandler));
4048
4049 /* Allocate the Input stack */
4050 ctxt->inputTab = (htmlParserInputPtr *)
4051 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4052 if (ctxt->inputTab == NULL) {
4053 xmlGenericError(xmlGenericErrorContext,
4054 "htmlInitParserCtxt: out of memory\n");
4055 ctxt->inputNr = 0;
4056 ctxt->inputMax = 0;
4057 ctxt->input = NULL;
4058 return;
4059 }
4060 ctxt->inputNr = 0;
4061 ctxt->inputMax = 5;
4062 ctxt->input = NULL;
4063 ctxt->version = NULL;
4064 ctxt->encoding = NULL;
4065 ctxt->standalone = -1;
4066 ctxt->instate = XML_PARSER_START;
4067
4068 /* Allocate the Node stack */
4069 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4070 if (ctxt->nodeTab == NULL) {
4071 xmlGenericError(xmlGenericErrorContext,
4072 "htmlInitParserCtxt: out of memory\n");
4073 ctxt->nodeNr = 0;
4074 ctxt->nodeMax = 0;
4075 ctxt->node = NULL;
4076 ctxt->inputNr = 0;
4077 ctxt->inputMax = 0;
4078 ctxt->input = NULL;
4079 return;
4080 }
4081 ctxt->nodeNr = 0;
4082 ctxt->nodeMax = 10;
4083 ctxt->node = NULL;
4084
4085 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004086 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004087 if (ctxt->nameTab == NULL) {
4088 xmlGenericError(xmlGenericErrorContext,
4089 "htmlInitParserCtxt: out of memory\n");
4090 ctxt->nameNr = 0;
4091 ctxt->nameMax = 10;
4092 ctxt->name = NULL;
4093 ctxt->nodeNr = 0;
4094 ctxt->nodeMax = 0;
4095 ctxt->node = NULL;
4096 ctxt->inputNr = 0;
4097 ctxt->inputMax = 0;
4098 ctxt->input = NULL;
4099 return;
4100 }
4101 ctxt->nameNr = 0;
4102 ctxt->nameMax = 10;
4103 ctxt->name = NULL;
4104
4105 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
4106 else {
4107 ctxt->sax = sax;
4108 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
4109 }
4110 ctxt->userData = ctxt;
4111 ctxt->myDoc = NULL;
4112 ctxt->wellFormed = 1;
4113 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004114 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004115 ctxt->html = 1;
4116 ctxt->record_info = 0;
4117 ctxt->validate = 0;
4118 ctxt->nbChars = 0;
4119 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004120 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004121 xmlInitNodeInfoSeq(&ctxt->node_seq);
4122}
4123
4124/**
4125 * htmlFreeParserCtxt:
4126 * @ctxt: an HTML parser context
4127 *
4128 * Free all the memory used by a parser context. However the parsed
4129 * document in ctxt->myDoc is not freed.
4130 */
4131
4132void
4133htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4134{
4135 xmlFreeParserCtxt(ctxt);
4136}
4137
4138/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004139 * htmlNewParserCtxt:
4140 *
4141 * Allocate and initialize a new parser context.
4142 *
4143 * Returns the xmlParserCtxtPtr or NULL
4144 */
4145
4146static htmlParserCtxtPtr
4147htmlNewParserCtxt(void)
4148{
4149 xmlParserCtxtPtr ctxt;
4150
4151 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4152 if (ctxt == NULL) {
4153 xmlGenericError(xmlGenericErrorContext,
4154 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004155 return(NULL);
4156 }
4157 memset(ctxt, 0, sizeof(xmlParserCtxt));
4158 htmlInitParserCtxt(ctxt);
4159 return(ctxt);
4160}
4161
4162/**
4163 * htmlCreateMemoryParserCtxt:
4164 * @buffer: a pointer to a char array
4165 * @size: the size of the array
4166 *
4167 * Create a parser context for an HTML in-memory document.
4168 *
4169 * Returns the new parser context or NULL
4170 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004171htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004172htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4173 xmlParserCtxtPtr ctxt;
4174 xmlParserInputPtr input;
4175 xmlParserInputBufferPtr buf;
4176
4177 if (buffer == NULL)
4178 return(NULL);
4179 if (size <= 0)
4180 return(NULL);
4181
4182 ctxt = htmlNewParserCtxt();
4183 if (ctxt == NULL)
4184 return(NULL);
4185
4186 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4187 if (buf == NULL) return(NULL);
4188
4189 input = xmlNewInputStream(ctxt);
4190 if (input == NULL) {
4191 xmlFreeParserCtxt(ctxt);
4192 return(NULL);
4193 }
4194
4195 input->filename = NULL;
4196 input->buf = buf;
4197 input->base = input->buf->buffer->content;
4198 input->cur = input->buf->buffer->content;
4199 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4200
4201 inputPush(ctxt, input);
4202 return(ctxt);
4203}
4204
4205/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004206 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004207 * @cur: a pointer to an array of xmlChar
4208 * @encoding: a free form C string describing the HTML document encoding, or NULL
4209 *
4210 * Create a parser context for an HTML document.
4211 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004212 * TODO: check the need to add encoding handling there
4213 *
Owen Taylor3473f882001-02-23 17:55:21 +00004214 * Returns the new parser context or NULL
4215 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004216static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004217htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004218 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004219 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004220
Daniel Veillard1d995272002-07-22 16:43:32 +00004221 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004222 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004223 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004224 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4225
4226 if (encoding != NULL) {
4227 xmlCharEncoding enc;
4228 xmlCharEncodingHandlerPtr handler;
4229
4230 if (ctxt->input->encoding != NULL)
4231 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004232 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004233
4234 enc = xmlParseCharEncoding(encoding);
4235 /*
4236 * registered set of known encodings
4237 */
4238 if (enc != XML_CHAR_ENCODING_ERROR) {
4239 xmlSwitchEncoding(ctxt, enc);
4240 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4241 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4242 ctxt->sax->error(ctxt->userData,
4243 "Unsupported encoding %s\n", encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004244 }
4245 } else {
4246 /*
4247 * fallback for unknown encodings
4248 */
4249 handler = xmlFindCharEncodingHandler((const char *) encoding);
4250 if (handler != NULL) {
4251 xmlSwitchToEncoding(ctxt, handler);
4252 } else {
4253 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
4254 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4255 ctxt->sax->error(ctxt->userData,
4256 "Unsupported encoding %s\n", encoding);
4257 }
4258 }
4259 }
4260 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004261}
4262
4263/************************************************************************
4264 * *
4265 * Progressive parsing interfaces *
4266 * *
4267 ************************************************************************/
4268
4269/**
4270 * htmlParseLookupSequence:
4271 * @ctxt: an HTML parser context
4272 * @first: the first char to lookup
4273 * @next: the next char to lookup or zero
4274 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004275 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004276 *
4277 * Try to find if a sequence (first, next, third) or just (first next) or
4278 * (first) is available in the input stream.
4279 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4280 * to avoid rescanning sequences of bytes, it DOES change the state of the
4281 * parser, do not use liberally.
4282 * This is basically similar to xmlParseLookupSequence()
4283 *
4284 * Returns the index to the current parsing point if the full sequence
4285 * is available, -1 otherwise.
4286 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004287static int
Owen Taylor3473f882001-02-23 17:55:21 +00004288htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004289 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004290 int base, len;
4291 htmlParserInputPtr in;
4292 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004293 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004294
4295 in = ctxt->input;
4296 if (in == NULL) return(-1);
4297 base = in->cur - in->base;
4298 if (base < 0) return(-1);
4299 if (ctxt->checkIndex > base)
4300 base = ctxt->checkIndex;
4301 if (in->buf == NULL) {
4302 buf = in->base;
4303 len = in->length;
4304 } else {
4305 buf = in->buf->buffer->content;
4306 len = in->buf->buffer->use;
4307 }
4308 /* take into account the sequence length */
4309 if (third) len -= 2;
4310 else if (next) len --;
4311 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004312 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004313 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4314 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4315 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004316 /* do not increment past <! - some people use <!--> */
4317 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004318 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004319 }
4320 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004321 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004322 return(-1);
4323 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4324 (buf[base + 2] == '>')) {
4325 incomment = 0;
4326 base += 2;
4327 }
4328 continue;
4329 }
Owen Taylor3473f882001-02-23 17:55:21 +00004330 if (buf[base] == first) {
4331 if (third != 0) {
4332 if ((buf[base + 1] != next) ||
4333 (buf[base + 2] != third)) continue;
4334 } else if (next != 0) {
4335 if (buf[base + 1] != next) continue;
4336 }
4337 ctxt->checkIndex = 0;
4338#ifdef DEBUG_PUSH
4339 if (next == 0)
4340 xmlGenericError(xmlGenericErrorContext,
4341 "HPP: lookup '%c' found at %d\n",
4342 first, base);
4343 else if (third == 0)
4344 xmlGenericError(xmlGenericErrorContext,
4345 "HPP: lookup '%c%c' found at %d\n",
4346 first, next, base);
4347 else
4348 xmlGenericError(xmlGenericErrorContext,
4349 "HPP: lookup '%c%c%c' found at %d\n",
4350 first, next, third, base);
4351#endif
4352 return(base - (in->cur - in->base));
4353 }
4354 }
4355 ctxt->checkIndex = base;
4356#ifdef DEBUG_PUSH
4357 if (next == 0)
4358 xmlGenericError(xmlGenericErrorContext,
4359 "HPP: lookup '%c' failed\n", first);
4360 else if (third == 0)
4361 xmlGenericError(xmlGenericErrorContext,
4362 "HPP: lookup '%c%c' failed\n", first, next);
4363 else
4364 xmlGenericError(xmlGenericErrorContext,
4365 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4366#endif
4367 return(-1);
4368}
4369
4370/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing the data currently buffered in ctxt->input.
 * The function loops over a state machine keyed on ctxt->instate and
 * leaves through the "done" label when more data is needed or parsing is
 * finished. When @terminate is zero, most constructs are only parsed
 * once htmlParseLookupSequence() has found their terminating sequence.
 *
 * Returns the local variable ret, which is initialised to zero and is
 * never reassigned in this function.
 * NOTE(review): the historical contract was "zero if no parsing was
 * possible"; as written the result is always zero.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004379static int
Owen Taylor3473f882001-02-23 17:55:21 +00004380htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4381 int ret = 0;
4382 htmlParserInputPtr in;
4383 int avail = 0;
4384 xmlChar cur, next;
4385
/* Debug builds only: trace the state the parser is in on entry. */
4386#ifdef DEBUG_PUSH
4387 switch (ctxt->instate) {
4388 case XML_PARSER_EOF:
4389 xmlGenericError(xmlGenericErrorContext,
4390 "HPP: try EOF\n"); break;
4391 case XML_PARSER_START:
4392 xmlGenericError(xmlGenericErrorContext,
4393 "HPP: try START\n"); break;
4394 case XML_PARSER_MISC:
4395 xmlGenericError(xmlGenericErrorContext,
4396 "HPP: try MISC\n");break;
4397 case XML_PARSER_COMMENT:
4398 xmlGenericError(xmlGenericErrorContext,
4399 "HPP: try COMMENT\n");break;
4400 case XML_PARSER_PROLOG:
4401 xmlGenericError(xmlGenericErrorContext,
4402 "HPP: try PROLOG\n");break;
4403 case XML_PARSER_START_TAG:
4404 xmlGenericError(xmlGenericErrorContext,
4405 "HPP: try START_TAG\n");break;
4406 case XML_PARSER_CONTENT:
4407 xmlGenericError(xmlGenericErrorContext,
4408 "HPP: try CONTENT\n");break;
4409 case XML_PARSER_CDATA_SECTION:
4410 xmlGenericError(xmlGenericErrorContext,
4411 "HPP: try CDATA_SECTION\n");break;
4412 case XML_PARSER_END_TAG:
4413 xmlGenericError(xmlGenericErrorContext,
4414 "HPP: try END_TAG\n");break;
4415 case XML_PARSER_ENTITY_DECL:
4416 xmlGenericError(xmlGenericErrorContext,
4417 "HPP: try ENTITY_DECL\n");break;
4418 case XML_PARSER_ENTITY_VALUE:
4419 xmlGenericError(xmlGenericErrorContext,
4420 "HPP: try ENTITY_VALUE\n");break;
4421 case XML_PARSER_ATTRIBUTE_VALUE:
4422 xmlGenericError(xmlGenericErrorContext,
4423 "HPP: try ATTRIBUTE_VALUE\n");break;
4424 case XML_PARSER_DTD:
4425 xmlGenericError(xmlGenericErrorContext,
4426 "HPP: try DTD\n");break;
4427 case XML_PARSER_EPILOG:
4428 xmlGenericError(xmlGenericErrorContext,
4429 "HPP: try EPILOG\n");break;
4430 case XML_PARSER_PI:
4431 xmlGenericError(xmlGenericErrorContext,
4432 "HPP: try PI\n");break;
4433 case XML_PARSER_SYSTEM_LITERAL:
4434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: try SYSTEM_LITERAL\n");break;
4436 }
4437#endif
4438
/*
 * Main loop: each iteration recomputes how many bytes lie between
 * in->cur and the end of the buffered data, then dispatches on
 * ctxt->instate. It is left via "goto done" or when ctxt->input is NULL.
 */
4439 while (1) {
4440
4441 in = ctxt->input;
4442 if (in == NULL) break;
4443 if (in->buf == NULL)
4444 avail = in->length - (in->cur - in->base);
4445 else
4446 avail = in->buf->buffer->use - (in->cur - in->base);
/* Input exhausted on the last chunk: call htmlAutoCloseOnEnd() and, if
 * the element name stack is empty, signal endDocument exactly once. */
4447 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004448 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004449 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4450 /*
4451 * SAX: end of the document processing.
4452 */
4453 ctxt->instate = XML_PARSER_EOF;
4454 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4455 ctxt->sax->endDocument(ctxt->userData);
4456 }
4457 }
4458 if (avail < 1)
4459 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004460 cur = in->cur[0];
/* A zero byte is stepped over with SKIP(1) and never reaches the
 * state switch below. */
4461 if (cur == 0) {
4462 SKIP(1);
4463 continue;
4464 }
4465
Owen Taylor3473f882001-02-23 17:55:21 +00004466 switch (ctxt->instate) {
4467 case XML_PARSER_EOF:
4468 /*
4469 * Document parsing is done !
4470 */
4471 goto done;
4472 case XML_PARSER_START:
4473 /*
4474 * Very first chars read from the document flow.
4475 */
4476 cur = in->cur[0];
4477 if (IS_BLANK(cur)) {
4478 SKIP_BLANKS;
4479 if (in->buf == NULL)
4480 avail = in->length - (in->cur - in->base);
4481 else
4482 avail = in->buf->buffer->use - (in->cur - in->base);
4483 }
4484 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4485 ctxt->sax->setDocumentLocator(ctxt->userData,
4486 &xmlDefaultSAXLocator);
4487 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4488 (!ctxt->disableSAX))
4489 ctxt->sax->startDocument(ctxt->userData);
4490
4491 cur = in->cur[0];
4492 next = in->cur[1];
4493 if ((cur == '<') && (next == '!') &&
4494 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4495 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4496 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4497 (UPP(8) == 'E')) {
4498 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004499 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004500 goto done;
4501#ifdef DEBUG_PUSH
4502 xmlGenericError(xmlGenericErrorContext,
4503 "HPP: Parsing internal subset\n");
4504#endif
4505 htmlParseDocTypeDecl(ctxt);
4506 ctxt->instate = XML_PARSER_PROLOG;
4507#ifdef DEBUG_PUSH
4508 xmlGenericError(xmlGenericErrorContext,
4509 "HPP: entering PROLOG\n");
4510#endif
4511 } else {
4512 ctxt->instate = XML_PARSER_MISC;
4513 }
/* NOTE(review): this trace sits after both branches above, so it is also
 * printed when the state was just set to XML_PARSER_PROLOG. */
4514#ifdef DEBUG_PUSH
4515 xmlGenericError(xmlGenericErrorContext,
4516 "HPP: entering MISC\n");
4517#endif
4518 break;
4519 case XML_PARSER_MISC:
4520 SKIP_BLANKS;
4521 if (in->buf == NULL)
4522 avail = in->length - (in->cur - in->base);
4523 else
4524 avail = in->buf->buffer->use - (in->cur - in->base);
4525 if (avail < 2)
4526 goto done;
4527 cur = in->cur[0];
4528 next = in->cur[1];
4529 if ((cur == '<') && (next == '!') &&
4530 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4531 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004532 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004533 goto done;
4534#ifdef DEBUG_PUSH
4535 xmlGenericError(xmlGenericErrorContext,
4536 "HPP: Parsing Comment\n");
4537#endif
4538 htmlParseComment(ctxt);
4539 ctxt->instate = XML_PARSER_MISC;
4540 } else if ((cur == '<') && (next == '!') &&
4541 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4542 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4543 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4544 (UPP(8) == 'E')) {
4545 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004546 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004547 goto done;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: Parsing internal subset\n");
4551#endif
4552 htmlParseDocTypeDecl(ctxt);
4553 ctxt->instate = XML_PARSER_PROLOG;
4554#ifdef DEBUG_PUSH
4555 xmlGenericError(xmlGenericErrorContext,
4556 "HPP: entering PROLOG\n");
4557#endif
/* "<!" seen with fewer than 9 bytes buffered (9 is the length of
 * "<!DOCTYPE"): presumably too early to classify, so wait for more. */
4558 } else if ((cur == '<') && (next == '!') &&
4559 (avail < 9)) {
4560 goto done;
4561 } else {
4562 ctxt->instate = XML_PARSER_START_TAG;
4563#ifdef DEBUG_PUSH
4564 xmlGenericError(xmlGenericErrorContext,
4565 "HPP: entering START_TAG\n");
4566#endif
4567 }
4568 break;
4569 case XML_PARSER_PROLOG:
4570 SKIP_BLANKS;
4571 if (in->buf == NULL)
4572 avail = in->length - (in->cur - in->base);
4573 else
4574 avail = in->buf->buffer->use - (in->cur - in->base);
4575 if (avail < 2)
4576 goto done;
4577 cur = in->cur[0];
4578 next = in->cur[1];
4579 if ((cur == '<') && (next == '!') &&
4580 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4581 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004582 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004583 goto done;
4584#ifdef DEBUG_PUSH
4585 xmlGenericError(xmlGenericErrorContext,
4586 "HPP: Parsing Comment\n");
4587#endif
4588 htmlParseComment(ctxt);
4589 ctxt->instate = XML_PARSER_PROLOG;
4590 } else if ((cur == '<') && (next == '!') &&
4591 (avail < 4)) {
4592 goto done;
4593 } else {
4594 ctxt->instate = XML_PARSER_START_TAG;
4595#ifdef DEBUG_PUSH
4596 xmlGenericError(xmlGenericErrorContext,
4597 "HPP: entering START_TAG\n");
4598#endif
4599 }
4600 break;
4601 case XML_PARSER_EPILOG:
4602 if (in->buf == NULL)
4603 avail = in->length - (in->cur - in->base);
4604 else
4605 avail = in->buf->buffer->use - (in->cur - in->base);
4606 if (avail < 1)
4607 goto done;
4608 cur = in->cur[0];
4609 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004610 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004611 goto done;
4612 }
4613 if (avail < 2)
4614 goto done;
4615 next = in->cur[1];
4616 if ((cur == '<') && (next == '!') &&
4617 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4618 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004619 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004620 goto done;
4621#ifdef DEBUG_PUSH
4622 xmlGenericError(xmlGenericErrorContext,
4623 "HPP: Parsing Comment\n");
4624#endif
4625 htmlParseComment(ctxt);
4626 ctxt->instate = XML_PARSER_EPILOG;
4627 } else if ((cur == '<') && (next == '!') &&
4628 (avail < 4)) {
4629 goto done;
4630 } else {
/* Anything other than blanks or a comment in the epilog is flagged as
 * XML_ERR_DOCUMENT_END and ends the document. */
4631 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004632 ctxt->wellFormed = 0;
4633 ctxt->instate = XML_PARSER_EOF;
4634#ifdef DEBUG_PUSH
4635 xmlGenericError(xmlGenericErrorContext,
4636 "HPP: entering EOF\n");
4637#endif
4638 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4639 ctxt->sax->endDocument(ctxt->userData);
4640 goto done;
4641 }
4642 break;
4643 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004644 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004645 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004646 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004647
4648 if (avail < 2)
4649 goto done;
4650 cur = in->cur[0];
4651 if (cur != '<') {
4652 ctxt->instate = XML_PARSER_CONTENT;
4653#ifdef DEBUG_PUSH
4654 xmlGenericError(xmlGenericErrorContext,
4655 "HPP: entering CONTENT\n");
4656#endif
4657 break;
4658 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004659 if (in->cur[1] == '/') {
4660 ctxt->instate = XML_PARSER_END_TAG;
4661 ctxt->checkIndex = 0;
4662#ifdef DEBUG_PUSH
4663 xmlGenericError(xmlGenericErrorContext,
4664 "HPP: entering END_TAG\n");
4665#endif
4666 break;
4667 }
Owen Taylor3473f882001-02-23 17:55:21 +00004668 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004669 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004670 goto done;
4671
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004672 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004673 htmlParseStartTag(ctxt);
4674 name = ctxt->name;
4675#ifdef DEBUG
4676 if (oldname == NULL)
4677 xmlGenericError(xmlGenericErrorContext,
4678 "Start of element %s\n", name);
4679 else if (name == NULL)
4680 xmlGenericError(xmlGenericErrorContext,
4681 "Start of element failed, was %s\n",
4682 oldname);
4683 else
4684 xmlGenericError(xmlGenericErrorContext,
4685 "Start of element %s, was %s\n",
4686 name, oldname);
4687#endif
/* Same stack depth and same current name as before the call, or no
 * current name at all: no new element was pushed. Step over a '>' if
 * one is at the cursor and stay in this state. */
4688 if (((depth == ctxt->nameNr) &&
4689 (xmlStrEqual(oldname, ctxt->name))) ||
4690 (name == NULL)) {
4691 if (CUR == '>')
4692 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004693 break;
4694 }
Owen Taylor3473f882001-02-23 17:55:21 +00004695
4696 /*
4697 * Lookup the info for that element.
4698 */
4699 info = htmlTagLookup(name);
4700 if (info == NULL) {
4701 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4702 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4703 name);
4704 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004705 }
4706
4707 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004708 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004709 */
4710 if ((CUR == '/') && (NXT(1) == '>')) {
4711 SKIP(2);
4712 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4713 ctxt->sax->endElement(ctxt->userData, name);
4714 oldname = htmlnamePop(ctxt);
4715#ifdef DEBUG
4716 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4717 oldname);
4718#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004719 ctxt->instate = XML_PARSER_CONTENT;
4720#ifdef DEBUG_PUSH
4721 xmlGenericError(xmlGenericErrorContext,
4722 "HPP: entering CONTENT\n");
4723#endif
4724 break;
4725 }
4726
4727 if (CUR == '>') {
4728 NEXT;
4729 } else {
4730 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4731 ctxt->sax->error(ctxt->userData,
4732 "Couldn't find end of Start Tag %s\n",
4733 name);
4734 ctxt->wellFormed = 0;
4735
4736 /*
4737 * end of parsing of this node.
4738 */
4739 if (xmlStrEqual(name, ctxt->name)) {
4740 nodePop(ctxt);
4741 oldname = htmlnamePop(ctxt);
4742#ifdef DEBUG
4743 xmlGenericError(xmlGenericErrorContext,
4744 "End of start tag problem: popping out %s\n", oldname);
4745#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004746 }
4747
4748 ctxt->instate = XML_PARSER_CONTENT;
4749#ifdef DEBUG_PUSH
4750 xmlGenericError(xmlGenericErrorContext,
4751 "HPP: entering CONTENT\n");
4752#endif
4753 break;
4754 }
4755
4756 /*
4757 * Check for an Empty Element from DTD definition
4758 */
4759 if ((info != NULL) && (info->empty)) {
4760 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4761 ctxt->sax->endElement(ctxt->userData, name);
4762 oldname = htmlnamePop(ctxt);
4763#ifdef DEBUG
4764 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4765#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004766 }
4767 ctxt->instate = XML_PARSER_CONTENT;
4768#ifdef DEBUG_PUSH
4769 xmlGenericError(xmlGenericErrorContext,
4770 "HPP: entering CONTENT\n");
4771#endif
4772 break;
4773 }
4774 case XML_PARSER_CONTENT: {
4775 long cons;
4776 /*
4777 * Handle preparsed entities and charRef
4778 */
4779 if (ctxt->token != 0) {
4780 xmlChar chr[2] = { 0 , 0 } ;
4781
4782 chr[0] = (xmlChar) ctxt->token;
4783 htmlCheckParagraph(ctxt);
4784 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4785 ctxt->sax->characters(ctxt->userData, chr, 1);
4786 ctxt->token = 0;
4787 ctxt->checkIndex = 0;
4788 }
/* Exactly one byte left on the final chunk: if it is neither '<' nor
 * '&' it is delivered directly to the SAX handler and consumed. */
4789 if ((avail == 1) && (terminate)) {
4790 cur = in->cur[0];
4791 if ((cur != '<') && (cur != '&')) {
4792 if (ctxt->sax != NULL) {
4793 if (IS_BLANK(cur)) {
4794 if (ctxt->sax->ignorableWhitespace != NULL)
4795 ctxt->sax->ignorableWhitespace(
4796 ctxt->userData, &cur, 1);
4797 } else {
4798 htmlCheckParagraph(ctxt);
4799 if (ctxt->sax->characters != NULL)
4800 ctxt->sax->characters(
4801 ctxt->userData, &cur, 1);
4802 }
4803 }
4804 ctxt->token = 0;
4805 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004806 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004807 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004808 }
Owen Taylor3473f882001-02-23 17:55:21 +00004809 }
4810 if (avail < 2)
4811 goto done;
4812 cur = in->cur[0];
4813 next = in->cur[1];
/* Sample nbChars so the check after parsing can tell whether it moved. */
4814 cons = ctxt->nbChars;
4815 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4816 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4817 /*
4818 * Handle SCRIPT/STYLE separately
4819 */
4820 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004821 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004822 goto done;
4823 htmlParseScript(ctxt);
/* cur and next were read before htmlParseScript() ran, so this tests the
 * two bytes that were at the cursor on entry to this state. */
4824 if ((cur == '<') && (next == '/')) {
4825 ctxt->instate = XML_PARSER_END_TAG;
4826 ctxt->checkIndex = 0;
4827#ifdef DEBUG_PUSH
4828 xmlGenericError(xmlGenericErrorContext,
4829 "HPP: entering END_TAG\n");
4830#endif
4831 break;
4832 }
4833 } else {
4834 /*
4835 * Sometimes DOCTYPE arrives in the middle of the document
4836 */
4837 if ((cur == '<') && (next == '!') &&
4838 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4839 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4840 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4841 (UPP(8) == 'E')) {
4842 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004843 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004844 goto done;
4845 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4846 ctxt->sax->error(ctxt->userData,
4847 "Misplaced DOCTYPE declaration\n");
4848 ctxt->wellFormed = 0;
4849 htmlParseDocTypeDecl(ctxt);
4850 } else if ((cur == '<') && (next == '!') &&
4851 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4852 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004853 (htmlParseLookupSequence(
4854 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004855 goto done;
4856#ifdef DEBUG_PUSH
4857 xmlGenericError(xmlGenericErrorContext,
4858 "HPP: Parsing Comment\n");
4859#endif
4860 htmlParseComment(ctxt);
4861 ctxt->instate = XML_PARSER_CONTENT;
4862 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4863 goto done;
4864 } else if ((cur == '<') && (next == '/')) {
4865 ctxt->instate = XML_PARSER_END_TAG;
4866 ctxt->checkIndex = 0;
4867#ifdef DEBUG_PUSH
4868 xmlGenericError(xmlGenericErrorContext,
4869 "HPP: entering END_TAG\n");
4870#endif
4871 break;
4872 } else if (cur == '<') {
4873 ctxt->instate = XML_PARSER_START_TAG;
4874 ctxt->checkIndex = 0;
4875#ifdef DEBUG_PUSH
4876 xmlGenericError(xmlGenericErrorContext,
4877 "HPP: entering START_TAG\n");
4878#endif
4879 break;
4880 } else if (cur == '&') {
4881 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004882 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004883 goto done;
4884#ifdef DEBUG_PUSH
4885 xmlGenericError(xmlGenericErrorContext,
4886 "HPP: Parsing Reference\n");
4887#endif
4888 /* TODO: check generation of subtrees if noent !!! */
4889 htmlParseReference(ctxt);
4890 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004891 /*
4892 * check that the text sequence is complete
4893 * before handing out the data to the parser
4894 * to avoid problems with erroneous end of
4895 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004896 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004897 if ((!terminate) &&
4898 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4899 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004900 ctxt->checkIndex = 0;
4901#ifdef DEBUG_PUSH
4902 xmlGenericError(xmlGenericErrorContext,
4903 "HPP: Parsing char data\n");
4904#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004905 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004906 }
4907 }
/* ctxt->nbChars still equals the value sampled into cons above: report
 * an error when inside a node and step over one character with NEXT. */
4908 if (cons == ctxt->nbChars) {
4909 if (ctxt->node != NULL) {
4910 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4911 ctxt->sax->error(ctxt->userData,
4912 "detected an error in element content\n");
4913 ctxt->wellFormed = 0;
4914 }
4915 NEXT;
4916 break;
4917 }
4918
4919 break;
4920 }
4921 case XML_PARSER_END_TAG:
4922 if (avail < 2)
4923 goto done;
4924 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004925 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004926 goto done;
4927 htmlParseEndTag(ctxt);
/* An empty element name stack after the end tag moves to EPILOG;
 * otherwise parsing continues in CONTENT. */
4928 if (ctxt->nameNr == 0) {
4929 ctxt->instate = XML_PARSER_EPILOG;
4930 } else {
4931 ctxt->instate = XML_PARSER_CONTENT;
4932 }
4933 ctxt->checkIndex = 0;
4934#ifdef DEBUG_PUSH
4935 xmlGenericError(xmlGenericErrorContext,
4936 "HPP: entering CONTENT\n");
4937#endif
4938 break;
/*
 * None of the remaining states is handled by this function: each one
 * reports an internal error, resets checkIndex and falls back to
 * XML_PARSER_CONTENT (XML_PARSER_START_TAG for ATTRIBUTE_VALUE).
 */
4939 case XML_PARSER_CDATA_SECTION:
4940 xmlGenericError(xmlGenericErrorContext,
4941 "HPP: internal error, state == CDATA\n");
4942 ctxt->instate = XML_PARSER_CONTENT;
4943 ctxt->checkIndex = 0;
4944#ifdef DEBUG_PUSH
4945 xmlGenericError(xmlGenericErrorContext,
4946 "HPP: entering CONTENT\n");
4947#endif
4948 break;
4949 case XML_PARSER_DTD:
4950 xmlGenericError(xmlGenericErrorContext,
4951 "HPP: internal error, state == DTD\n");
4952 ctxt->instate = XML_PARSER_CONTENT;
4953 ctxt->checkIndex = 0;
4954#ifdef DEBUG_PUSH
4955 xmlGenericError(xmlGenericErrorContext,
4956 "HPP: entering CONTENT\n");
4957#endif
4958 break;
4959 case XML_PARSER_COMMENT:
4960 xmlGenericError(xmlGenericErrorContext,
4961 "HPP: internal error, state == COMMENT\n");
4962 ctxt->instate = XML_PARSER_CONTENT;
4963 ctxt->checkIndex = 0;
4964#ifdef DEBUG_PUSH
4965 xmlGenericError(xmlGenericErrorContext,
4966 "HPP: entering CONTENT\n");
4967#endif
4968 break;
4969 case XML_PARSER_PI:
4970 xmlGenericError(xmlGenericErrorContext,
4971 "HPP: internal error, state == PI\n");
4972 ctxt->instate = XML_PARSER_CONTENT;
4973 ctxt->checkIndex = 0;
4974#ifdef DEBUG_PUSH
4975 xmlGenericError(xmlGenericErrorContext,
4976 "HPP: entering CONTENT\n");
4977#endif
4978 break;
4979 case XML_PARSER_ENTITY_DECL:
4980 xmlGenericError(xmlGenericErrorContext,
4981 "HPP: internal error, state == ENTITY_DECL\n");
4982 ctxt->instate = XML_PARSER_CONTENT;
4983 ctxt->checkIndex = 0;
4984#ifdef DEBUG_PUSH
4985 xmlGenericError(xmlGenericErrorContext,
4986 "HPP: entering CONTENT\n");
4987#endif
4988 break;
4989 case XML_PARSER_ENTITY_VALUE:
4990 xmlGenericError(xmlGenericErrorContext,
4991 "HPP: internal error, state == ENTITY_VALUE\n");
4992 ctxt->instate = XML_PARSER_CONTENT;
4993 ctxt->checkIndex = 0;
/* NOTE(review): the trace below says "entering DTD" although the state
 * was just set to XML_PARSER_CONTENT. */
4994#ifdef DEBUG_PUSH
4995 xmlGenericError(xmlGenericErrorContext,
4996 "HPP: entering DTD\n");
4997#endif
4998 break;
4999 case XML_PARSER_ATTRIBUTE_VALUE:
5000 xmlGenericError(xmlGenericErrorContext,
5001 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5002 ctxt->instate = XML_PARSER_START_TAG;
5003 ctxt->checkIndex = 0;
5004#ifdef DEBUG_PUSH
5005 xmlGenericError(xmlGenericErrorContext,
5006 "HPP: entering START_TAG\n");
5007#endif
5008 break;
5009 case XML_PARSER_SYSTEM_LITERAL:
5010 xmlGenericError(xmlGenericErrorContext,
5011 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5012 ctxt->instate = XML_PARSER_CONTENT;
5013 ctxt->checkIndex = 0;
5014#ifdef DEBUG_PUSH
5015 xmlGenericError(xmlGenericErrorContext,
5016 "HPP: entering CONTENT\n");
5017#endif
5018 break;
5019 case XML_PARSER_IGNORE:
5020 xmlGenericError(xmlGenericErrorContext,
5021 "HPP: internal error, state == XML_PARSER_IGNORE\n");
5022 ctxt->instate = XML_PARSER_CONTENT;
5023 ctxt->checkIndex = 0;
5024#ifdef DEBUG_PUSH
5025 xmlGenericError(xmlGenericErrorContext,
5026 "HPP: entering CONTENT\n");
5027#endif
5028 break;
/* NOTE(review): the message for this case names XML_PARSER_LITERAL, not
 * XML_PARSER_PUBLIC_LITERAL. */
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005029 case XML_PARSER_PUBLIC_LITERAL:
5030 xmlGenericError(xmlGenericErrorContext,
5031 "HPP: internal error, state == XML_PARSER_LITERAL\n");
5032 ctxt->instate = XML_PARSER_CONTENT;
5033 ctxt->checkIndex = 0;
5034#ifdef DEBUG_PUSH
5035 xmlGenericError(xmlGenericErrorContext,
5036 "HPP: entering CONTENT\n");
5037#endif
5038 break;
5039
Owen Taylor3473f882001-02-23 17:55:21 +00005040 }
5041 }
/* Common exit: repeat the end-of-input handling, since avail may have
 * reached zero inside the state switch. */
5042done:
5043 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005044 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005045 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5046 /*
5047 * SAX: end of the document processing.
5048 */
5049 ctxt->instate = XML_PARSER_EOF;
5050 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5051 ctxt->sax->endDocument(ctxt->userData);
5052 }
5053 }
/* When a document exists and parsing is terminating (or is in EOF or
 * EPILOG), make sure it has an internal subset: if xmlGetIntSubset()
 * finds none, create the HTML 4.0 Transitional one. */
5054 if ((ctxt->myDoc != NULL) &&
5055 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5056 (ctxt->instate == XML_PARSER_EPILOG))) {
5057 xmlDtdPtr dtd;
5058 dtd = xmlGetIntSubset(ctxt->myDoc);
5059 if (dtd == NULL)
5060 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005061 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005062 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5063 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5064 }
5065#ifdef DEBUG_PUSH
5066 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5067#endif
5068 return(ret);
5069}
5070
5071/**
Owen Taylor3473f882001-02-23 17:55:21 +00005072 * htmlParseChunk:
5073 * @ctxt: an XML parser context
5074 * @chunk: an char array
5075 * @size: the size in byte of the chunk
5076 * @terminate: last chunk indicator
5077 *
5078 * Parse a Chunk of memory
5079 *
5080 * Returns zero if no error, the xmlParserErrors otherwise.
5081 */
5082int
5083htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5084 int terminate) {
5085 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5086 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5087 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5088 int cur = ctxt->input->cur - ctxt->input->base;
5089
5090 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5091 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5092 ctxt->input->cur = ctxt->input->base + cur;
5093#ifdef DEBUG_PUSH
5094 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5095#endif
5096
Daniel Veillard14f752c2003-08-09 11:44:50 +00005097#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005098 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5099 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005100#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005101 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005102 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5103 xmlParserInputBufferPtr in = ctxt->input->buf;
5104 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5105 (in->raw != NULL)) {
5106 int nbchars;
5107
5108 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5109 if (nbchars < 0) {
5110 xmlGenericError(xmlGenericErrorContext,
5111 "htmlParseChunk: encoder error\n");
5112 return(XML_ERR_INVALID_ENCODING);
5113 }
5114 }
5115 }
Owen Taylor3473f882001-02-23 17:55:21 +00005116 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005117 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005118 if (terminate) {
5119 if ((ctxt->instate != XML_PARSER_EOF) &&
5120 (ctxt->instate != XML_PARSER_EPILOG) &&
5121 (ctxt->instate != XML_PARSER_MISC)) {
5122 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005123 ctxt->wellFormed = 0;
5124 }
5125 if (ctxt->instate != XML_PARSER_EOF) {
5126 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5127 ctxt->sax->endDocument(ctxt->userData);
5128 }
5129 ctxt->instate = XML_PARSER_EOF;
5130 }
5131 return((xmlParserErrors) ctxt->errNo);
5132}
5133
5134/************************************************************************
5135 * *
5136 * User entry points *
5137 * *
5138 ************************************************************************/
5139
5140/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005141 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005142 * @sax: a SAX handler
5143 * @user_data: The user data returned on SAX callbacks
5144 * @chunk: a pointer to an array of chars
5145 * @size: number of chars in the array
5146 * @filename: an optional file name or URI
5147 * @enc: an optional encoding
5148 *
5149 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005150 * The value of @filename is used for fetching external entities
5151 * and error/warning reports.
5152 *
5153 * Returns the new parser context or NULL
5154 */
5155htmlParserCtxtPtr
5156htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5157 const char *chunk, int size, const char *filename,
5158 xmlCharEncoding enc) {
5159 htmlParserCtxtPtr ctxt;
5160 htmlParserInputPtr inputStream;
5161 xmlParserInputBufferPtr buf;
5162
Daniel Veillardd0463562001-10-13 09:15:48 +00005163 xmlInitParser();
5164
Owen Taylor3473f882001-02-23 17:55:21 +00005165 buf = xmlAllocParserInputBuffer(enc);
5166 if (buf == NULL) return(NULL);
5167
5168 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5169 if (ctxt == NULL) {
5170 xmlFree(buf);
5171 return(NULL);
5172 }
5173 memset(ctxt, 0, sizeof(htmlParserCtxt));
5174 htmlInitParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005175 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5176 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005177 if (sax != NULL) {
5178 if (ctxt->sax != &htmlDefaultSAXHandler)
5179 xmlFree(ctxt->sax);
5180 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5181 if (ctxt->sax == NULL) {
5182 xmlFree(buf);
5183 xmlFree(ctxt);
5184 return(NULL);
5185 }
5186 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5187 if (user_data != NULL)
5188 ctxt->userData = user_data;
5189 }
5190 if (filename == NULL) {
5191 ctxt->directory = NULL;
5192 } else {
5193 ctxt->directory = xmlParserGetDirectory(filename);
5194 }
5195
5196 inputStream = htmlNewInputStream(ctxt);
5197 if (inputStream == NULL) {
5198 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005199 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005200 return(NULL);
5201 }
5202
5203 if (filename == NULL)
5204 inputStream->filename = NULL;
5205 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005206 inputStream->filename = (char *)
5207 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005208 inputStream->buf = buf;
5209 inputStream->base = inputStream->buf->buffer->content;
5210 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005211 inputStream->end =
5212 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005213
5214 inputPush(ctxt, inputStream);
5215
5216 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5217 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005218 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5219 int cur = ctxt->input->cur - ctxt->input->base;
5220
Owen Taylor3473f882001-02-23 17:55:21 +00005221 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005222
5223 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5224 ctxt->input->cur = ctxt->input->base + cur;
5225 ctxt->input->end =
5226 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005227#ifdef DEBUG_PUSH
5228 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5229#endif
5230 }
5231
5232 return(ctxt);
5233}
5234
5235/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005236 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005237 * @cur: a pointer to an array of xmlChar
5238 * @encoding: a free form C string describing the HTML document encoding, or NULL
5239 * @sax: the SAX handler block
5240 * @userData: if using SAX, this pointer will be provided on callbacks.
5241 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005242 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5243 * to handle parse events. If sax is NULL, fallback to the default DOM
5244 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005245 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005246 * Returns the resulting document tree unless SAX is NULL or the document is
5247 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005248 */
5249
5250htmlDocPtr
5251htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5252 htmlDocPtr ret;
5253 htmlParserCtxtPtr ctxt;
5254
Daniel Veillardd0463562001-10-13 09:15:48 +00005255 xmlInitParser();
5256
Owen Taylor3473f882001-02-23 17:55:21 +00005257 if (cur == NULL) return(NULL);
5258
5259
5260 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5261 if (ctxt == NULL) return(NULL);
5262 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005263 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005264 ctxt->sax = sax;
5265 ctxt->userData = userData;
5266 }
5267
5268 htmlParseDocument(ctxt);
5269 ret = ctxt->myDoc;
5270 if (sax != NULL) {
5271 ctxt->sax = NULL;
5272 ctxt->userData = NULL;
5273 }
5274 htmlFreeParserCtxt(ctxt);
5275
5276 return(ret);
5277}
5278
5279/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005280 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005281 * @cur: a pointer to an array of xmlChar
5282 * @encoding: a free form C string describing the HTML document encoding, or NULL
5283 *
5284 * parse an HTML in-memory document and build a tree.
5285 *
5286 * Returns the resulting document tree
5287 */
5288
5289htmlDocPtr
5290htmlParseDoc(xmlChar *cur, const char *encoding) {
5291 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5292}
5293
5294
5295/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005296 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005297 * @filename: the filename
5298 * @encoding: a free form C string describing the HTML document encoding, or NULL
5299 *
5300 * Create a parser context for a file content.
5301 * Automatic support for ZLIB/Compress compressed document is provided
5302 * by default if found at compile-time.
5303 *
5304 * Returns the new parser context or NULL
5305 */
5306htmlParserCtxtPtr
5307htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5308{
5309 htmlParserCtxtPtr ctxt;
5310 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005311 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005312 /* htmlCharEncoding enc; */
5313 xmlChar *content, *content_line = (xmlChar *) "charset=";
5314
Owen Taylor3473f882001-02-23 17:55:21 +00005315 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5316 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005317 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005318 return(NULL);
5319 }
5320 memset(ctxt, 0, sizeof(htmlParserCtxt));
5321 htmlInitParserCtxt(ctxt);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005322 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5323 if (canonicFilename == NULL) {
5324 if (xmlDefaultSAXHandler.error != NULL) {
5325 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5326 }
Daniel Veillard104caa32003-05-13 22:54:05 +00005327 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005328 return(NULL);
5329 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005330
5331 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5332 xmlFree(canonicFilename);
5333 if (inputStream == NULL) {
5334 xmlFreeParserCtxt(ctxt);
5335 return(NULL);
5336 }
Owen Taylor3473f882001-02-23 17:55:21 +00005337
5338 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005339
Owen Taylor3473f882001-02-23 17:55:21 +00005340 /* set encoding */
5341 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005342 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005343 if (content) {
5344 strcpy ((char *)content, (char *)content_line);
5345 strcat ((char *)content, (char *)encoding);
5346 htmlCheckEncoding (ctxt, content);
5347 xmlFree (content);
5348 }
5349 }
5350
5351 return(ctxt);
5352}
5353
5354/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005355 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005356 * @filename: the filename
5357 * @encoding: a free form C string describing the HTML document encoding, or NULL
5358 * @sax: the SAX handler block
5359 * @userData: if using SAX, this pointer will be provided on callbacks.
5360 *
5361 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5362 * compressed document is provided by default if found at compile-time.
5363 * It use the given SAX function block to handle the parsing callback.
5364 * If sax is NULL, fallback to the default DOM tree building routines.
5365 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005366 * Returns the resulting document tree unless SAX is NULL or the document is
5367 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005368 */
5369
5370htmlDocPtr
5371htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5372 void *userData) {
5373 htmlDocPtr ret;
5374 htmlParserCtxtPtr ctxt;
5375 htmlSAXHandlerPtr oldsax = NULL;
5376
Daniel Veillardd0463562001-10-13 09:15:48 +00005377 xmlInitParser();
5378
Owen Taylor3473f882001-02-23 17:55:21 +00005379 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5380 if (ctxt == NULL) return(NULL);
5381 if (sax != NULL) {
5382 oldsax = ctxt->sax;
5383 ctxt->sax = sax;
5384 ctxt->userData = userData;
5385 }
5386
5387 htmlParseDocument(ctxt);
5388
5389 ret = ctxt->myDoc;
5390 if (sax != NULL) {
5391 ctxt->sax = oldsax;
5392 ctxt->userData = NULL;
5393 }
5394 htmlFreeParserCtxt(ctxt);
5395
5396 return(ret);
5397}
5398
5399/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005400 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005401 * @filename: the filename
5402 * @encoding: a free form C string describing the HTML document encoding, or NULL
5403 *
5404 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5405 * compressed document is provided by default if found at compile-time.
5406 *
5407 * Returns the resulting document tree
5408 */
5409
5410htmlDocPtr
5411htmlParseFile(const char *filename, const char *encoding) {
5412 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5413}
5414
5415/**
5416 * htmlHandleOmittedElem:
5417 * @val: int 0 or 1
5418 *
5419 * Set and return the previous value for handling HTML omitted tags.
5420 *
5421 * Returns the last value for 0 for no handling, 1 for auto insertion.
5422 */
5423
5424int
5425htmlHandleOmittedElem(int val) {
5426 int old = htmlOmittedDefaultValue;
5427
5428 htmlOmittedDefaultValue = val;
5429 return(old);
5430}
5431
Daniel Veillard930dfb62003-02-05 10:17:38 +00005432/**
5433 * htmlElementAllowedHere:
5434 * @parent: HTML parent element
5435 * @elt: HTML element
5436 *
5437 * Checks whether an HTML element may be a direct child of a parent element.
5438 * Note - doesn't check for deprecated elements
5439 *
5440 * Returns 1 if allowed; 0 otherwise.
5441 */
5442int
5443htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5444 const char** p ;
5445
5446 if ( ! elt || ! parent || ! parent->subelts )
5447 return 0 ;
5448
5449 for ( p = parent->subelts; *p; ++p )
5450 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5451 return 1 ;
5452
5453 return 0 ;
5454}
5455/**
5456 * htmlElementStatusHere:
5457 * @parent: HTML parent element
5458 * @elt: HTML element
5459 *
5460 * Checks whether an HTML element may be a direct child of a parent element.
5461 * and if so whether it is valid or deprecated.
5462 *
5463 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5464 */
5465htmlStatus
5466htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5467 if ( ! parent || ! elt )
5468 return HTML_INVALID ;
5469 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5470 return HTML_INVALID ;
5471
5472 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5473}
5474/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005475 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005476 * @elt: HTML element
5477 * @attr: HTML attribute
5478 * @legacy: whether to allow deprecated attributes
5479 *
5480 * Checks whether an attribute is valid for an element
5481 * Has full knowledge of Required and Deprecated attributes
5482 *
5483 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5484 */
5485htmlStatus
5486htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5487 const char** p ;
5488
5489 if ( !elt || ! attr )
5490 return HTML_INVALID ;
5491
5492 if ( elt->attrs_req )
5493 for ( p = elt->attrs_req; *p; ++p)
5494 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5495 return HTML_REQUIRED ;
5496
5497 if ( elt->attrs_opt )
5498 for ( p = elt->attrs_opt; *p; ++p)
5499 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5500 return HTML_VALID ;
5501
5502 if ( legacy && elt->attrs_depr )
5503 for ( p = elt->attrs_depr; *p; ++p)
5504 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5505 return HTML_DEPRECATED ;
5506
5507 return HTML_INVALID ;
5508}
5509/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005510 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005511 * @node: an htmlNodePtr in a tree
5512 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005513 * for Element nodes)
5514 *
5515 * Checks whether the tree node is valid. Experimental (the author
5516 * only uses the HTML enhancements in a SAX parser)
5517 *
5518 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5519 * legacy allowed) or htmlElementStatusHere (otherwise).
5520 * for Attribute nodes, a return from htmlAttrAllowed
5521 * for other nodes, HTML_NA (no checks performed)
5522 */
5523htmlStatus
5524htmlNodeStatus(const htmlNodePtr node, int legacy) {
5525 if ( ! node )
5526 return HTML_INVALID ;
5527
5528 switch ( node->type ) {
5529 case XML_ELEMENT_NODE:
5530 return legacy
5531 ? ( htmlElementAllowedHere (
5532 htmlTagLookup(node->parent->name) , node->name
5533 ) ? HTML_VALID : HTML_INVALID )
5534 : htmlElementStatusHere(
5535 htmlTagLookup(node->parent->name) ,
5536 htmlTagLookup(node->name) )
5537 ;
5538 case XML_ATTRIBUTE_NODE:
5539 return htmlAttrAllowed(
5540 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5541 default: return HTML_NA ;
5542 }
5543}
Owen Taylor3473f882001-02-23 17:55:21 +00005544#endif /* LIBXML_HTML_ENABLED */