blob: 1606547d4634c3a028bd5d717d275cb71effb71e [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Owen Taylor3473f882001-02-23 17:55:21 +000062 * Parser stacks related functions and macros *
63 * *
64 ************************************************************************/
65
Daniel Veillard1c732d22002-11-30 11:22:59 +000066/**
67 * htmlnamePush:
68 * @ctxt: an HTML parser context
69 * @value: the element name
70 *
71 * Pushes a new element name on top of the name stack
72 *
73 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000074 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000075static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +000076htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +000077{
78 if (ctxt->nameNr >= ctxt->nameMax) {
79 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +000080 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +000081 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +000082 ctxt->nameMax *
83 sizeof(ctxt->nameTab[0]));
84 if (ctxt->nameTab == NULL) {
85 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
86 return (0);
87 }
88 }
89 ctxt->nameTab[ctxt->nameNr] = value;
90 ctxt->name = value;
91 return (ctxt->nameNr++);
92}
93/**
94 * htmlnamePop:
95 * @ctxt: an HTML parser context
96 *
97 * Pops the top element name from the name stack
98 *
99 * Returns the name just removed
100 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000101static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000102htmlnamePop(htmlParserCtxtPtr ctxt)
103{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000104 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000105
Daniel Veillard1c732d22002-11-30 11:22:59 +0000106 if (ctxt->nameNr <= 0)
107 return (0);
108 ctxt->nameNr--;
109 if (ctxt->nameNr < 0)
110 return (0);
111 if (ctxt->nameNr > 0)
112 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
113 else
114 ctxt->name = NULL;
115 ret = ctxt->nameTab[ctxt->nameNr];
116 ctxt->nameTab[ctxt->nameNr] = 0;
117 return (ret);
118}
Owen Taylor3473f882001-02-23 17:55:21 +0000119
120/*
121 * Macros for accessing the content. Those should be used only by the parser,
122 * and not exported.
123 *
124 * Dirty macros, i.e. one need to make assumption on the context to use them
125 *
126 * CUR_PTR return the current pointer to the xmlChar to be parsed.
127 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
128 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
129 * in UNICODE mode. This should be used internally by the parser
130 * only to compare to ASCII values otherwise it would break when
131 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Like CUR, it should be used only
 *           to compare against ASCII based substrings.
134 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
135 * it should be used only to compare on ASCII based substring.
136 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000137 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000138 *
139 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
140 *
141 * CURRENT Returns the current char value, with the full decoding of
142 * UTF-8 if we are using this mode. It returns an int.
143 * NEXT Skip to the next character, this does the proper decoding
144 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000145 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000146 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
147 */
148
/*
 * All of the macros below expect a variable named `ctxt` (a pointer to the
 * parser context) to be in scope at the point of use.
 */

/* Current byte, converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by (val) bytes, keeping nbChars and col in step. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

/* The byte (val) positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer (usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a token is pending in the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
176
177
/*
 * NEXTL(l): step over the current character, which occupies l xmlChars.
 * Line/column tracking is updated from the byte being left, any pending
 * token is cleared and the character counter is bumped by one.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) != '\n') {					\
	ctxt->input->col++;						\
    } else {								\
	ctxt->input->line++;						\
	ctxt->input->col = 1;						\
    }									\
    ctxt->input->cur += l;						\
    ctxt->nbChars++;							\
    ctxt->token = 0;							\
  } while (0)
184
185/************
186 \
187 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
188 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
189 ************/
190
/* Decode the character at the current input position; its length goes to l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Same, but decoding from the string s instead of the parser input. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l,b,i,v): append the character v, whose encoded length is l,
 * to buffer b at index i and advance i accordingly.
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * including when used as the body of an unbraced if/else.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
197
198/**
199 * htmlCurrentChar:
200 * @ctxt: the HTML parser context
201 * @len: pointer to the length of the char read
202 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * bytes in the input buffer. Implement the end of line normalization:
205 * 2.11 End-of-Line Handling
206 * If the encoding is unspecified, in the case we find an ISO-Latin-1
207 * char, then the encoding converter is plugged in automatically.
208 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000209 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000210 */
211
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000212static int
Owen Taylor3473f882001-02-23 17:55:21 +0000213htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
214 if (ctxt->instate == XML_PARSER_EOF)
215 return(0);
216
217 if (ctxt->token != 0) {
218 *len = 0;
219 return(ctxt->token);
220 }
221 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
222 /*
223 * We are supposed to handle UTF8, check it's valid
224 * From rfc2044: encoding of the Unicode values on UTF-8:
225 *
226 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
227 * 0000 0000-0000 007F 0xxxxxxx
228 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
229 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
230 *
231 * Check for the 0x110000 limit too
232 */
233 const unsigned char *cur = ctxt->input->cur;
234 unsigned char c;
235 unsigned int val;
236
237 c = *cur;
238 if (c & 0x80) {
239 if (cur[1] == 0)
240 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
241 if ((cur[1] & 0xc0) != 0x80)
242 goto encoding_error;
243 if ((c & 0xe0) == 0xe0) {
244
245 if (cur[2] == 0)
246 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
247 if ((cur[2] & 0xc0) != 0x80)
248 goto encoding_error;
249 if ((c & 0xf0) == 0xf0) {
250 if (cur[3] == 0)
251 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
252 if (((c & 0xf8) != 0xf0) ||
253 ((cur[3] & 0xc0) != 0x80))
254 goto encoding_error;
255 /* 4-byte code */
256 *len = 4;
257 val = (cur[0] & 0x7) << 18;
258 val |= (cur[1] & 0x3f) << 12;
259 val |= (cur[2] & 0x3f) << 6;
260 val |= cur[3] & 0x3f;
261 } else {
262 /* 3-byte code */
263 *len = 3;
264 val = (cur[0] & 0xf) << 12;
265 val |= (cur[1] & 0x3f) << 6;
266 val |= cur[2] & 0x3f;
267 }
268 } else {
269 /* 2-byte code */
270 *len = 2;
271 val = (cur[0] & 0x1f) << 6;
272 val |= cur[1] & 0x3f;
273 }
274 if (!IS_CHAR(val)) {
275 ctxt->errNo = XML_ERR_INVALID_ENCODING;
276 if ((ctxt->sax != NULL) &&
277 (ctxt->sax->error != NULL))
278 ctxt->sax->error(ctxt->userData,
279 "Char 0x%X out of allowed range\n", val);
280 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000281 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000282 }
283 return(val);
284 } else {
285 /* 1-byte code */
286 *len = 1;
287 return((int) *ctxt->input->cur);
288 }
289 }
290 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000291 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000292 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000293 * XML constructs only use < 128 chars
294 */
295 *len = 1;
296 if ((int) *ctxt->input->cur < 0x80)
297 return((int) *ctxt->input->cur);
298
299 /*
300 * Humm this is bad, do an automatic flow conversion
301 */
302 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
303 ctxt->charset = XML_CHAR_ENCODING_UTF8;
304 return(xmlCurrentChar(ctxt, len));
305
306encoding_error:
307 /*
308 * If we detect an UTF8 error that probably mean that the
309 * input encoding didn't get properly advertized in the
310 * declaration header. Report the error and switch the encoding
311 * to ISO-Latin-1 (if you don't like this policy, just declare the
312 * encoding !)
313 */
314 ctxt->errNo = XML_ERR_INVALID_ENCODING;
315 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
316 ctxt->sax->error(ctxt->userData,
317 "Input is not proper UTF-8, indicate encoding !\n");
318 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
319 ctxt->input->cur[0], ctxt->input->cur[1],
320 ctxt->input->cur[2], ctxt->input->cur[3]);
321 }
322
323 ctxt->charset = XML_CHAR_ENCODING_8859_1;
324 *len = 1;
325 return((int) *ctxt->input->cur);
326}
327
328/**
Owen Taylor3473f882001-02-23 17:55:21 +0000329 * htmlSkipBlankChars:
330 * @ctxt: the HTML parser context
331 *
332 * skip all blanks character found at that point in the input streams.
333 *
334 * Returns the number of space chars skipped
335 */
336
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000337static int
Owen Taylor3473f882001-02-23 17:55:21 +0000338htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
339 int res = 0;
340
341 while (IS_BLANK(*(ctxt->input->cur))) {
342 if ((*ctxt->input->cur == 0) &&
343 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
344 xmlPopInput(ctxt);
345 } else {
346 if (*(ctxt->input->cur) == '\n') {
347 ctxt->input->line++; ctxt->input->col = 1;
348 } else ctxt->input->col++;
349 ctxt->input->cur++;
350 ctxt->nbChars++;
351 if (*ctxt->input->cur == 0)
352 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
353 }
354 res++;
355 }
356 return(res);
357}
358
359
360
361/************************************************************************
362 * *
363 * The list of HTML elements and their properties *
364 * *
365 ************************************************************************/
366
367/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
370 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000371 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000372 * Depr: this element is deprecated
373 * DTD: 1 means that this element is valid only in the Loose DTD
374 * 2 means that this element is valid only in the Frameset DTD
375 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000376 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000377 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000378 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000379
380/* Definitions and a couple of vars for HTML Elements */
381
/*
 * Element name groups. Each macro expands to a comma-separated list of
 * string literals meant to be dropped into an array initializer.
 * Groups must always be joined with commas: two groups placed side by side
 * without one would have their bordering string literals concatenated by
 * the compiler into a single bogus name (e.g. "small" "em" -> "smallem").
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
/* PCDATA and MODIFIER are markers only: they expand to nothing */
#define PCDATA
#define MODIFIER
#define FLOW BLOCK, INLINE
#define EMPTY NULL


/* NULL-terminated lists of the flow and inline element names */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
402
403
404/* ... and for HTML Attributes */
405
/*
 * Attribute name groups; each expands to a comma-separated list of string
 * literals for use inside an array initializer.
 */
#define COREATTRS	"id", "class", "style", "title"
#define I18N		"lang", "dir"
#define EVENTS		"onclick", "ondblclick",			\
			"onmousedown", "onmouseup",			\
			"onmouseover", "onmouseout",			\
			"onkeypress", "onkeydown", "onkeyup"
#define ATTRS		COREATTRS, I18N, EVENTS
#define CELLHALIGN	"align", "char", "charoff"
#define CELLVALIGN	"valign"

/* NULL-terminated attribute lists built from the groups above */
static const char *html_attrs[]      = { ATTRS, NULL };
static const char *core_i18n_attrs[] = { COREATTRS, I18N, NULL };
static const char *core_attrs[]      = { COREATTRS, NULL };
static const char *i18n_attrs[]      = { I18N, NULL };
416static const char* i18n_attrs[] = { I18N, NULL } ;
417
418
419/* Other declarations that should go inline ... */
420static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
421 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
422 "tabindex", "onfocus", "onblur", NULL } ;
423static const char* target_attr[] = { "target", NULL } ;
424static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
425static const char* alt_attr[] = { "alt", NULL } ;
426static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
427static const char* href_attrs[] = { "href", NULL } ;
428static const char* clear_attrs[] = { "clear", NULL } ;
429static const char* inline_p[] = { INLINE, "p", NULL } ;
430static const char* flow_param[] = { FLOW, "param", NULL } ;
431static const char* applet_attrs[] = { COREATTRS , "codebase",
432 "archive", "alt", "name", "height", "width", "align",
433 "hspace", "vspace", NULL } ;
434static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
435 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
436static const char* basefont_attrs[] =
437 { "id", "size", "color", "face", NULL } ;
438static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
439static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
440static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
441static const char* body_depr[] = { "background", "bgcolor", "text",
442 "link", "vlink", "alink", NULL } ;
443static const char* button_attrs[] = { ATTRS, "name", "value", "type",
444 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
445
446
447static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
448static const char* col_elt[] = { "col", NULL } ;
449static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
450static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
451static const char* dl_contents[] = { "dt", "dd", NULL } ;
452static const char* compact_attr[] = { "compact", NULL } ;
453static const char* label_attr[] = { "label", NULL } ;
454static const char* fieldset_contents[] = { FLOW, "legend" } ;
455static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
456static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
457static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
458static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
459static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
460static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
461static const char* head_attrs[] = { I18N, "profile", NULL } ;
462static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
463static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
464static const char* version_attr[] = { "version", NULL } ;
465static const char* html_content[] = { "head", "body", "frameset", NULL } ;
466static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
467static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
468static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
469static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
470static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
471static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
472static const char* align_attr[] = { "align", NULL } ;
473static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
474static const char* map_contents[] = { BLOCK, "area", NULL } ;
475static const char* name_attr[] = { "name", NULL } ;
476static const char* action_attr[] = { "action", NULL } ;
477static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
478static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
479static const char* content_attr[] = { "content", NULL } ;
480static const char* type_attr[] = { "type", NULL } ;
481static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
482static const char* object_contents[] = { FLOW, "param", NULL } ;
483static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
484static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
485static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
486static const char* option_elt[] = { "option", NULL } ;
487static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
488static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
489static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
490static const char* width_attr[] = { "width", NULL } ;
491static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
492static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
493static const char* language_attr[] = { "language", NULL } ;
494static const char* select_content[] = { "optgroup", "option", NULL } ;
495static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
496static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
497static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
498static const char* table_depr[] = { "align", "bgcolor", NULL } ;
499static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
500static const char* tr_elt[] = { "tr", NULL } ;
501static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
502static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
503static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
504static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
505static const char* tr_contents[] = { "th", "td", NULL } ;
506static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
507static const char* li_elt[] = { "li", NULL } ;
508static const char* ul_depr[] = { "type", "compact", NULL} ;
509static const char* dir_attr[] = { "dir", NULL} ;
510
511#define DECL (const char**)
512
Daniel Veillard22090732001-07-16 00:06:07 +0000513static const htmlElemDesc
514html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000515{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
516 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
517},
518{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
519 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
520},
521{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
522 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
523},
524{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
525 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
526},
527{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
528 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
529},
530{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
531 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
532},
533{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
534 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
535},
536{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
537 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
538},
539{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
540 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
541},
542{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
543 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
544},
545{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
546 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
547},
548{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
549 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
550},
551{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
552 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
553},
554{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
555 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
556},
557{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
558 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
559},
560{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
561 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
562},
563{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
564 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
565},
566{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
567 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
568},
569{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
570 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
571},
572{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
573 EMPTY , NULL , DECL col_attrs , NULL, NULL
574},
575{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
576 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
577},
578{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
579 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
580},
581{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
582 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
583},
584{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
585 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
586},
587{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
588 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
589},
590{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
591 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
592},
593{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
594 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
595},
596{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
597 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
598},
599{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
600 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
601},
602{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
603 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
604},
605{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
606 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
607},
608{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
609 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
610},
611{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
612 EMPTY, NULL, NULL, DECL frame_attrs, NULL
613},
614{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
615 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
616},
617{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
618 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
619},
620{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
621 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
622},
623{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
624 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
625},
626{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
627 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
628},
629{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
630 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
631},
632{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
633 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
634},
635{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
636 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
637},
638{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
639 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
640},
641{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
642 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
643},
644{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
645 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
646},
647{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
648 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
649},
650{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
651 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
652},
653{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
654 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
655},
656{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
657 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
658},
659{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
660 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
661},
662{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
663 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
664},
665{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
666 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
667},
668{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
669 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
670},
671{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
672 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
673},
674{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
675 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
676},
677{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
678 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
679},
680{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
681 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
682},
683{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
684 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
685},
686{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
687 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
688},
689{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
690 DECL html_flow, "div", DECL html_attrs, NULL, NULL
691},
692{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
693 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
694},
695{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
696 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
697},
698{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
699 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
700},
701{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
702 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
703},
704{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
705 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
706},
707{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
708 EMPTY, NULL, DECL param_attrs, NULL, name_attr
709},
710{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
711 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
712},
713{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
714 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
715},
716{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
717 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
718},
719{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
720 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
721},
722{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
723 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
724},
725{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
726 DECL select_content, NULL, DECL select_attrs, NULL, NULL
727},
728{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
729 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
730},
731{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733},
734{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
735 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
736},
737{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
738 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
739},
740{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
741 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
742},
743{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
744 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
745},
746{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
747 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
748},
749{ "table", 0, 0, 0, 0, 0, 0, 0, "",
750 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
751},
752{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
753 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
754},
755{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
756 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
757},
758{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
759 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
760},
761{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
762 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
763},
764{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
765 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
766},
767{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
768 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
769},
770{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
771 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
772},
773{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
774 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
775},
776{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
777 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
778},
779{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
780 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
781},
782{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
783 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
784},
785{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
786 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
787}
Owen Taylor3473f882001-02-23 17:55:21 +0000788};
789
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups.  The first string
 * of a group is the start tag being opened, the following strings are
 * the open elements that this start tag closes.  An additional NULL
 * marks the end of the whole table.
 */
static const char *htmlStartClose[] = {
    "form",
	"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
	"ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
	NULL,
    "head",
	"p", NULL,
    "title",
	"p", NULL,
    "body",
	"head", "style", "link", "title", "p", NULL,
    "li",
	"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
	"listing", "xmp", "head", "li", NULL,
    "hr",
	"p", "head", NULL,
    "h1",
	"p", "head", NULL,
    "h2",
	"p", "head", NULL,
    "h3",
	"p", "head", NULL,
    "h4",
	"p", "head", NULL,
    "h5",
	"p", "head", NULL,
    "h6",
	"p", "head", NULL,
    "dir",
	"p", "head", NULL,
    "address",
	"p", "head", "ul", NULL,
    "pre",
	"p", "head", "ul", NULL,
    "listing",
	"p", "head", NULL,
    "xmp",
	"p", "head", NULL,
    "blockquote",
	"p", "head", NULL,
    "dl",
	"p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
	"head", NULL,
    "dt",
	"p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
	"dd", NULL,
    "dd",
	"p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
	"dt", NULL,
    "ul",
	"p", "head", "ol", "menu", "dir", "address", "pre", "listing",
	"xmp", NULL,
    "ol",
	"p", "head", "ul", NULL,
    "menu",
	"p", "head", "ul", NULL,
    "p",
	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
	"p", "head", NULL,
    "noscript",
	"p", "head", NULL,
    "center",
	"font", "b", "i", "p", "head", NULL,
    "a",
	"a", NULL,
    "caption",
	"p", NULL,
    "colgroup",
	"caption", "colgroup", "col", "p", NULL,
    "col",
	"caption", "col", "p", NULL,
    "table",
	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
	"xmp", "a", NULL,
    "th",
	"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
	"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
	"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
	"caption", "col", "colgroup", NULL,
    "tfoot",
	"th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
	"p", NULL,
    "tbody",
	"th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
	"tbody", "p", NULL,
    "optgroup",
	"option", NULL,
    "option",
	"option", NULL,
    "fieldset",
	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
	"listing", "xmp", "a", NULL,
    NULL
};
849
/*
 * The list of HTML elements which are not supposed to hold CDATA
 * content directly and where a p element will be implied
 * (see htmlCheckParagraph()).  The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
863
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The table has no terminating NULL: htmlIsScriptAttribute() derives
 * the number of entries from sizeof.
 */
static const char *htmlScriptAttributes[] = {
	"onclick",
	"ondblclick",
	"onmousedown",
	"onmouseup",
	"onmouseover",
	"onmousemove",
	"onmouseout",
	"onkeypress",
	"onkeydown",
	"onkeyup",
	"onload",
	"onunload",
	"onfocus",
	"onblur",
	"onsubmit",
	"onreset",	/* was misspelled "onrest", so onreset was never matched */
	"onchange",
	"onselect"
};
889
/*
 * This table is used by the HTML parser to decide what to do with
 * broken HTML pages.  By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags:
 * an end tag is only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* priority used when handling end tags */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* terminator, carries the default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000917
/*
 * Index over htmlStartClose: each used slot points at the first string
 * of one group, unused slots are NULL.  Filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlStartCloseIndex has been filled. */
static int htmlStartCloseIndexinitialized = 0;
920
921/************************************************************************
922 * *
923 * functions to handle HTML specific data *
924 * *
925 ************************************************************************/
926
927/**
928 * htmlInitAutoClose:
929 *
930 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
931 * This is not reentrant. Call xmlInitParser() once before processing in
932 * case of use in multithreaded programs.
933 */
934void
935htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000936 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000937
938 if (htmlStartCloseIndexinitialized) return;
939
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000940 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
941 indx = 0;
942 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
943 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000944 while (htmlStartClose[i] != NULL) i++;
945 i++;
946 }
947 htmlStartCloseIndexinitialized = 1;
948}
949
950/**
951 * htmlTagLookup:
952 * @tag: The tag name in lowercase
953 *
954 * Lookup the HTML tag in the ElementTable
955 *
956 * Returns the related htmlElemDescPtr or NULL if not found.
957 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000958const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000959htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000960 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000961
962 for (i = 0; i < (sizeof(html40ElementTable) /
963 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000964 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +0000965 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000966 }
967 return(NULL);
968}
969
970/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000971 * htmlGetEndPriority:
972 * @name: The name of the element to look up the priority for.
973 *
974 * Return value: The "endtag" priority.
975 **/
976static int
977htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000978 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000979
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000980 while ((htmlEndPriority[i].name != NULL) &&
981 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
982 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000983
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000984 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000985}
986
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000987
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000988/**
Owen Taylor3473f882001-02-23 17:55:21 +0000989 * htmlCheckAutoClose:
990 * @newtag: The new tag name
991 * @oldtag: The old tag name
992 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000993 * Checks whether the new tag is one of the registered valid tags for
994 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000995 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
996 *
997 * Returns 0 if no, 1 if yes.
998 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000999static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001000htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1001{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001002 int i, indx;
1003 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001004
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001005 if (htmlStartCloseIndexinitialized == 0)
1006 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001007
1008 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001009 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001010 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001011 if (closed == NULL)
1012 return (0);
1013 if (xmlStrEqual(BAD_CAST * closed, newtag))
1014 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001015 }
1016
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001017 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001018 i++;
1019 while (htmlStartClose[i] != NULL) {
1020 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001021 return (1);
1022 }
1023 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001024 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001025 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001026}
1027
1028/**
1029 * htmlAutoCloseOnClose:
1030 * @ctxt: an HTML parser context
1031 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001032 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001033 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001034 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001035 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001036static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001037htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1038{
1039 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001040 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001041
1042#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001043 const xmlChar *oldname;
1044
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001045 xmlGenericError(xmlGenericErrorContext,
1046 "Close of %s stack: %d elements\n", newtag,
1047 ctxt->nameNr);
1048 for (i = 0; i < ctxt->nameNr; i++)
1049 xmlGenericError(xmlGenericErrorContext, "%d : %s\n", i,
1050 ctxt->nameTab[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001051#endif
1052
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001053 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001054
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001055 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001056
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001057 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1058 break;
1059 /*
1060 * A missplaced endtag can only close elements with lower
1061 * or equal priority, so if we find an element with higher
1062 * priority before we find an element with
1063 * matching name, we just ignore this endtag
1064 */
1065 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1066 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001067 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001068 if (i < 0)
1069 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001070
1071 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001072 info = htmlTagLookup(ctxt->name);
1073 if ((info == NULL) || (info->endTag == 1)) {
Owen Taylor3473f882001-02-23 17:55:21 +00001074#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001075 xmlGenericError(xmlGenericErrorContext,
1076 "htmlAutoCloseOnClose: %s closes %s\n", newtag,
1077 ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001078#endif
Daniel Veillard56098d42001-04-24 12:51:09 +00001079 } else if (info->endTag == 3) {
1080#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001081 xmlGenericError(xmlGenericErrorContext,
1082 "End of tag %s: expecting %s\n", newtag,
1083 ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +00001084
Daniel Veillard56098d42001-04-24 12:51:09 +00001085#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001086 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1087 ctxt->sax->error(ctxt->userData,
1088 "Opening and ending tag mismatch: %s and %s\n",
1089 newtag, ctxt->name);
1090 ctxt->wellFormed = 0;
1091 }
1092 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1093 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001094#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001095 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001096 if (oldname != NULL) {
1097 xmlGenericError(xmlGenericErrorContext,
1098 "htmlAutoCloseOnClose: popped %s\n", oldname);
1099 }
William M. Brack899e64a2003-09-26 18:03:42 +00001100#else
1101 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001102#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001103 }
1104}
1105
1106/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001107 * htmlAutoCloseOnEnd:
1108 * @ctxt: an HTML parser context
1109 *
1110 * Close all remaining tags at the end of the stream
1111 */
1112static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001113htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1114{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001115 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001116#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001117 const xmlChar *oldname;
1118
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001119 xmlGenericError(xmlGenericErrorContext,
1120 "Close of stack: %d elements\n", ctxt->nameNr);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001121#endif
1122
William M. Brack899e64a2003-09-26 18:03:42 +00001123 if (ctxt->nameNr == 0)
1124 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001125 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001126#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001127 xmlGenericError(xmlGenericErrorContext, "%d : %s\n", i,
1128 ctxt->nameTab[i]);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001129#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001130 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1131 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001132#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001133 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001134 if (oldname != NULL) {
1135 xmlGenericError(xmlGenericErrorContext,
1136 "htmlAutoCloseOnEnd: popped %s\n", oldname);
1137 }
William M. Brack899e64a2003-09-26 18:03:42 +00001138#else
1139 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001140#endif
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001141 }
1142}
1143
1144/**
Owen Taylor3473f882001-02-23 17:55:21 +00001145 * htmlAutoClose:
1146 * @ctxt: an HTML parser context
1147 * @newtag: The new tag name or NULL
1148 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001149 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001150 * The list is kept in htmlStartClose array. This function is
1151 * called when a new tag has been detected and generates the
1152 * appropriates closes if possible/needed.
1153 * If newtag is NULL this mean we are at the end of the resource
1154 * and we should check
1155 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001156static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001157htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1158{
William M. Brack899e64a2003-09-26 18:03:42 +00001159#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001160 const xmlChar *oldname;
William M. Brack899e64a2003-09-26 18:03:42 +00001161#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001162
1163 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001164 (htmlCheckAutoClose(newtag, ctxt->name))) {
1165#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001166 xmlGenericError(xmlGenericErrorContext,
1167 "htmlAutoClose: %s closes %s\n", newtag,
1168 ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001169#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001170 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1171 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001172#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001173 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001174 if (oldname != NULL) {
1175 xmlGenericError(xmlGenericErrorContext,
1176 "htmlAutoClose: popped %s\n", oldname);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
William M. Brack899e64a2003-09-26 18:03:42 +00001178#else
1179 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001180#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001181 }
1182 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001183 htmlAutoCloseOnEnd(ctxt);
1184 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001185 }
1186 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1188 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1189 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00001190#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001191 xmlGenericError(xmlGenericErrorContext,
1192 "htmlAutoClose: EOF closes %s\n", ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001193#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00001196#ifdef DEBUG
William M. Brack899e64a2003-09-26 18:03:42 +00001197 oldname = htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 if (oldname != NULL) {
1199 xmlGenericError(xmlGenericErrorContext,
1200 "htmlAutoClose: popped %s\n", oldname);
Owen Taylor3473f882001-02-23 17:55:21 +00001201 }
William M. Brack899e64a2003-09-26 18:03:42 +00001202#else
1203 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001204#endif
1205 }
Owen Taylor3473f882001-02-23 17:55:21 +00001206
1207}
1208
1209/**
1210 * htmlAutoCloseTag:
1211 * @doc: the HTML document
1212 * @name: The tag name
1213 * @elem: the HTML element
1214 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001215 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001216 * The list is kept in htmlStartClose array. This function checks
1217 * if the element or one of it's children would autoclose the
1218 * given tag.
1219 *
1220 * Returns 1 if autoclose, 0 otherwise
1221 */
1222int
1223htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1224 htmlNodePtr child;
1225
1226 if (elem == NULL) return(1);
1227 if (xmlStrEqual(name, elem->name)) return(0);
1228 if (htmlCheckAutoClose(elem->name, name)) return(1);
1229 child = elem->children;
1230 while (child != NULL) {
1231 if (htmlAutoCloseTag(doc, name, child)) return(1);
1232 child = child->next;
1233 }
1234 return(0);
1235}
1236
1237/**
1238 * htmlIsAutoClosed:
1239 * @doc: the HTML document
1240 * @elem: the HTML element
1241 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001242 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001243 * The list is kept in htmlStartClose array. This function checks
1244 * if a tag is autoclosed by one of it's child
1245 *
1246 * Returns 1 if autoclosed, 0 otherwise
1247 */
1248int
1249htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 child = elem->children;
1254 while (child != NULL) {
1255 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1256 child = child->next;
1257 }
1258 return(0);
1259}
1260
1261/**
1262 * htmlCheckImplied:
1263 * @ctxt: an HTML parser context
1264 * @newtag: The new tag name
1265 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001266 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001267 * called when a new tag has been detected and generates the
1268 * appropriates implicit tags if missing
1269 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001270static void
Owen Taylor3473f882001-02-23 17:55:21 +00001271htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1272 if (!htmlOmittedDefaultValue)
1273 return;
1274 if (xmlStrEqual(newtag, BAD_CAST"html"))
1275 return;
1276 if (ctxt->nameNr <= 0) {
1277#ifdef DEBUG
1278 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
1279#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001281 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1282 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1283 }
1284 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1285 return;
1286 if ((ctxt->nameNr <= 1) &&
1287 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1288 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1289 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1290 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1291 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1292 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1293 /*
1294 * dropped OBJECT ... i you put it first BODY will be
1295 * assumed !
1296 */
1297#ifdef DEBUG
1298 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
1299#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001300 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001301 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1302 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1303 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1304 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1305 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1306 int i;
1307 for (i = 0;i < ctxt->nameNr;i++) {
1308 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1309 return;
1310 }
1311 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1312 return;
1313 }
1314 }
1315
1316#ifdef DEBUG
1317 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
1318#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001319 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001320 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1321 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1322 }
1323}
1324
1325/**
1326 * htmlCheckParagraph
1327 * @ctxt: an HTML parser context
1328 *
1329 * Check whether a p element need to be implied before inserting
1330 * characters in the current element.
1331 *
1332 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1333 * in case of error.
1334 */
1335
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001336static int
Owen Taylor3473f882001-02-23 17:55:21 +00001337htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1338 const xmlChar *tag;
1339 int i;
1340
1341 if (ctxt == NULL)
1342 return(-1);
1343 tag = ctxt->name;
1344 if (tag == NULL) {
1345 htmlAutoClose(ctxt, BAD_CAST"p");
1346 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001347 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001348 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1349 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1350 return(1);
1351 }
1352 if (!htmlOmittedDefaultValue)
1353 return(0);
1354 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1355 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1356#ifdef DEBUG
1357 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1358#endif
1359 htmlAutoClose(ctxt, BAD_CAST"p");
1360 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001361 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001362 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1363 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1364 return(1);
1365 }
1366 }
1367 return(0);
1368}
1369
1370/**
1371 * htmlIsScriptAttribute:
1372 * @name: an attribute name
1373 *
1374 * Check if an attribute is of content type Script
1375 *
1376 * Returns 1 is the attribute is a script 0 otherwise
1377 */
1378int
1379htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001381
1382 if (name == NULL)
1383 return(0);
1384 /*
1385 * all script attributes start with 'on'
1386 */
1387 if ((name[0] != 'o') || (name[1] != 'n'))
1388 return(0);
1389 for (i = 0;
1390 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1391 i++) {
1392 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1393 return(1);
1394 }
1395 return(0);
1396}
1397
1398/************************************************************************
1399 * *
1400 * The list of HTML predefined entities *
1401 * *
1402 ************************************************************************/
1403
1404
Daniel Veillard22090732001-07-16 00:06:07 +00001405static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001406/*
1407 * the 4 absolute ones, plus apostrophe.
1408 */
1409{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1410{ 38, "amp", "ampersand, U+0026 ISOnum" },
1411{ 39, "apos", "single quote" },
1412{ 60, "lt", "less-than sign, U+003C ISOnum" },
1413{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1414
1415/*
1416 * A bunch still in the 128-255 range
1417 * Replacing them depend really on the charset used.
1418 */
1419{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1420{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1421{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1422{ 163, "pound","pound sign, U+00A3 ISOnum" },
1423{ 164, "curren","currency sign, U+00A4 ISOnum" },
1424{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1425{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1426{ 167, "sect", "section sign, U+00A7 ISOnum" },
1427{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1428{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1429{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1430{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1431{ 172, "not", "not sign, U+00AC ISOnum" },
1432{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1433{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1434{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1435{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1436{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1437{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1438{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1439{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1440{ 181, "micro","micro sign, U+00B5 ISOnum" },
1441{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1442{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1443{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1444{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1445{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1446{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1447{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1448{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1449{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1450{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1451{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1452{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1453{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1454{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1455{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1456{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1457{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1458{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1459{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1460{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1461{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1462{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1463{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1464{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1465{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1466{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1467{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1468{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1469{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1470{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1471{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1472{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1473{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1474{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1475{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1476{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1477{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1478{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1479{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1480{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1481{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1482{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1483{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1484{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1485{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1486{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1487{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1488{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1489{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1490{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1491{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1492{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1493{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1494{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1495{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1496{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1497{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1498{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1499{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1500{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1501{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1502{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1503{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1504{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1505{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1506{ 247, "divide","division sign, U+00F7 ISOnum" },
1507{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1508{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1509{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1510{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1511{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1512{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1513{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1514{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1515
1516{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1517{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1518{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1519{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1520{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1521
1522/*
1523 * Anything below should really be kept as entities references
1524 */
1525{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1526
1527{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1528{ 732, "tilde","small tilde, U+02DC ISOdia" },
1529
1530{ 913, "Alpha","greek capital letter alpha, U+0391" },
1531{ 914, "Beta", "greek capital letter beta, U+0392" },
1532{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1533{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1534{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1535{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1536{ 919, "Eta", "greek capital letter eta, U+0397" },
1537{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1538{ 921, "Iota", "greek capital letter iota, U+0399" },
1539{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001540{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001541{ 924, "Mu", "greek capital letter mu, U+039C" },
1542{ 925, "Nu", "greek capital letter nu, U+039D" },
1543{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1544{ 927, "Omicron","greek capital letter omicron, U+039F" },
1545{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1546{ 929, "Rho", "greek capital letter rho, U+03A1" },
1547{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1548{ 932, "Tau", "greek capital letter tau, U+03A4" },
1549{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1550{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1551{ 935, "Chi", "greek capital letter chi, U+03A7" },
1552{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1553{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1554
1555{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1556{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1557{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1558{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1559{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1560{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1561{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1562{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1563{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1564{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1565{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1566{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1567{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1568{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1569{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1570{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1571{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1572{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1573{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1574{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1575{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1576{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1577{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1578{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1579{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1580{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1581{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1582{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1583
1584{ 8194, "ensp", "en space, U+2002 ISOpub" },
1585{ 8195, "emsp", "em space, U+2003 ISOpub" },
1586{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1587{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1588{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1589{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1590{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1591{ 8211, "ndash","en dash, U+2013 ISOpub" },
1592{ 8212, "mdash","em dash, U+2014 ISOpub" },
1593{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1594{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1595{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1596{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1597{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1598{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1599{ 8224, "dagger","dagger, U+2020 ISOpub" },
1600{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1601
1602{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1603{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1604
1605{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1606
1607{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1608{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1609
1610{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1611{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1612
1613{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1614{ 8260, "frasl","fraction slash, U+2044 NEW" },
1615
1616{ 8364, "euro", "euro sign, U+20AC NEW" },
1617
1618{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1619{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1620{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1621{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1622{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1623{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1624{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1625{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1626{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1627{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1628{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1629{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1630{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1631{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1632{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1633{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1634
1635{ 8704, "forall","for all, U+2200 ISOtech" },
1636{ 8706, "part", "partial differential, U+2202 ISOtech" },
1637{ 8707, "exist","there exists, U+2203 ISOtech" },
1638{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1639{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1640{ 8712, "isin", "element of, U+2208 ISOtech" },
1641{ 8713, "notin","not an element of, U+2209 ISOtech" },
1642{ 8715, "ni", "contains as member, U+220B ISOtech" },
1643{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001644{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001645{ 8722, "minus","minus sign, U+2212 ISOtech" },
1646{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1647{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1648{ 8733, "prop", "proportional to, U+221D ISOtech" },
1649{ 8734, "infin","infinity, U+221E ISOtech" },
1650{ 8736, "ang", "angle, U+2220 ISOamso" },
1651{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1652{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1653{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1654{ 8746, "cup", "union = cup, U+222A ISOtech" },
1655{ 8747, "int", "integral, U+222B ISOtech" },
1656{ 8756, "there4","therefore, U+2234 ISOtech" },
1657{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1658{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1659{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1660{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1661{ 8801, "equiv","identical to, U+2261 ISOtech" },
1662{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1663{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1664{ 8834, "sub", "subset of, U+2282 ISOtech" },
1665{ 8835, "sup", "superset of, U+2283 ISOtech" },
1666{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1667{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1668{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1669{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1670{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1671{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1672{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1673{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1674{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1675{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1676{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1677{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1678{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1679{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1680
1681{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1682{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1683{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1684{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1685
1686};
1687
1688/************************************************************************
1689 * *
1690 * Commodity functions to handle entities *
1691 * *
1692 ************************************************************************/
1693
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the companion variable <buffer>_size and reallocates @buffer
 * to hold that many xmlChar.  The result of xmlRealloc() is first stored
 * in a temporary so that the previous allocation is not lost when the
 * reallocation fails: in that case the old buffer is released, an error
 * is reported and the *enclosing function* returns NULL.
 *
 * The macro can therefore only be expanded inside a function returning
 * a pointer, with both @buffer and <buffer>_size in scope.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                              buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
        xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
        xmlFree(buffer);						\
        buffer = NULL;							\
        return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1705
1706/**
1707 * htmlEntityLookup:
1708 * @name: the entity name
1709 *
1710 * Lookup the given entity in EntitiesTable
1711 *
1712 * TODO: the linear scan is really ugly, an hash table is really needed.
1713 *
1714 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1715 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001716const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001717htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001718 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001719
1720 for (i = 0;i < (sizeof(html40EntitiesTable)/
1721 sizeof(html40EntitiesTable[0]));i++) {
1722 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1723#ifdef DEBUG
1724 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1725#endif
William M. Brack78637da2003-07-31 14:47:38 +00001726 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001727 }
1728 }
1729 return(NULL);
1730}
1731
1732/**
1733 * htmlEntityValueLookup:
1734 * @value: the entity's unicode value
1735 *
1736 * Lookup the given entity in EntitiesTable
1737 *
1738 * TODO: the linear scan is really ugly, an hash table is really needed.
1739 *
1740 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1741 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001742const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001743htmlEntityValueLookup(unsigned int value) {
1744 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001745#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001746 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001747#endif
1748
1749 for (i = 0;i < (sizeof(html40EntitiesTable)/
1750 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001751 if (html40EntitiesTable[i].value >= value) {
1752 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001753 break;
1754#ifdef DEBUG
1755 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1756#endif
William M. Brack78637da2003-07-31 14:47:38 +00001757 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001758 }
1759#ifdef DEBUG
1760 if (lv > html40EntitiesTable[i].value) {
1761 xmlGenericError(xmlGenericErrorContext,
1762 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1763 lv, html40EntitiesTable[i].value);
1764 }
1765 lv = html40EntitiesTable[i].value;
1766#endif
1767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
1797 if (in == NULL) {
1798 /*
1799 * initialization nothing to do
1800 */
1801 *outlen = 0;
1802 *inlen = 0;
1803 return(0);
1804 }
1805 inend = in + (*inlen);
1806 outend = out + (*outlen);
1807 while (in < inend) {
1808 d = *in++;
1809 if (d < 0x80) { c= d; trailing= 0; }
1810 else if (d < 0xC0) {
1811 /* trailing byte in leading position */
1812 *outlen = out - outstart;
1813 *inlen = processed - instart;
1814 return(-2);
1815 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1816 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1817 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1818 else {
1819 /* no chance for this in Ascii */
1820 *outlen = out - outstart;
1821 *inlen = processed - instart;
1822 return(-2);
1823 }
1824
1825 if (inend - in < trailing) {
1826 break;
1827 }
1828
1829 for ( ; trailing; trailing--) {
1830 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1831 break;
1832 c <<= 6;
1833 c |= d & 0x3F;
1834 }
1835
1836 /* assertion: c is a single UTF-4 value */
1837 if (c < 0x80) {
1838 if (out + 1 >= outend)
1839 break;
1840 *out++ = c;
1841 } else {
1842 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001843 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001844
1845 /*
1846 * Try to lookup a predefined HTML entity for it
1847 */
1848
1849 ent = htmlEntityValueLookup(c);
1850 if (ent == NULL) {
1851 /* no chance for this in Ascii */
1852 *outlen = out - outstart;
1853 *inlen = processed - instart;
1854 return(-2);
1855 }
1856 len = strlen(ent->name);
1857 if (out + 2 + len >= outend)
1858 break;
1859 *out++ = '&';
1860 memcpy(out, ent->name, len);
1861 out += len;
1862 *out++ = ';';
1863 }
1864 processed = in;
1865 }
1866 *outlen = out - outstart;
1867 *inlen = processed - instart;
1868 return(0);
1869}
1870
1871/**
1872 * htmlEncodeEntities:
1873 * @out: a pointer to an array of bytes to store the result
1874 * @outlen: the length of @out
1875 * @in: a pointer to an array of UTF-8 chars
1876 * @inlen: the length of @in
1877 * @quoteChar: the quote character to escape (' or ") or zero.
1878 *
1879 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1880 * plus HTML entities block of chars out.
1881 *
1882 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1883 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001884 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001885 * The value of @outlen after return is the number of octets consumed.
1886 */
1887int
1888htmlEncodeEntities(unsigned char* out, int *outlen,
1889 const unsigned char* in, int *inlen, int quoteChar) {
1890 const unsigned char* processed = in;
1891 const unsigned char* outend = out + (*outlen);
1892 const unsigned char* outstart = out;
1893 const unsigned char* instart = in;
1894 const unsigned char* inend = in + (*inlen);
1895 unsigned int c, d;
1896 int trailing;
1897
1898 while (in < inend) {
1899 d = *in++;
1900 if (d < 0x80) { c= d; trailing= 0; }
1901 else if (d < 0xC0) {
1902 /* trailing byte in leading position */
1903 *outlen = out - outstart;
1904 *inlen = processed - instart;
1905 return(-2);
1906 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1907 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1908 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1909 else {
1910 /* no chance for this in Ascii */
1911 *outlen = out - outstart;
1912 *inlen = processed - instart;
1913 return(-2);
1914 }
1915
1916 if (inend - in < trailing)
1917 break;
1918
1919 while (trailing--) {
1920 if (((d= *in++) & 0xC0) != 0x80) {
1921 *outlen = out - outstart;
1922 *inlen = processed - instart;
1923 return(-2);
1924 }
1925 c <<= 6;
1926 c |= d & 0x3F;
1927 }
1928
1929 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001930 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1931 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001932 if (out >= outend)
1933 break;
1934 *out++ = c;
1935 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001936 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001937 const char *cp;
1938 char nbuf[16];
1939 int len;
1940
1941 /*
1942 * Try to lookup a predefined HTML entity for it
1943 */
1944 ent = htmlEntityValueLookup(c);
1945 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001946 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001947 cp = nbuf;
1948 }
1949 else
1950 cp = ent->name;
1951 len = strlen(cp);
1952 if (out + 2 + len > outend)
1953 break;
1954 *out++ = '&';
1955 memcpy(out, cp, len);
1956 out += len;
1957 *out++ = ';';
1958 }
1959 processed = in;
1960 }
1961 *outlen = out - outstart;
1962 *inlen = processed - instart;
1963 return(0);
1964}
1965
1966/**
1967 * htmlDecodeEntities:
1968 * @ctxt: the parser context
1969 * @len: the len to decode (in bytes !), -1 for no size limit
1970 * @end: an end marker xmlChar, 0 if none
1971 * @end2: an end marker xmlChar, 0 if none
1972 * @end3: an end marker xmlChar, 0 if none
1973 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001974 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001975 *
1976 * DEPRECATED !!!!
1977 *
1978 * Returns A newly allocated string with the substitution done. The caller
1979 * must deallocate it !
1980 */
1981xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001982htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1983 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984 static int deprecated = 0;
1985 if (!deprecated) {
1986 xmlGenericError(xmlGenericErrorContext,
1987 "htmlDecodeEntities() deprecated function reached\n");
1988 deprecated = 1;
1989 }
1990 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001991}
1992
1993/************************************************************************
1994 * *
1995 * Commodity functions to handle streams *
1996 * *
1997 ************************************************************************/
1998
1999/**
Owen Taylor3473f882001-02-23 17:55:21 +00002000 * htmlNewInputStream:
2001 * @ctxt: an HTML parser context
2002 *
2003 * Create a new input stream structure
2004 * Returns the new input stream or NULL
2005 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002006static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002007htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2008 htmlParserInputPtr input;
2009
2010 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2011 if (input == NULL) {
2012 ctxt->errNo = XML_ERR_NO_MEMORY;
2013 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2014 ctxt->sax->error(ctxt->userData,
2015 "malloc: couldn't allocate a new input stream\n");
2016 return(NULL);
2017 }
2018 memset(input, 0, sizeof(htmlParserInput));
2019 input->filename = NULL;
2020 input->directory = NULL;
2021 input->base = NULL;
2022 input->cur = NULL;
2023 input->buf = NULL;
2024 input->line = 1;
2025 input->col = 1;
2026 input->buf = NULL;
2027 input->free = NULL;
2028 input->version = NULL;
2029 input->consumed = 0;
2030 input->length = 0;
2031 return(input);
2032}
2033
2034
2035/************************************************************************
2036 * *
2037 * Commodity functions, cleanup needed ? *
2038 * *
2039 ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002054
2055/**
2056 * areBlanks:
2057 * @ctxt: an HTML parser context
2058 * @str: a xmlChar *
2059 * @len: the size of @str
2060 *
2061 * Is this a sequence of blank chars that one can ignore ?
2062 *
2063 * Returns 1 if ignorable 0 otherwise.
2064 */
2065
2066static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002067 unsigned int i;
2068 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002069 xmlNodePtr lastChild;
2070
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002071 for (j = 0;j < len;j++)
2072 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002073
2074 if (CUR == 0) return(1);
2075 if (CUR != '<') return(0);
2076 if (ctxt->name == NULL)
2077 return(1);
2078 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2079 return(1);
2080 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2081 return(1);
2082 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2083 return(1);
2084 if (ctxt->node == NULL) return(0);
2085 lastChild = xmlGetLastChild(ctxt->node);
2086 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002087 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2088 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002089 /* keep ws in constructs like ...<b> </b>...
2090 for all tags "b" allowing PCDATA */
2091 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2092 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2093 return(0);
2094 }
2095 }
Owen Taylor3473f882001-02-23 17:55:21 +00002096 } else if (xmlNodeIsText(lastChild)) {
2097 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002098 } else {
2099 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2100 for all tags "p" allowing PCDATA */
2101 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2102 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2103 return(0);
2104 }
2105 }
Owen Taylor3473f882001-02-23 17:55:21 +00002106 }
2107 return(1);
2108}
2109
2110/**
Owen Taylor3473f882001-02-23 17:55:21 +00002111 * htmlNewDocNoDtD:
2112 * @URI: URI for the dtd, or NULL
2113 * @ExternalID: the external ID of the DTD, or NULL
2114 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002115 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2116 * are NULL
2117 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002118 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002119 */
2120htmlDocPtr
2121htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2122 xmlDocPtr cur;
2123
2124 /*
2125 * Allocate a new document and fill the fields.
2126 */
2127 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2128 if (cur == NULL) {
2129 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002130 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002131 return(NULL);
2132 }
2133 memset(cur, 0, sizeof(xmlDoc));
2134
2135 cur->type = XML_HTML_DOCUMENT_NODE;
2136 cur->version = NULL;
2137 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002138 cur->doc = cur;
2139 cur->name = NULL;
2140 cur->children = NULL;
2141 cur->extSubset = NULL;
2142 cur->oldNs = NULL;
2143 cur->encoding = NULL;
2144 cur->standalone = 1;
2145 cur->compression = 0;
2146 cur->ids = NULL;
2147 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002148 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002149 if ((ExternalID != NULL) ||
2150 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002151 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002152 return(cur);
2153}
2154
2155/**
2156 * htmlNewDoc:
2157 * @URI: URI for the dtd, or NULL
2158 * @ExternalID: the external ID of the DTD, or NULL
2159 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002160 * Creates a new HTML document
2161 *
Owen Taylor3473f882001-02-23 17:55:21 +00002162 * Returns a new document
2163 */
2164htmlDocPtr
2165htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2166 if ((URI == NULL) && (ExternalID == NULL))
2167 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002168 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2169 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002170
2171 return(htmlNewDocNoDtD(URI, ExternalID));
2172}
2173
2174
2175/************************************************************************
2176 * *
2177 * The parser itself *
2178 * Relates to http://www.w3.org/TR/html40 *
2179 * *
2180 ************************************************************************/
2181
2182/************************************************************************
2183 * *
2184 * The parser itself *
2185 * *
2186 ************************************************************************/
2187
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002188static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002189
Owen Taylor3473f882001-02-23 17:55:21 +00002190/**
2191 * htmlParseHTMLName:
2192 * @ctxt: an HTML parser context
2193 *
2194 * parse an HTML tag or attribute name, note that we convert it to lowercase
2195 * since HTML names are not case-sensitive.
2196 *
2197 * Returns the Tag Name parsed or NULL
2198 */
2199
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002200static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002201htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002202 int i = 0;
2203 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2204
2205 if (!IS_LETTER(CUR) && (CUR != '_') &&
2206 (CUR != ':')) return(NULL);
2207
2208 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2209 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2210 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2211 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2212 else loc[i] = CUR;
2213 i++;
2214
2215 NEXT;
2216 }
2217
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002218 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002219}
2220
2221/**
2222 * htmlParseName:
2223 * @ctxt: an HTML parser context
2224 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002225 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002226 *
2227 * Returns the Name parsed or NULL
2228 */
2229
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002230static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002231htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002232 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002233 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002234 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002235
2236 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002237
2238 /*
2239 * Accelerator for simple ASCII names
2240 */
2241 in = ctxt->input->cur;
2242 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2243 ((*in >= 0x41) && (*in <= 0x5A)) ||
2244 (*in == '_') || (*in == ':')) {
2245 in++;
2246 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2247 ((*in >= 0x41) && (*in <= 0x5A)) ||
2248 ((*in >= 0x30) && (*in <= 0x39)) ||
2249 (*in == '_') || (*in == '-') ||
2250 (*in == ':') || (*in == '.'))
2251 in++;
2252 if ((*in > 0) && (*in < 0x80)) {
2253 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002254 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002255 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002256 ctxt->nbChars += count;
2257 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002258 return(ret);
2259 }
2260 }
2261 return(htmlParseNameComplex(ctxt));
2262}
2263
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002264static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002265htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002266 int len = 0, l;
2267 int c;
2268 int count = 0;
2269
2270 /*
2271 * Handler for more complex cases
2272 */
2273 GROW;
2274 c = CUR_CHAR(l);
2275 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2276 (!IS_LETTER(c) && (c != '_') &&
2277 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002278 return(NULL);
2279 }
2280
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002281 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2282 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2283 (c == '.') || (c == '-') ||
2284 (c == '_') || (c == ':') ||
2285 (IS_COMBINING(c)) ||
2286 (IS_EXTENDER(c)))) {
2287 if (count++ > 100) {
2288 count = 0;
2289 GROW;
2290 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002291 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002292 NEXTL(l);
2293 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002294 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002295 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002296}
2297
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002298
Owen Taylor3473f882001-02-23 17:55:21 +00002299/**
2300 * htmlParseHTMLAttribute:
2301 * @ctxt: an HTML parser context
2302 * @stop: a char stop value
2303 *
2304 * parse an HTML attribute value till the stop (quote), if
2305 * stop is 0 then it stops at the first space
2306 *
2307 * Returns the attribute parsed or NULL
2308 */
2309
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002310static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002311htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2312 xmlChar *buffer = NULL;
2313 int buffer_size = 0;
2314 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002315 const xmlChar *name = NULL;
2316 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002317 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002318
2319 /*
2320 * allocate a translation buffer.
2321 */
2322 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002323 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002324 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002325 xmlGenericError(xmlGenericErrorContext,
2326 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002327 return(NULL);
2328 }
2329 out = buffer;
2330
2331 /*
2332 * Ok loop until we reach one of the ending chars
2333 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002334 while ((CUR != 0) && (CUR != stop)) {
2335 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002336 if ((stop == 0) && (IS_BLANK(CUR))) break;
2337 if (CUR == '&') {
2338 if (NXT(1) == '#') {
2339 unsigned int c;
2340 int bits;
2341
2342 c = htmlParseCharRef(ctxt);
2343 if (c < 0x80)
2344 { *out++ = c; bits= -6; }
2345 else if (c < 0x800)
2346 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2347 else if (c < 0x10000)
2348 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2349 else
2350 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2351
2352 for ( ; bits >= 0; bits-= 6) {
2353 *out++ = ((c >> bits) & 0x3F) | 0x80;
2354 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002355
2356 if (out - buffer > buffer_size - 100) {
2357 int indx = out - buffer;
2358
2359 growBuffer(buffer);
2360 out = &buffer[indx];
2361 }
Owen Taylor3473f882001-02-23 17:55:21 +00002362 } else {
2363 ent = htmlParseEntityRef(ctxt, &name);
2364 if (name == NULL) {
2365 *out++ = '&';
2366 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002367 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002368
2369 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002370 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002371 }
2372 } else if (ent == NULL) {
2373 *out++ = '&';
2374 cur = name;
2375 while (*cur != 0) {
2376 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002377 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002378
2379 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002380 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002381 }
2382 *out++ = *cur++;
2383 }
Owen Taylor3473f882001-02-23 17:55:21 +00002384 } else {
2385 unsigned int c;
2386 int bits;
2387
2388 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002389 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002390
2391 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002392 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002393 }
2394 c = (xmlChar)ent->value;
2395 if (c < 0x80)
2396 { *out++ = c; bits= -6; }
2397 else if (c < 0x800)
2398 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2399 else if (c < 0x10000)
2400 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2401 else
2402 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2403
2404 for ( ; bits >= 0; bits-= 6) {
2405 *out++ = ((c >> bits) & 0x3F) | 0x80;
2406 }
Owen Taylor3473f882001-02-23 17:55:21 +00002407 }
2408 }
2409 } else {
2410 unsigned int c;
2411 int bits, l;
2412
2413 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002414 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002415
2416 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002417 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002418 }
2419 c = CUR_CHAR(l);
2420 if (c < 0x80)
2421 { *out++ = c; bits= -6; }
2422 else if (c < 0x800)
2423 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2424 else if (c < 0x10000)
2425 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2426 else
2427 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2428
2429 for ( ; bits >= 0; bits-= 6) {
2430 *out++ = ((c >> bits) & 0x3F) | 0x80;
2431 }
2432 NEXT;
2433 }
2434 }
2435 *out++ = 0;
2436 return(buffer);
2437}
2438
2439/**
Owen Taylor3473f882001-02-23 17:55:21 +00002440 * htmlParseEntityRef:
2441 * @ctxt: an HTML parser context
2442 * @str: location to store the entity name
2443 *
2444 * parse an HTML ENTITY references
2445 *
2446 * [68] EntityRef ::= '&' Name ';'
2447 *
2448 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2449 * if non-NULL *str will have to be freed by the caller.
2450 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002451const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002452htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2453 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002454 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002455 *str = NULL;
2456
2457 if (CUR == '&') {
2458 NEXT;
2459 name = htmlParseName(ctxt);
2460 if (name == NULL) {
2461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2462 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2463 ctxt->wellFormed = 0;
2464 } else {
2465 GROW;
2466 if (CUR == ';') {
2467 *str = name;
2468
2469 /*
2470 * Lookup the entity in the table.
2471 */
2472 ent = htmlEntityLookup(name);
2473 if (ent != NULL) /* OK that's ugly !!! */
2474 NEXT;
2475 } else {
2476 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2477 ctxt->sax->error(ctxt->userData,
2478 "htmlParseEntityRef: expecting ';'\n");
2479 *str = name;
2480 }
2481 }
2482 }
2483 return(ent);
2484}
2485
2486/**
2487 * htmlParseAttValue:
2488 * @ctxt: an HTML parser context
2489 *
2490 * parse a value for an attribute
2491 * Note: the parser won't do substitution of entities here, this
2492 * will be handled later in xmlStringGetNodeList, unless it was
2493 * asked for ctxt->replaceEntities != 0
2494 *
2495 * Returns the AttValue parsed or NULL.
2496 */
2497
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002498static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002499htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2500 xmlChar *ret = NULL;
2501
2502 if (CUR == '"') {
2503 NEXT;
2504 ret = htmlParseHTMLAttribute(ctxt, '"');
2505 if (CUR != '"') {
2506 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2507 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2508 ctxt->wellFormed = 0;
2509 } else
2510 NEXT;
2511 } else if (CUR == '\'') {
2512 NEXT;
2513 ret = htmlParseHTMLAttribute(ctxt, '\'');
2514 if (CUR != '\'') {
2515 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2516 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2517 ctxt->wellFormed = 0;
2518 } else
2519 NEXT;
2520 } else {
2521 /*
2522 * That's an HTMLism, the attribute value may not be quoted
2523 */
2524 ret = htmlParseHTMLAttribute(ctxt, 0);
2525 if (ret == NULL) {
2526 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2527 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2528 ctxt->wellFormed = 0;
2529 }
2530 }
2531 return(ret);
2532}
2533
2534/**
2535 * htmlParseSystemLiteral:
2536 * @ctxt: an HTML parser context
2537 *
2538 * parse an HTML Literal
2539 *
2540 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2541 *
2542 * Returns the SystemLiteral parsed or NULL
2543 */
2544
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002545static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002546htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2547 const xmlChar *q;
2548 xmlChar *ret = NULL;
2549
2550 if (CUR == '"') {
2551 NEXT;
2552 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002553 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002554 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002555 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002556 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2557 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2558 ctxt->wellFormed = 0;
2559 } else {
2560 ret = xmlStrndup(q, CUR_PTR - q);
2561 NEXT;
2562 }
2563 } else if (CUR == '\'') {
2564 NEXT;
2565 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002566 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002567 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002568 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002569 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2570 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2571 ctxt->wellFormed = 0;
2572 } else {
2573 ret = xmlStrndup(q, CUR_PTR - q);
2574 NEXT;
2575 }
2576 } else {
2577 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2578 ctxt->sax->error(ctxt->userData,
2579 "SystemLiteral \" or ' expected\n");
2580 ctxt->wellFormed = 0;
2581 }
2582
2583 return(ret);
2584}
2585
2586/**
2587 * htmlParsePubidLiteral:
2588 * @ctxt: an HTML parser context
2589 *
2590 * parse an HTML public literal
2591 *
2592 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2593 *
2594 * Returns the PubidLiteral parsed or NULL.
2595 */
2596
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002597static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002598htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2599 const xmlChar *q;
2600 xmlChar *ret = NULL;
2601 /*
2602 * Name ::= (Letter | '_') (NameChar)*
2603 */
2604 if (CUR == '"') {
2605 NEXT;
2606 q = CUR_PTR;
2607 while (IS_PUBIDCHAR(CUR)) NEXT;
2608 if (CUR != '"') {
2609 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2610 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2611 ctxt->wellFormed = 0;
2612 } else {
2613 ret = xmlStrndup(q, CUR_PTR - q);
2614 NEXT;
2615 }
2616 } else if (CUR == '\'') {
2617 NEXT;
2618 q = CUR_PTR;
Daniel Veillard6560a422003-03-27 21:25:38 +00002619 while ((IS_PUBIDCHAR(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002620 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002621 if (CUR != '\'') {
Owen Taylor3473f882001-02-23 17:55:21 +00002622 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2623 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2624 ctxt->wellFormed = 0;
2625 } else {
2626 ret = xmlStrndup(q, CUR_PTR - q);
2627 NEXT;
2628 }
2629 } else {
2630 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2631 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2632 ctxt->wellFormed = 0;
2633 }
2634
2635 return(ret);
2636}
2637
2638/**
2639 * htmlParseScript:
2640 * @ctxt: an HTML parser context
2641 *
2642 * parse the content of an HTML SCRIPT or STYLE element
2643 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2644 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2645 * http://www.w3.org/TR/html4/types.html#type-script
2646 * http://www.w3.org/TR/html4/types.html#h-6.15
2647 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2648 *
2649 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2650 * element and the value of intrinsic event attributes. User agents must
2651 * not evaluate script data as HTML markup but instead must pass it on as
2652 * data to a script engine.
2653 * NOTES:
2654 * - The content is passed like CDATA
2655 * - the attributes for style and scripting "onXXX" are also described
2656 * as CDATA but SGML allows entities references in attributes so their
2657 * processing is identical as other attributes
2658 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002659static void
Owen Taylor3473f882001-02-23 17:55:21 +00002660htmlParseScript(htmlParserCtxtPtr ctxt) {
2661 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2662 int nbchar = 0;
2663 xmlChar cur;
2664
2665 SHRINK;
2666 cur = CUR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002667 while (IS_CHAR((unsigned int) cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002668 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2669 (NXT(3) == '-')) {
2670 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2671 if (ctxt->sax->cdataBlock!= NULL) {
2672 /*
2673 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2674 */
2675 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002676 } else if (ctxt->sax->characters != NULL) {
2677 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002678 }
2679 }
2680 nbchar = 0;
2681 htmlParseComment(ctxt);
2682 cur = CUR;
2683 continue;
2684 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002685 /*
2686 * One should break here, the specification is clear:
2687 * Authors should therefore escape "</" within the content.
2688 * Escape mechanisms are specific to each scripting or
2689 * style sheet language.
2690 */
2691 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2692 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2693 break; /* while */
2694 }
2695 buf[nbchar++] = cur;
2696 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2697 if (ctxt->sax->cdataBlock!= NULL) {
2698 /*
2699 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2700 */
2701 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002702 } else if (ctxt->sax->characters != NULL) {
2703 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002704 }
2705 nbchar = 0;
2706 }
2707 NEXT;
2708 cur = CUR;
2709 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00002710 if (!(IS_CHAR((unsigned int) cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2712 ctxt->sax->error(ctxt->userData,
2713 "Invalid char in CDATA 0x%X\n", cur);
2714 ctxt->wellFormed = 0;
2715 NEXT;
2716 }
2717
2718 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2719 if (ctxt->sax->cdataBlock!= NULL) {
2720 /*
2721 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2722 */
2723 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002724 } else if (ctxt->sax->characters != NULL) {
2725 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002726 }
2727 }
2728}
2729
2730
2731/**
2732 * htmlParseCharData:
2733 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002734 *
2735 * parse a CharData section.
2736 * if we are within a CDATA section ']]>' marks an end of section.
2737 *
2738 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2739 */
2740
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002741static void
2742htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002743 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2744 int nbchar = 0;
2745 int cur, l;
2746
2747 SHRINK;
2748 cur = CUR_CHAR(l);
2749 while (((cur != '<') || (ctxt->token == '<')) &&
2750 ((cur != '&') || (ctxt->token == '&')) &&
2751 (IS_CHAR(cur))) {
2752 COPY_BUF(l,buf,nbchar,cur);
2753 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2754 /*
2755 * Ok the segment is to be consumed as chars.
2756 */
2757 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2758 if (areBlanks(ctxt, buf, nbchar)) {
2759 if (ctxt->sax->ignorableWhitespace != NULL)
2760 ctxt->sax->ignorableWhitespace(ctxt->userData,
2761 buf, nbchar);
2762 } else {
2763 htmlCheckParagraph(ctxt);
2764 if (ctxt->sax->characters != NULL)
2765 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2766 }
2767 }
2768 nbchar = 0;
2769 }
2770 NEXTL(l);
2771 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002772 if (cur == 0) {
2773 SHRINK;
2774 GROW;
2775 cur = CUR_CHAR(l);
2776 }
Owen Taylor3473f882001-02-23 17:55:21 +00002777 }
2778 if (nbchar != 0) {
2779 /*
2780 * Ok the segment is to be consumed as chars.
2781 */
2782 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2783 if (areBlanks(ctxt, buf, nbchar)) {
2784 if (ctxt->sax->ignorableWhitespace != NULL)
2785 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2786 } else {
2787 htmlCheckParagraph(ctxt);
2788 if (ctxt->sax->characters != NULL)
2789 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2790 }
2791 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002792 } else {
2793 /*
2794 * Loop detection
2795 */
2796 if (cur == 0)
2797 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002798 }
2799}
2800
2801/**
2802 * htmlParseExternalID:
2803 * @ctxt: an HTML parser context
2804 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002805 *
2806 * Parse an External ID or a Public ID
2807 *
Owen Taylor3473f882001-02-23 17:55:21 +00002808 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2809 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2810 *
2811 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2812 *
2813 * Returns the function returns SystemLiteral and in the second
2814 * case publicID receives PubidLiteral, is strict is off
2815 * it is possible to return NULL and have publicID set.
2816 */
2817
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002818static xmlChar *
2819htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002820 xmlChar *URI = NULL;
2821
2822 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2823 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2824 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2825 SKIP(6);
2826 if (!IS_BLANK(CUR)) {
2827 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2828 ctxt->sax->error(ctxt->userData,
2829 "Space required after 'SYSTEM'\n");
2830 ctxt->wellFormed = 0;
2831 }
2832 SKIP_BLANKS;
2833 URI = htmlParseSystemLiteral(ctxt);
2834 if (URI == NULL) {
2835 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2836 ctxt->sax->error(ctxt->userData,
2837 "htmlParseExternalID: SYSTEM, no URI\n");
2838 ctxt->wellFormed = 0;
2839 }
2840 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2841 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2842 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2843 SKIP(6);
2844 if (!IS_BLANK(CUR)) {
2845 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2846 ctxt->sax->error(ctxt->userData,
2847 "Space required after 'PUBLIC'\n");
2848 ctxt->wellFormed = 0;
2849 }
2850 SKIP_BLANKS;
2851 *publicID = htmlParsePubidLiteral(ctxt);
2852 if (*publicID == NULL) {
2853 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2854 ctxt->sax->error(ctxt->userData,
2855 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2856 ctxt->wellFormed = 0;
2857 }
2858 SKIP_BLANKS;
2859 if ((CUR == '"') || (CUR == '\'')) {
2860 URI = htmlParseSystemLiteral(ctxt);
2861 }
2862 }
2863 return(URI);
2864}
2865
2866/**
2867 * htmlParseComment:
2868 * @ctxt: an HTML parser context
2869 *
2870 * Parse an XML (SGML) comment <!-- .... -->
2871 *
2872 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2873 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002874static void
Owen Taylor3473f882001-02-23 17:55:21 +00002875htmlParseComment(htmlParserCtxtPtr ctxt) {
2876 xmlChar *buf = NULL;
2877 int len;
2878 int size = HTML_PARSER_BUFFER_SIZE;
2879 int q, ql;
2880 int r, rl;
2881 int cur, l;
2882 xmlParserInputState state;
2883
2884 /*
2885 * Check that there is a comment right here.
2886 */
2887 if ((RAW != '<') || (NXT(1) != '!') ||
2888 (NXT(2) != '-') || (NXT(3) != '-')) return;
2889
2890 state = ctxt->instate;
2891 ctxt->instate = XML_PARSER_COMMENT;
2892 SHRINK;
2893 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002894 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002895 if (buf == NULL) {
2896 xmlGenericError(xmlGenericErrorContext,
2897 "malloc of %d byte failed\n", size);
2898 ctxt->instate = state;
2899 return;
2900 }
2901 q = CUR_CHAR(ql);
2902 NEXTL(ql);
2903 r = CUR_CHAR(rl);
2904 NEXTL(rl);
2905 cur = CUR_CHAR(l);
2906 len = 0;
2907 while (IS_CHAR(cur) &&
2908 ((cur != '>') ||
2909 (r != '-') || (q != '-'))) {
2910 if (len + 5 >= size) {
2911 size *= 2;
2912 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2913 if (buf == NULL) {
2914 xmlGenericError(xmlGenericErrorContext,
2915 "realloc of %d byte failed\n", size);
2916 ctxt->instate = state;
2917 return;
2918 }
2919 }
2920 COPY_BUF(ql,buf,len,q);
2921 q = r;
2922 ql = rl;
2923 r = cur;
2924 rl = l;
2925 NEXTL(l);
2926 cur = CUR_CHAR(l);
2927 if (cur == 0) {
2928 SHRINK;
2929 GROW;
2930 cur = CUR_CHAR(l);
2931 }
2932 }
2933 buf[len] = 0;
2934 if (!IS_CHAR(cur)) {
2935 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2936 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2937 ctxt->sax->error(ctxt->userData,
2938 "Comment not terminated \n<!--%.50s\n", buf);
2939 ctxt->wellFormed = 0;
2940 xmlFree(buf);
2941 } else {
2942 NEXT;
2943 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2944 (!ctxt->disableSAX))
2945 ctxt->sax->comment(ctxt->userData, buf);
2946 xmlFree(buf);
2947 }
2948 ctxt->instate = state;
2949}
2950
2951/**
2952 * htmlParseCharRef:
2953 * @ctxt: an HTML parser context
2954 *
2955 * parse Reference declarations
2956 *
2957 * [66] CharRef ::= '&#' [0-9]+ ';' |
2958 * '&#x' [0-9a-fA-F]+ ';'
2959 *
2960 * Returns the value parsed (as an int)
2961 */
2962int
2963htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2964 int val = 0;
2965
2966 if ((CUR == '&') && (NXT(1) == '#') &&
2967 (NXT(2) == 'x')) {
2968 SKIP(3);
2969 while (CUR != ';') {
2970 if ((CUR >= '0') && (CUR <= '9'))
2971 val = val * 16 + (CUR - '0');
2972 else if ((CUR >= 'a') && (CUR <= 'f'))
2973 val = val * 16 + (CUR - 'a') + 10;
2974 else if ((CUR >= 'A') && (CUR <= 'F'))
2975 val = val * 16 + (CUR - 'A') + 10;
2976 else {
2977 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2978 ctxt->sax->error(ctxt->userData,
2979 "htmlParseCharRef: invalid hexadecimal value\n");
2980 ctxt->wellFormed = 0;
2981 return(0);
2982 }
2983 NEXT;
2984 }
2985 if (CUR == ';')
2986 NEXT;
2987 } else if ((CUR == '&') && (NXT(1) == '#')) {
2988 SKIP(2);
2989 while (CUR != ';') {
2990 if ((CUR >= '0') && (CUR <= '9'))
2991 val = val * 10 + (CUR - '0');
2992 else {
2993 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2994 ctxt->sax->error(ctxt->userData,
2995 "htmlParseCharRef: invalid decimal value\n");
2996 ctxt->wellFormed = 0;
2997 return(0);
2998 }
2999 NEXT;
3000 }
3001 if (CUR == ';')
3002 NEXT;
3003 } else {
3004 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3005 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
3006 ctxt->wellFormed = 0;
3007 }
3008 /*
3009 * Check the value IS_CHAR ...
3010 */
3011 if (IS_CHAR(val)) {
3012 return(val);
3013 } else {
3014 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3015 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
3016 val);
3017 ctxt->wellFormed = 0;
3018 }
3019 return(0);
3020}
3021
3022
3023/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003024 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003025 * @ctxt: an HTML parser context
3026 *
3027 * parse a DOCTYPE declaration
3028 *
3029 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3030 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3031 */
3032
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003033static void
Owen Taylor3473f882001-02-23 17:55:21 +00003034htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003035 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003036 xmlChar *ExternalID = NULL;
3037 xmlChar *URI = NULL;
3038
3039 /*
3040 * We know that '<!DOCTYPE' has been detected.
3041 */
3042 SKIP(9);
3043
3044 SKIP_BLANKS;
3045
3046 /*
3047 * Parse the DOCTYPE name.
3048 */
3049 name = htmlParseName(ctxt);
3050 if (name == NULL) {
3051 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3052 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
3053 ctxt->wellFormed = 0;
3054 }
3055 /*
3056 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3057 */
3058
3059 SKIP_BLANKS;
3060
3061 /*
3062 * Check for SystemID and ExternalID
3063 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003064 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003065 SKIP_BLANKS;
3066
3067 /*
3068 * We should be at the end of the DOCTYPE declaration.
3069 */
3070 if (CUR != '>') {
3071 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00003072 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003073 ctxt->wellFormed = 0;
3074 /* We shouldn't try to resynchronize ... */
3075 }
3076 NEXT;
3077
3078 /*
3079 * Create or update the document accordingly to the DOCTYPE
3080 */
3081 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3082 (!ctxt->disableSAX))
3083 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3084
3085 /*
3086 * Cleanup, since we don't use all those identifiers
3087 */
3088 if (URI != NULL) xmlFree(URI);
3089 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003090}
3091
3092/**
3093 * htmlParseAttribute:
3094 * @ctxt: an HTML parser context
3095 * @value: a xmlChar ** used to store the value of the attribute
3096 *
3097 * parse an attribute
3098 *
3099 * [41] Attribute ::= Name Eq AttValue
3100 *
3101 * [25] Eq ::= S? '=' S?
3102 *
3103 * With namespace:
3104 *
3105 * [NS 11] Attribute ::= QName Eq AttValue
3106 *
3107 * Also the case QName == xmlns:??? is handled independently as a namespace
3108 * definition.
3109 *
3110 * Returns the attribute name, and the value in *value.
3111 */
3112
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003113static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003114htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003115 const xmlChar *name;
3116 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003117
3118 *value = NULL;
3119 name = htmlParseHTMLName(ctxt);
3120 if (name == NULL) {
3121 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3122 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3123 ctxt->wellFormed = 0;
3124 return(NULL);
3125 }
3126
3127 /*
3128 * read the value
3129 */
3130 SKIP_BLANKS;
3131 if (CUR == '=') {
3132 NEXT;
3133 SKIP_BLANKS;
3134 val = htmlParseAttValue(ctxt);
3135 /******
3136 } else {
3137 * TODO : some attribute must have values, some may not
3138 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3139 ctxt->sax->warning(ctxt->userData,
3140 "No value for attribute %s\n", name); */
3141 }
3142
3143 *value = val;
3144 return(name);
3145}
3146
3147/**
3148 * htmlCheckEncoding:
3149 * @ctxt: an HTML parser context
3150 * @attvalue: the attribute value
3151 *
3152 * Checks an http-equiv attribute from a Meta tag to detect
3153 * the encoding
3154 * If a new encoding is detected the parser is switched to decode
3155 * it and pass UTF8
3156 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003157static void
Owen Taylor3473f882001-02-23 17:55:21 +00003158htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3159 const xmlChar *encoding;
3160
3161 if ((ctxt == NULL) || (attvalue == NULL))
3162 return;
3163
3164 /* do not change encoding */
3165 if (ctxt->input->encoding != NULL)
3166 return;
3167
3168 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3169 if (encoding != NULL) {
3170 encoding += 8;
3171 } else {
3172 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3173 if (encoding != NULL)
3174 encoding += 9;
3175 }
3176 if (encoding != NULL) {
3177 xmlCharEncoding enc;
3178 xmlCharEncodingHandlerPtr handler;
3179
3180 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3181
3182 if (ctxt->input->encoding != NULL)
3183 xmlFree((xmlChar *) ctxt->input->encoding);
3184 ctxt->input->encoding = xmlStrdup(encoding);
3185
3186 enc = xmlParseCharEncoding((const char *) encoding);
3187 /*
3188 * registered set of known encodings
3189 */
3190 if (enc != XML_CHAR_ENCODING_ERROR) {
3191 xmlSwitchEncoding(ctxt, enc);
3192 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3193 } else {
3194 /*
3195 * fallback for unknown encodings
3196 */
3197 handler = xmlFindCharEncodingHandler((const char *) encoding);
3198 if (handler != NULL) {
3199 xmlSwitchToEncoding(ctxt, handler);
3200 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3201 } else {
3202 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3203 }
3204 }
3205
3206 if ((ctxt->input->buf != NULL) &&
3207 (ctxt->input->buf->encoder != NULL) &&
3208 (ctxt->input->buf->raw != NULL) &&
3209 (ctxt->input->buf->buffer != NULL)) {
3210 int nbchars;
3211 int processed;
3212
3213 /*
3214 * convert as much as possible to the parser reading buffer.
3215 */
3216 processed = ctxt->input->cur - ctxt->input->base;
3217 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3218 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3219 ctxt->input->buf->buffer,
3220 ctxt->input->buf->raw);
3221 if (nbchars < 0) {
3222 ctxt->errNo = XML_ERR_INVALID_ENCODING;
3223 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3224 ctxt->sax->error(ctxt->userData,
3225 "htmlCheckEncoding: encoder error\n");
3226 }
3227 ctxt->input->base =
3228 ctxt->input->cur = ctxt->input->buf->buffer->content;
3229 }
3230 }
3231}
3232
3233/**
3234 * htmlCheckMeta:
3235 * @ctxt: an HTML parser context
3236 * @atts: the attributes values
3237 *
3238 * Checks an attributes from a Meta tag
3239 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003240static void
Owen Taylor3473f882001-02-23 17:55:21 +00003241htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3242 int i;
3243 const xmlChar *att, *value;
3244 int http = 0;
3245 const xmlChar *content = NULL;
3246
3247 if ((ctxt == NULL) || (atts == NULL))
3248 return;
3249
3250 i = 0;
3251 att = atts[i++];
3252 while (att != NULL) {
3253 value = atts[i++];
3254 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3255 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3256 http = 1;
3257 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3258 content = value;
3259 att = atts[i++];
3260 }
3261 if ((http) && (content != NULL))
3262 htmlCheckEncoding(ctxt, content);
3263
3264}
3265
3266/**
3267 * htmlParseStartTag:
3268 * @ctxt: an HTML parser context
3269 *
3270 * parse a start of tag either for rule element or
3271 * EmptyElement. In both case we don't parse the tag closing chars.
3272 *
3273 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3274 *
3275 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3276 *
3277 * With namespace:
3278 *
3279 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3280 *
3281 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3282 *
3283 */
3284
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003285static void
Owen Taylor3473f882001-02-23 17:55:21 +00003286htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003287 const xmlChar *name;
3288 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003289 xmlChar *attvalue;
3290 const xmlChar **atts = NULL;
3291 int nbatts = 0;
3292 int maxatts = 0;
3293 int meta = 0;
3294 int i;
3295
3296 if (CUR != '<') return;
3297 NEXT;
3298
3299 GROW;
3300 name = htmlParseHTMLName(ctxt);
3301 if (name == NULL) {
3302 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3303 ctxt->sax->error(ctxt->userData,
3304 "htmlParseStartTag: invalid element name\n");
3305 ctxt->wellFormed = 0;
3306 /* Dump the bogus tag like browsers do */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003307 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003308 NEXT;
3309 return;
3310 }
3311 if (xmlStrEqual(name, BAD_CAST"meta"))
3312 meta = 1;
3313
3314 /*
3315 * Check for auto-closure of HTML elements.
3316 */
3317 htmlAutoClose(ctxt, name);
3318
3319 /*
3320 * Check for implied HTML elements.
3321 */
3322 htmlCheckImplied(ctxt, name);
3323
3324 /*
3325 * Avoid html at any level > 0, head at any level != 1
3326 * or any attempt to recurse body
3327 */
3328 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3329 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3330 ctxt->sax->error(ctxt->userData,
3331 "htmlParseStartTag: misplaced <html> tag\n");
3332 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003333 return;
3334 }
3335 if ((ctxt->nameNr != 1) &&
3336 (xmlStrEqual(name, BAD_CAST"head"))) {
3337 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3338 ctxt->sax->error(ctxt->userData,
3339 "htmlParseStartTag: misplaced <head> tag\n");
3340 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003341 return;
3342 }
3343 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003344 int indx;
3345 for (indx = 0;indx < ctxt->nameNr;indx++) {
3346 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003347 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3348 ctxt->sax->error(ctxt->userData,
3349 "htmlParseStartTag: misplaced <body> tag\n");
3350 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003351 return;
3352 }
3353 }
3354 }
3355
3356 /*
3357 * Now parse the attributes, it ends up with the ending
3358 *
3359 * (S Attribute)* S?
3360 */
3361 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003362 while ((IS_CHAR((unsigned int) CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003363 (CUR != '>') &&
3364 ((CUR != '/') || (NXT(1) != '>'))) {
3365 long cons = ctxt->nbChars;
3366
3367 GROW;
3368 attname = htmlParseAttribute(ctxt, &attvalue);
3369 if (attname != NULL) {
3370
3371 /*
3372 * Well formedness requires at most one declaration of an attribute
3373 */
3374 for (i = 0; i < nbatts;i += 2) {
3375 if (xmlStrEqual(atts[i], attname)) {
3376 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3377 ctxt->sax->error(ctxt->userData,
3378 "Attribute %s redefined\n",
3379 attname);
3380 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003381 if (attvalue != NULL)
3382 xmlFree(attvalue);
3383 goto failed;
3384 }
3385 }
3386
3387 /*
3388 * Add the pair to atts
3389 */
3390 if (atts == NULL) {
3391 maxatts = 10;
3392 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3393 if (atts == NULL) {
3394 xmlGenericError(xmlGenericErrorContext,
3395 "malloc of %ld byte failed\n",
3396 maxatts * (long)sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003397 return;
3398 }
3399 } else if (nbatts + 4 > maxatts) {
3400 maxatts *= 2;
3401 atts = (const xmlChar **) xmlRealloc((void *) atts,
3402 maxatts * sizeof(xmlChar *));
3403 if (atts == NULL) {
3404 xmlGenericError(xmlGenericErrorContext,
3405 "realloc of %ld byte failed\n",
3406 maxatts * (long)sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003407 return;
3408 }
3409 }
3410 atts[nbatts++] = attname;
3411 atts[nbatts++] = attvalue;
3412 atts[nbatts] = NULL;
3413 atts[nbatts + 1] = NULL;
3414 }
3415 else {
3416 /* Dump the bogus attribute string up to the next blank or
3417 * the end of the tag. */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003418 while ((IS_CHAR((unsigned int) CUR)) &&
3419 !(IS_BLANK(CUR)) && (CUR != '>') &&
3420 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003421 NEXT;
3422 }
3423
3424failed:
3425 SKIP_BLANKS;
3426 if (cons == ctxt->nbChars) {
3427 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3428 ctxt->sax->error(ctxt->userData,
3429 "htmlParseStartTag: problem parsing attributes\n");
3430 ctxt->wellFormed = 0;
3431 break;
3432 }
3433 }
3434
3435 /*
3436 * Handle specific association to the META tag
3437 */
3438 if (meta)
3439 htmlCheckMeta(ctxt, atts);
3440
3441 /*
3442 * SAX: Start of Element !
3443 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003444 htmlnamePush(ctxt, name);
Owen Taylor3473f882001-02-23 17:55:21 +00003445#ifdef DEBUG
3446 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3447#endif
3448 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3449 ctxt->sax->startElement(ctxt->userData, name, atts);
3450
3451 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003452 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003453 if (atts[i] != NULL)
3454 xmlFree((xmlChar *) atts[i]);
3455 }
3456 xmlFree((void *) atts);
3457 }
Owen Taylor3473f882001-02-23 17:55:21 +00003458}
3459
3460/**
3461 * htmlParseEndTag:
3462 * @ctxt: an HTML parser context
3463 *
3464 * parse an end of tag
3465 *
3466 * [42] ETag ::= '</' Name S? '>'
3467 *
3468 * With namespace
3469 *
3470 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003471 *
3472 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003473 */
3474
Daniel Veillardf420ac52001-07-04 16:04:09 +00003475static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003476htmlParseEndTag(htmlParserCtxtPtr ctxt)
3477{
3478 const xmlChar *name;
3479 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003480 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003481
3482 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003483 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3484 ctxt->sax->error(ctxt->userData,
3485 "htmlParseEndTag: '</' not found\n");
3486 ctxt->wellFormed = 0;
3487 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003488 }
3489 SKIP(2);
3490
3491 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003492 if (name == NULL)
3493 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003494
3495 /*
3496 * We should definitely be at the ending "S? '>'" part
3497 */
3498 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003499 if ((!IS_CHAR((unsigned int) CUR)) || (CUR != '>')) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003500 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3501 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3502 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003503 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003504 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003505
3506 /*
3507 * If the name read is not one of the element in the parsing stack
3508 * then return, it's just an error.
3509 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003510 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3511 if (xmlStrEqual(name, ctxt->nameTab[i]))
3512 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003513 }
3514 if (i < 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003515 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3516 ctxt->sax->error(ctxt->userData,
3517 "Unexpected end tag : %s\n", name);
3518 ctxt->wellFormed = 0;
3519 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003520 }
3521
3522
3523 /*
3524 * Check for auto-closure of HTML elements.
3525 */
3526
3527 htmlAutoCloseOnClose(ctxt, name);
3528
3529 /*
3530 * Well formedness constraints, opening and closing must match.
3531 * With the exception that the autoclose may have popped stuff out
3532 * of the stack.
3533 */
3534 if (!xmlStrEqual(name, ctxt->name)) {
3535#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003536 xmlGenericError(xmlGenericErrorContext,
3537 "End of tag %s: expecting %s\n", name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003538#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003539 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3540 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3541 ctxt->sax->error(ctxt->userData,
3542 "Opening and ending tag mismatch: %s and %s\n",
3543 name, ctxt->name);
3544 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003545 }
3546 }
3547
3548 /*
3549 * SAX: End of Tag
3550 */
3551 oldname = ctxt->name;
3552 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003553 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3554 ctxt->sax->endElement(ctxt->userData, name);
3555 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003556#ifdef DEBUG
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003557 if (oldname != NULL) {
3558 xmlGenericError(xmlGenericErrorContext,
3559 "End of tag %s: popping out %s\n", name,
3560 oldname);
3561 } else {
3562 xmlGenericError(xmlGenericErrorContext,
3563 "End of tag %s: stack empty !!!\n", name);
3564 }
Owen Taylor3473f882001-02-23 17:55:21 +00003565#endif
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003566 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003567 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003568 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003569 }
3570
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003571 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003572}
3573
3574
3575/**
3576 * htmlParseReference:
3577 * @ctxt: an HTML parser context
3578 *
3579 * parse and handle entity references in content,
3580 * this will end-up in a call to character() since this is either a
3581 * CharRef, or a predefined entity.
3582 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003583static void
Owen Taylor3473f882001-02-23 17:55:21 +00003584htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003585 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003586 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003587 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003588 if (CUR != '&') return;
3589
3590 if (NXT(1) == '#') {
3591 unsigned int c;
3592 int bits, i = 0;
3593
3594 c = htmlParseCharRef(ctxt);
3595 if (c == 0)
3596 return;
3597
3598 if (c < 0x80) { out[i++]= c; bits= -6; }
3599 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3600 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3601 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3602
3603 for ( ; bits >= 0; bits-= 6) {
3604 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3605 }
3606 out[i] = 0;
3607
3608 htmlCheckParagraph(ctxt);
3609 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3610 ctxt->sax->characters(ctxt->userData, out, i);
3611 } else {
3612 ent = htmlParseEntityRef(ctxt, &name);
3613 if (name == NULL) {
3614 htmlCheckParagraph(ctxt);
3615 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3616 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3617 return;
3618 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003619 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003620 htmlCheckParagraph(ctxt);
3621 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3622 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3623 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3624 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3625 }
3626 } else {
3627 unsigned int c;
3628 int bits, i = 0;
3629
3630 c = ent->value;
3631 if (c < 0x80)
3632 { out[i++]= c; bits= -6; }
3633 else if (c < 0x800)
3634 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3635 else if (c < 0x10000)
3636 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3637 else
3638 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3639
3640 for ( ; bits >= 0; bits-= 6) {
3641 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3642 }
3643 out[i] = 0;
3644
3645 htmlCheckParagraph(ctxt);
3646 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3647 ctxt->sax->characters(ctxt->userData, out, i);
3648 }
Owen Taylor3473f882001-02-23 17:55:21 +00003649 }
3650}
3651
3652/**
3653 * htmlParseContent:
3654 * @ctxt: an HTML parser context
3655 * @name: the node name
3656 *
3657 * Parse a content: comment, sub-element, reference or text.
3658 *
3659 */
3660
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003661static void
Owen Taylor3473f882001-02-23 17:55:21 +00003662htmlParseContent(htmlParserCtxtPtr ctxt) {
3663 xmlChar *currentNode;
3664 int depth;
3665
3666 currentNode = xmlStrdup(ctxt->name);
3667 depth = ctxt->nameNr;
3668 while (1) {
3669 long cons = ctxt->nbChars;
3670
3671 GROW;
3672 /*
3673 * Our tag or one of it's parent or children is ending.
3674 */
3675 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003676 if (htmlParseEndTag(ctxt) &&
3677 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3678 if (currentNode != NULL)
3679 xmlFree(currentNode);
3680 return;
3681 }
3682 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003683 }
3684
3685 /*
3686 * Has this node been popped out during parsing of
3687 * the next element
3688 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003689 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3690 (!xmlStrEqual(currentNode, ctxt->name)))
3691 {
Owen Taylor3473f882001-02-23 17:55:21 +00003692 if (currentNode != NULL) xmlFree(currentNode);
3693 return;
3694 }
3695
Daniel Veillardf9533d12001-03-03 10:04:57 +00003696 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3697 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003698 /*
3699 * Handle SCRIPT/STYLE separately
3700 */
3701 htmlParseScript(ctxt);
3702 } else {
3703 /*
3704 * Sometimes DOCTYPE arrives in the middle of the document
3705 */
3706 if ((CUR == '<') && (NXT(1) == '!') &&
3707 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3708 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3709 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3710 (UPP(8) == 'E')) {
3711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3712 ctxt->sax->error(ctxt->userData,
3713 "Misplaced DOCTYPE declaration\n");
3714 ctxt->wellFormed = 0;
3715 htmlParseDocTypeDecl(ctxt);
3716 }
3717
3718 /*
3719 * First case : a comment
3720 */
3721 if ((CUR == '<') && (NXT(1) == '!') &&
3722 (NXT(2) == '-') && (NXT(3) == '-')) {
3723 htmlParseComment(ctxt);
3724 }
3725
3726 /*
3727 * Second case : a sub-element.
3728 */
3729 else if (CUR == '<') {
3730 htmlParseElement(ctxt);
3731 }
3732
3733 /*
3734 * Third case : a reference. If if has not been resolved,
3735 * parsing returns it's Name, create the node
3736 */
3737 else if (CUR == '&') {
3738 htmlParseReference(ctxt);
3739 }
3740
3741 /*
3742 * Fourth : end of the resource
3743 */
3744 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003745 htmlAutoCloseOnEnd(ctxt);
3746 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003747 }
3748
3749 /*
3750 * Last case, text. Note that References are handled directly.
3751 */
3752 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003753 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003754 }
3755
3756 if (cons == ctxt->nbChars) {
3757 if (ctxt->node != NULL) {
3758 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3759 ctxt->sax->error(ctxt->userData,
3760 "detected an error in element content\n");
3761 ctxt->wellFormed = 0;
3762 }
3763 break;
3764 }
3765 }
3766 GROW;
3767 }
3768 if (currentNode != NULL) xmlFree(currentNode);
3769}
3770
3771/**
3772 * htmlParseElement:
3773 * @ctxt: an HTML parser context
3774 *
3775 * parse an HTML element, this is highly recursive
3776 *
3777 * [39] element ::= EmptyElemTag | STag content ETag
3778 *
3779 * [41] Attribute ::= Name Eq AttValue
3780 */
3781
3782void
3783htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003784 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003785 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003786 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003787 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003788 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003789 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003790 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003791
3792 /* Capture start position */
3793 if (ctxt->record_info) {
3794 node_info.begin_pos = ctxt->input->consumed +
3795 (CUR_PTR - ctxt->input->base);
3796 node_info.begin_line = ctxt->input->line;
3797 }
3798
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003799 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003800 htmlParseStartTag(ctxt);
3801 name = ctxt->name;
3802#ifdef DEBUG
3803 if (oldname == NULL)
3804 xmlGenericError(xmlGenericErrorContext,
3805 "Start of element %s\n", name);
3806 else if (name == NULL)
3807 xmlGenericError(xmlGenericErrorContext,
3808 "Start of element failed, was %s\n", oldname);
3809 else
3810 xmlGenericError(xmlGenericErrorContext,
3811 "Start of element %s, was %s\n", name, oldname);
3812#endif
3813 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3814 (name == NULL)) {
3815 if (CUR == '>')
3816 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003817 return;
3818 }
Owen Taylor3473f882001-02-23 17:55:21 +00003819
3820 /*
3821 * Lookup the info for that element.
3822 */
3823 info = htmlTagLookup(name);
3824 if (info == NULL) {
3825 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3826 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3827 name);
3828 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003829 }
3830
3831 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003832 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003833 */
3834 if ((CUR == '/') && (NXT(1) == '>')) {
3835 SKIP(2);
3836 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3837 ctxt->sax->endElement(ctxt->userData, name);
3838 oldname = htmlnamePop(ctxt);
3839#ifdef DEBUG
3840 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3841#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003842 return;
3843 }
3844
3845 if (CUR == '>') {
3846 NEXT;
3847 } else {
3848 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3849 ctxt->sax->error(ctxt->userData,
3850 "Couldn't find end of Start Tag %s\n",
3851 name);
3852 ctxt->wellFormed = 0;
3853
3854 /*
3855 * end of parsing of this node.
3856 */
3857 if (xmlStrEqual(name, ctxt->name)) {
3858 nodePop(ctxt);
3859 oldname = htmlnamePop(ctxt);
3860#ifdef DEBUG
3861 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3862#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003863 }
3864
3865 /*
3866 * Capture end position and add node
3867 */
3868 if ( currentNode != NULL && ctxt->record_info ) {
3869 node_info.end_pos = ctxt->input->consumed +
3870 (CUR_PTR - ctxt->input->base);
3871 node_info.end_line = ctxt->input->line;
3872 node_info.node = ctxt->node;
3873 xmlParserAddNodeInfo(ctxt, &node_info);
3874 }
3875 return;
3876 }
3877
3878 /*
3879 * Check for an Empty Element from DTD definition
3880 */
3881 if ((info != NULL) && (info->empty)) {
3882 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3883 ctxt->sax->endElement(ctxt->userData, name);
3884 oldname = htmlnamePop(ctxt);
3885#ifdef DEBUG
3886 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3887#endif
Owen Taylor3473f882001-02-23 17:55:21 +00003888 return;
3889 }
3890
3891 /*
3892 * Parse the content of the element:
3893 */
3894 currentNode = xmlStrdup(ctxt->name);
3895 depth = ctxt->nameNr;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003896 while (IS_CHAR((unsigned int) CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003897 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003898 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003899 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003900 if (ctxt->nameNr < depth) break;
3901 }
3902
Owen Taylor3473f882001-02-23 17:55:21 +00003903 /*
3904 * Capture end position and add node
3905 */
3906 if ( currentNode != NULL && ctxt->record_info ) {
3907 node_info.end_pos = ctxt->input->consumed +
3908 (CUR_PTR - ctxt->input->base);
3909 node_info.end_line = ctxt->input->line;
3910 node_info.node = ctxt->node;
3911 xmlParserAddNodeInfo(ctxt, &node_info);
3912 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00003913 if (!IS_CHAR((unsigned int) CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003914 htmlAutoCloseOnEnd(ctxt);
3915 }
3916
Owen Taylor3473f882001-02-23 17:55:21 +00003917 if (currentNode != NULL)
3918 xmlFree(currentNode);
3919}
3920
3921/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003922 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003923 * @ctxt: an HTML parser context
3924 *
3925 * parse an HTML document (and build a tree if using the standard SAX
3926 * interface).
3927 *
3928 * Returns 0, -1 in case of error. the parser context is augmented
3929 * as a result of the parsing.
3930 */
3931
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003932int
Owen Taylor3473f882001-02-23 17:55:21 +00003933htmlParseDocument(htmlParserCtxtPtr ctxt) {
3934 xmlDtdPtr dtd;
3935
Daniel Veillardd0463562001-10-13 09:15:48 +00003936 xmlInitParser();
3937
Owen Taylor3473f882001-02-23 17:55:21 +00003938 htmlDefaultSAXHandlerInit();
3939 ctxt->html = 1;
3940
3941 GROW;
3942 /*
3943 * SAX: beginning of the document processing.
3944 */
3945 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3946 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3947
3948 /*
3949 * Wipe out everything which is before the first '<'
3950 */
3951 SKIP_BLANKS;
3952 if (CUR == 0) {
3953 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3954 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3955 ctxt->wellFormed = 0;
3956 }
3957
3958 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3959 ctxt->sax->startDocument(ctxt->userData);
3960
3961
3962 /*
3963 * Parse possible comments before any content
3964 */
3965 while ((CUR == '<') && (NXT(1) == '!') &&
3966 (NXT(2) == '-') && (NXT(3) == '-')) {
3967 htmlParseComment(ctxt);
3968 SKIP_BLANKS;
3969 }
3970
3971
3972 /*
3973 * Then possibly doc type declaration(s) and more Misc
3974 * (doctypedecl Misc*)?
3975 */
3976 if ((CUR == '<') && (NXT(1) == '!') &&
3977 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3978 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3979 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3980 (UPP(8) == 'E')) {
3981 htmlParseDocTypeDecl(ctxt);
3982 }
3983 SKIP_BLANKS;
3984
3985 /*
3986 * Parse possible comments before any content
3987 */
3988 while ((CUR == '<') && (NXT(1) == '!') &&
3989 (NXT(2) == '-') && (NXT(3) == '-')) {
3990 htmlParseComment(ctxt);
3991 SKIP_BLANKS;
3992 }
3993
3994 /*
3995 * Time to start parsing the tree itself
3996 */
3997 htmlParseContent(ctxt);
3998
3999 /*
4000 * autoclose
4001 */
4002 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004003 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004004
4005
4006 /*
4007 * SAX: end of the document processing.
4008 */
4009 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4010 ctxt->sax->endDocument(ctxt->userData);
4011
4012 if (ctxt->myDoc != NULL) {
4013 dtd = xmlGetIntSubset(ctxt->myDoc);
4014 if (dtd == NULL)
4015 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004016 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004017 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4018 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4019 }
4020 if (! ctxt->wellFormed) return(-1);
4021 return(0);
4022}
4023
4024
4025/************************************************************************
4026 * *
4027 * Parser contexts handling *
4028 * *
4029 ************************************************************************/
4030
4031/**
4032 * xmlInitParserCtxt:
4033 * @ctxt: an HTML parser context
4034 *
4035 * Initialize a parser context
4036 */
4037
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004038static void
Owen Taylor3473f882001-02-23 17:55:21 +00004039htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4040{
4041 htmlSAXHandler *sax;
4042
4043 if (ctxt == NULL) return;
4044 memset(ctxt, 0, sizeof(htmlParserCtxt));
4045
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004046 ctxt->dict = xmlDictCreate();
4047 if (ctxt->dict == NULL) {
4048 xmlGenericError(xmlGenericErrorContext,
4049 "xmlInitParserCtxt: out of memory\n");
4050 return;
4051 }
Owen Taylor3473f882001-02-23 17:55:21 +00004052 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4053 if (sax == NULL) {
4054 xmlGenericError(xmlGenericErrorContext,
4055 "htmlInitParserCtxt: out of memory\n");
4056 }
4057 else
4058 memset(sax, 0, sizeof(htmlSAXHandler));
4059
4060 /* Allocate the Input stack */
4061 ctxt->inputTab = (htmlParserInputPtr *)
4062 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4063 if (ctxt->inputTab == NULL) {
4064 xmlGenericError(xmlGenericErrorContext,
4065 "htmlInitParserCtxt: out of memory\n");
4066 ctxt->inputNr = 0;
4067 ctxt->inputMax = 0;
4068 ctxt->input = NULL;
4069 return;
4070 }
4071 ctxt->inputNr = 0;
4072 ctxt->inputMax = 5;
4073 ctxt->input = NULL;
4074 ctxt->version = NULL;
4075 ctxt->encoding = NULL;
4076 ctxt->standalone = -1;
4077 ctxt->instate = XML_PARSER_START;
4078
4079 /* Allocate the Node stack */
4080 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4081 if (ctxt->nodeTab == NULL) {
4082 xmlGenericError(xmlGenericErrorContext,
4083 "htmlInitParserCtxt: out of memory\n");
4084 ctxt->nodeNr = 0;
4085 ctxt->nodeMax = 0;
4086 ctxt->node = NULL;
4087 ctxt->inputNr = 0;
4088 ctxt->inputMax = 0;
4089 ctxt->input = NULL;
4090 return;
4091 }
4092 ctxt->nodeNr = 0;
4093 ctxt->nodeMax = 10;
4094 ctxt->node = NULL;
4095
4096 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004097 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004098 if (ctxt->nameTab == NULL) {
4099 xmlGenericError(xmlGenericErrorContext,
4100 "htmlInitParserCtxt: out of memory\n");
4101 ctxt->nameNr = 0;
4102 ctxt->nameMax = 10;
4103 ctxt->name = NULL;
4104 ctxt->nodeNr = 0;
4105 ctxt->nodeMax = 0;
4106 ctxt->node = NULL;
4107 ctxt->inputNr = 0;
4108 ctxt->inputMax = 0;
4109 ctxt->input = NULL;
4110 return;
4111 }
4112 ctxt->nameNr = 0;
4113 ctxt->nameMax = 10;
4114 ctxt->name = NULL;
4115
Daniel Veillard092643b2003-09-25 14:29:29 +00004116 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004117 else {
4118 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004119 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004120 }
4121 ctxt->userData = ctxt;
4122 ctxt->myDoc = NULL;
4123 ctxt->wellFormed = 1;
4124 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004125 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004126 ctxt->html = 1;
4127 ctxt->record_info = 0;
4128 ctxt->validate = 0;
4129 ctxt->nbChars = 0;
4130 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004131 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004132 xmlInitNodeInfoSeq(&ctxt->node_seq);
4133}
4134
4135/**
4136 * htmlFreeParserCtxt:
4137 * @ctxt: an HTML parser context
4138 *
4139 * Free all the memory used by a parser context. However the parsed
4140 * document in ctxt->myDoc is not freed.
4141 */
4142
4143void
4144htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4145{
4146 xmlFreeParserCtxt(ctxt);
4147}
4148
4149/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004150 * htmlNewParserCtxt:
4151 *
4152 * Allocate and initialize a new parser context.
4153 *
4154 * Returns the xmlParserCtxtPtr or NULL
4155 */
4156
4157static htmlParserCtxtPtr
4158htmlNewParserCtxt(void)
4159{
4160 xmlParserCtxtPtr ctxt;
4161
4162 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4163 if (ctxt == NULL) {
4164 xmlGenericError(xmlGenericErrorContext,
4165 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004166 return(NULL);
4167 }
4168 memset(ctxt, 0, sizeof(xmlParserCtxt));
4169 htmlInitParserCtxt(ctxt);
4170 return(ctxt);
4171}
4172
4173/**
4174 * htmlCreateMemoryParserCtxt:
4175 * @buffer: a pointer to a char array
4176 * @size: the size of the array
4177 *
4178 * Create a parser context for an HTML in-memory document.
4179 *
4180 * Returns the new parser context or NULL
4181 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004182htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004183htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4184 xmlParserCtxtPtr ctxt;
4185 xmlParserInputPtr input;
4186 xmlParserInputBufferPtr buf;
4187
4188 if (buffer == NULL)
4189 return(NULL);
4190 if (size <= 0)
4191 return(NULL);
4192
4193 ctxt = htmlNewParserCtxt();
4194 if (ctxt == NULL)
4195 return(NULL);
4196
4197 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4198 if (buf == NULL) return(NULL);
4199
4200 input = xmlNewInputStream(ctxt);
4201 if (input == NULL) {
4202 xmlFreeParserCtxt(ctxt);
4203 return(NULL);
4204 }
4205
4206 input->filename = NULL;
4207 input->buf = buf;
4208 input->base = input->buf->buffer->content;
4209 input->cur = input->buf->buffer->content;
4210 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4211
4212 inputPush(ctxt, input);
4213 return(ctxt);
4214}
4215
4216/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004217 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004218 * @cur: a pointer to an array of xmlChar
4219 * @encoding: a free form C string describing the HTML document encoding, or NULL
4220 *
4221 * Create a parser context for an HTML document.
4222 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004223 * TODO: check the need to add encoding handling there
4224 *
Owen Taylor3473f882001-02-23 17:55:21 +00004225 * Returns the new parser context or NULL
4226 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004227static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004228htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004229 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004230 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004231
Daniel Veillard1d995272002-07-22 16:43:32 +00004232 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004233 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004234 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004235 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4236
4237 if (encoding != NULL) {
4238 xmlCharEncoding enc;
4239 xmlCharEncodingHandlerPtr handler;
4240
4241 if (ctxt->input->encoding != NULL)
4242 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004243 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004244
4245 enc = xmlParseCharEncoding(encoding);
4246 /*
4247 * registered set of known encodings
4248 */
4249 if (enc != XML_CHAR_ENCODING_ERROR) {
4250 xmlSwitchEncoding(ctxt, enc);
4251 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4252 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4253 ctxt->sax->error(ctxt->userData,
4254 "Unsupported encoding %s\n", encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004255 }
4256 } else {
4257 /*
4258 * fallback for unknown encodings
4259 */
4260 handler = xmlFindCharEncodingHandler((const char *) encoding);
4261 if (handler != NULL) {
4262 xmlSwitchToEncoding(ctxt, handler);
4263 } else {
4264 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
4265 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4266 ctxt->sax->error(ctxt->userData,
4267 "Unsupported encoding %s\n", encoding);
4268 }
4269 }
4270 }
4271 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004272}
4273
4274/************************************************************************
4275 * *
4276 * Progressive parsing interfaces *
4277 * *
4278 ************************************************************************/
4279
4280/**
 * htmlParseLookupSequence:
 * @ctxt:  an HTML parser context
 * @first:  the first char to lookup
 * @next:  the next char to lookup or zero
 * @third:  the next char to lookup or zero
 * @iscomment:  flag to force checking inside comments; when zero, bytes
 *              between "<!--" and "-->" are skipped during the search
 *
 * Try to find if a sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * The scan starts at the current input position, or at ctxt->checkIndex
 * when that is further ahead, so bytes examined by a previous failed call
 * are not rescanned.
 * Side effects on ctxt->checkIndex: it is reset to 0 when the sequence is
 * found, and set to the position where scanning stopped when the end of
 * the buffer is reached without a match. The early -1 return taken while
 * inside a comment leaves it unchanged.
 * This function DOES change the state of the parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * Returns the offset of the sequence relative to the current parsing
 * point if the full sequence is available, -1 otherwise.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004298static int
Owen Taylor3473f882001-02-23 17:55:21 +00004299htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004300 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004301 int base, len;
4302 htmlParserInputPtr in;
4303 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004304 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004305
4306 in = ctxt->input;
4307 if (in == NULL) return(-1);
 /* base is an index into the buffer, starting at the current position */
4308 base = in->cur - in->base;
4309 if (base < 0) return(-1);
 /* resume where a previous unsuccessful lookup stopped */
4310 if (ctxt->checkIndex > base)
4311 base = ctxt->checkIndex;
 /* pick the byte array and its length depending on the kind of input */
4312 if (in->buf == NULL) {
4313 buf = in->base;
4314 len = in->length;
4315 } else {
4316 buf = in->buf->buffer->content;
4317 len = in->buf->buffer->use;
4318 }
4319 /* take into account the sequence length */
4320 if (third) len -= 2;
4321 else if (next) len --;
4322 for (;base < len;base++) {
 /* comment opening is only tracked when the caller did not ask to look inside comments */
William M. Brackc1939562003-08-05 15:52:22 +00004323 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004324 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4325 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4326 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004327 /* do not increment past <! - some people use <!--> */
4328 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004329 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004330 }
4331 if (incomment) {
 /* not enough bytes left to hold "-->": give up without touching checkIndex */
William M. Brack4a557d92003-07-29 04:28:04 +00004332 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004333 return(-1);
4334 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4335 (buf[base + 2] == '>')) {
4336 incomment = 0;
4337 base += 2;
4338 }
 /* bytes inside (or closing) a comment never match the sequence */
4339 continue;
4340 }
Owen Taylor3473f882001-02-23 17:55:21 +00004341 if (buf[base] == first) {
4342 if (third != 0) {
4343 if ((buf[base + 1] != next) ||
4344 (buf[base + 2] != third)) continue;
4345 } else if (next != 0) {
4346 if (buf[base + 1] != next) continue;
4347 }
 /* match found: the next lookup must start afresh */
4348 ctxt->checkIndex = 0;
4349#ifdef DEBUG_PUSH
4350 if (next == 0)
4351 xmlGenericError(xmlGenericErrorContext,
4352 "HPP: lookup '%c' found at %d\n",
4353 first, base);
4354 else if (third == 0)
4355 xmlGenericError(xmlGenericErrorContext,
4356 "HPP: lookup '%c%c' found at %d\n",
4357 first, next, base);
4358 else
4359 xmlGenericError(xmlGenericErrorContext,
4360 "HPP: lookup '%c%c%c' found at %d\n",
4361 first, next, third, base);
4362#endif
 /* convert the buffer index back to an offset from the current position */
4363 return(base - (in->cur - in->base));
4364 }
4365 }
 /* no match in the available data: remember how far the scan went */
4366 ctxt->checkIndex = base;
4367#ifdef DEBUG_PUSH
4368 if (next == 0)
4369 xmlGenericError(xmlGenericErrorContext,
4370 "HPP: lookup '%c' failed\n", first);
4371 else if (third == 0)
4372 xmlGenericError(xmlGenericErrorContext,
4373 "HPP: lookup '%c%c' failed\n", first, next);
4374 else
4375 xmlGenericError(xmlGenericErrorContext,
4376 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4377#endif
4378 return(-1);
4379}
4380
4381/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.
 * This is the push-parser driver: it loops over the bytes currently
 * available in ctxt->input and dispatches on ctxt->instate, calling the
 * htmlParse* routine matching the construct found at the current
 * position. When @terminate is zero, a construct is only handed to its
 * parser once htmlParseLookupSequence() has found its closing delimiter
 * in the buffered data; otherwise the function leaves through the 'done'
 * label and waits for more input. When @terminate is set and no input is
 * left, htmlAutoCloseOnEnd() is called and, if the name stack is empty,
 * the SAX endDocument callback is issued.
 *
 * Returns zero if no parsing was possible.
 * NOTE(review): the local 'ret' is initialised to 0 and is not assigned
 * anywhere else in the statements visible in this function, so 0 appears
 * to be the only value ever returned.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004390static int
Owen Taylor3473f882001-02-23 17:55:21 +00004391htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4392 int ret = 0;
4393 htmlParserInputPtr in;
4394 int avail = 0;
4395 xmlChar cur, next;
4396
 /* debug trace of the state the parser is in when entering */
4397#ifdef DEBUG_PUSH
4398 switch (ctxt->instate) {
4399 case XML_PARSER_EOF:
4400 xmlGenericError(xmlGenericErrorContext,
4401 "HPP: try EOF\n"); break;
4402 case XML_PARSER_START:
4403 xmlGenericError(xmlGenericErrorContext,
4404 "HPP: try START\n"); break;
4405 case XML_PARSER_MISC:
4406 xmlGenericError(xmlGenericErrorContext,
4407 "HPP: try MISC\n");break;
4408 case XML_PARSER_COMMENT:
4409 xmlGenericError(xmlGenericErrorContext,
4410 "HPP: try COMMENT\n");break;
4411 case XML_PARSER_PROLOG:
4412 xmlGenericError(xmlGenericErrorContext,
4413 "HPP: try PROLOG\n");break;
4414 case XML_PARSER_START_TAG:
4415 xmlGenericError(xmlGenericErrorContext,
4416 "HPP: try START_TAG\n");break;
4417 case XML_PARSER_CONTENT:
4418 xmlGenericError(xmlGenericErrorContext,
4419 "HPP: try CONTENT\n");break;
4420 case XML_PARSER_CDATA_SECTION:
4421 xmlGenericError(xmlGenericErrorContext,
4422 "HPP: try CDATA_SECTION\n");break;
4423 case XML_PARSER_END_TAG:
4424 xmlGenericError(xmlGenericErrorContext,
4425 "HPP: try END_TAG\n");break;
4426 case XML_PARSER_ENTITY_DECL:
4427 xmlGenericError(xmlGenericErrorContext,
4428 "HPP: try ENTITY_DECL\n");break;
4429 case XML_PARSER_ENTITY_VALUE:
4430 xmlGenericError(xmlGenericErrorContext,
4431 "HPP: try ENTITY_VALUE\n");break;
4432 case XML_PARSER_ATTRIBUTE_VALUE:
4433 xmlGenericError(xmlGenericErrorContext,
4434 "HPP: try ATTRIBUTE_VALUE\n");break;
4435 case XML_PARSER_DTD:
4436 xmlGenericError(xmlGenericErrorContext,
4437 "HPP: try DTD\n");break;
4438 case XML_PARSER_EPILOG:
4439 xmlGenericError(xmlGenericErrorContext,
4440 "HPP: try EPILOG\n");break;
4441 case XML_PARSER_PI:
4442 xmlGenericError(xmlGenericErrorContext,
4443 "HPP: try PI\n");break;
4444 case XML_PARSER_SYSTEM_LITERAL:
4445 xmlGenericError(xmlGenericErrorContext,
4446 "HPP: try SYSTEM_LITERAL\n");break;
4447 }
4448#endif
4449
 /* main loop: each iteration performs one step of the state machine */
4450 while (1) {
4451
4452 in = ctxt->input;
4453 if (in == NULL) break;
 /* number of bytes not yet consumed, for a plain input or a buffered one */
4454 if (in->buf == NULL)
4455 avail = in->length - (in->cur - in->base);
4456 else
4457 avail = in->buf->buffer->use - (in->cur - in->base);
 /* last chunk and nothing left: close what is still open and end the document */
4458 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004459 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004460 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4461 /*
4462 * SAX: end of the document processing.
4463 */
4464 ctxt->instate = XML_PARSER_EOF;
4465 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4466 ctxt->sax->endDocument(ctxt->userData);
4467 }
4468 }
4469 if (avail < 1)
4470 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004471 cur = in->cur[0];
 /* a zero byte is skipped here and never reaches the per-state handlers */
4472 if (cur == 0) {
4473 SKIP(1);
4474 continue;
4475 }
4476
Owen Taylor3473f882001-02-23 17:55:21 +00004477 switch (ctxt->instate) {
4478 case XML_PARSER_EOF:
4479 /*
4480 * Document parsing is done !
4481 */
4482 goto done;
4483 case XML_PARSER_START:
4484 /*
4485 * Very first chars read from the document flow.
4486 */
4487 cur = in->cur[0];
4488 if (IS_BLANK(cur)) {
4489 SKIP_BLANKS;
 /* blanks were consumed: refresh the count of available bytes */
4490 if (in->buf == NULL)
4491 avail = in->length - (in->cur - in->base);
4492 else
4493 avail = in->buf->buffer->use - (in->cur - in->base);
4494 }
4495 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4496 ctxt->sax->setDocumentLocator(ctxt->userData,
4497 &xmlDefaultSAXLocator);
4498 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4499 (!ctxt->disableSAX))
4500 ctxt->sax->startDocument(ctxt->userData);
4501
4502 cur = in->cur[0];
4503 next = in->cur[1];
 /* case-insensitive test for "<!DOCTYPE" (UPP presumably upper-cases the byte at that offset; macro defined elsewhere) */
4504 if ((cur == '<') && (next == '!') &&
4505 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4506 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4507 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4508 (UPP(8) == 'E')) {
 /* wait until the closing '>' has arrived unless this is the last chunk */
4509 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004510 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004511 goto done;
4512#ifdef DEBUG_PUSH
4513 xmlGenericError(xmlGenericErrorContext,
4514 "HPP: Parsing internal subset\n");
4515#endif
4516 htmlParseDocTypeDecl(ctxt);
4517 ctxt->instate = XML_PARSER_PROLOG;
4518#ifdef DEBUG_PUSH
4519 xmlGenericError(xmlGenericErrorContext,
4520 "HPP: entering PROLOG\n");
4521#endif
4522 } else {
4523 ctxt->instate = XML_PARSER_MISC;
4524 }
 /* NOTE(review): this trace is emitted on both branches above, including the one that entered PROLOG */
4525#ifdef DEBUG_PUSH
4526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: entering MISC\n");
4528#endif
4529 break;
4530 case XML_PARSER_MISC:
 /* before the DOCTYPE: accept blanks, comments, a DOCTYPE, else go to START_TAG */
4531 SKIP_BLANKS;
4532 if (in->buf == NULL)
4533 avail = in->length - (in->cur - in->base);
4534 else
4535 avail = in->buf->buffer->use - (in->cur - in->base);
4536 if (avail < 2)
4537 goto done;
4538 cur = in->cur[0];
4539 next = in->cur[1];
4540 if ((cur == '<') && (next == '!') &&
4541 (in->cur[2] == '-') && (in->cur[3] == '-')) {
 /* the lookup is asked to search inside the comment for its "-->" terminator */
4542 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004543 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004544 goto done;
4545#ifdef DEBUG_PUSH
4546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: Parsing Comment\n");
4548#endif
4549 htmlParseComment(ctxt);
4550 ctxt->instate = XML_PARSER_MISC;
4551 } else if ((cur == '<') && (next == '!') &&
4552 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4553 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4554 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4555 (UPP(8) == 'E')) {
4556 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004557 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004558 goto done;
4559#ifdef DEBUG_PUSH
4560 xmlGenericError(xmlGenericErrorContext,
4561 "HPP: Parsing internal subset\n");
4562#endif
4563 htmlParseDocTypeDecl(ctxt);
4564 ctxt->instate = XML_PARSER_PROLOG;
4565#ifdef DEBUG_PUSH
4566 xmlGenericError(xmlGenericErrorContext,
4567 "HPP: entering PROLOG\n");
4568#endif
 /* "<!" seen but fewer than 9 bytes buffered: too early to tell a DOCTYPE apart, wait */
4569 } else if ((cur == '<') && (next == '!') &&
4570 (avail < 9)) {
4571 goto done;
4572 } else {
4573 ctxt->instate = XML_PARSER_START_TAG;
4574#ifdef DEBUG_PUSH
4575 xmlGenericError(xmlGenericErrorContext,
4576 "HPP: entering START_TAG\n");
4577#endif
4578 }
4579 break;
4580 case XML_PARSER_PROLOG:
 /* after the DOCTYPE: accept blanks and comments, else go to START_TAG */
4581 SKIP_BLANKS;
4582 if (in->buf == NULL)
4583 avail = in->length - (in->cur - in->base);
4584 else
4585 avail = in->buf->buffer->use - (in->cur - in->base);
4586 if (avail < 2)
4587 goto done;
4588 cur = in->cur[0];
4589 next = in->cur[1];
4590 if ((cur == '<') && (next == '!') &&
4591 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4592 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004593 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004594 goto done;
4595#ifdef DEBUG_PUSH
4596 xmlGenericError(xmlGenericErrorContext,
4597 "HPP: Parsing Comment\n");
4598#endif
4599 htmlParseComment(ctxt);
4600 ctxt->instate = XML_PARSER_PROLOG;
 /* "<!" seen but fewer than 4 bytes buffered: cannot yet tell whether it is a comment, wait */
4601 } else if ((cur == '<') && (next == '!') &&
4602 (avail < 4)) {
4603 goto done;
4604 } else {
4605 ctxt->instate = XML_PARSER_START_TAG;
4606#ifdef DEBUG_PUSH
4607 xmlGenericError(xmlGenericErrorContext,
4608 "HPP: entering START_TAG\n");
4609#endif
4610 }
4611 break;
4612 case XML_PARSER_EPILOG:
 /* entered from END_TAG once the name stack is empty: only blanks and comments are accepted */
4613 if (in->buf == NULL)
4614 avail = in->length - (in->cur - in->base);
4615 else
4616 avail = in->buf->buffer->use - (in->cur - in->base);
4617 if (avail < 1)
4618 goto done;
4619 cur = in->cur[0];
4620 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004621 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004622 goto done;
4623 }
4624 if (avail < 2)
4625 goto done;
4626 next = in->cur[1];
4627 if ((cur == '<') && (next == '!') &&
4628 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4629 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004630 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004631 goto done;
4632#ifdef DEBUG_PUSH
4633 xmlGenericError(xmlGenericErrorContext,
4634 "HPP: Parsing Comment\n");
4635#endif
4636 htmlParseComment(ctxt);
4637 ctxt->instate = XML_PARSER_EPILOG;
4638 } else if ((cur == '<') && (next == '!') &&
4639 (avail < 4)) {
4640 goto done;
4641 } else {
 /* anything else in the epilog is an error: flag it and end the document */
4642 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004643 ctxt->wellFormed = 0;
4644 ctxt->instate = XML_PARSER_EOF;
4645#ifdef DEBUG_PUSH
4646 xmlGenericError(xmlGenericErrorContext,
4647 "HPP: entering EOF\n");
4648#endif
4649 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4650 ctxt->sax->endDocument(ctxt->userData);
4651 goto done;
4652 }
4653 break;
4654 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004655 const xmlChar *name, *oldname;
 /* depth and oldname snapshot the name stack so that a failed start tag can be detected below */
Owen Taylor3473f882001-02-23 17:55:21 +00004656 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004657 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004658
4659 if (avail < 2)
4660 goto done;
4661 cur = in->cur[0];
4662 if (cur != '<') {
4663 ctxt->instate = XML_PARSER_CONTENT;
4664#ifdef DEBUG_PUSH
4665 xmlGenericError(xmlGenericErrorContext,
4666 "HPP: entering CONTENT\n");
4667#endif
4668 break;
4669 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004670 if (in->cur[1] == '/') {
4671 ctxt->instate = XML_PARSER_END_TAG;
4672 ctxt->checkIndex = 0;
4673#ifdef DEBUG_PUSH
4674 xmlGenericError(xmlGenericErrorContext,
4675 "HPP: entering END_TAG\n");
4676#endif
4677 break;
4678 }
 /* the whole tag, up to '>', must be buffered before it is parsed */
Owen Taylor3473f882001-02-23 17:55:21 +00004679 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004680 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004681 goto done;
4682
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004683 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004684 htmlParseStartTag(ctxt);
4685 name = ctxt->name;
4686#ifdef DEBUG
4687 if (oldname == NULL)
4688 xmlGenericError(xmlGenericErrorContext,
4689 "Start of element %s\n", name);
4690 else if (name == NULL)
4691 xmlGenericError(xmlGenericErrorContext,
4692 "Start of element failed, was %s\n",
4693 oldname);
4694 else
4695 xmlGenericError(xmlGenericErrorContext,
4696 "Start of element %s, was %s\n",
4697 name, oldname);
4698#endif
 /* name stack unchanged, or no current name: no element was opened; step over a '>' if present and retry */
4699 if (((depth == ctxt->nameNr) &&
4700 (xmlStrEqual(oldname, ctxt->name))) ||
4701 (name == NULL)) {
4702 if (CUR == '>')
4703 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004704 break;
4705 }
Owen Taylor3473f882001-02-23 17:55:21 +00004706
4707 /*
4708 * Lookup the info for that element.
4709 */
4710 info = htmlTagLookup(name);
4711 if (info == NULL) {
4712 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4713 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4714 name);
4715 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004716 }
4717
4718 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004719 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004720 */
4721 if ((CUR == '/') && (NXT(1) == '>')) {
4722 SKIP(2);
4723 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4724 ctxt->sax->endElement(ctxt->userData, name);
4725 oldname = htmlnamePop(ctxt);
4726#ifdef DEBUG
4727 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4728 oldname);
4729#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004730 ctxt->instate = XML_PARSER_CONTENT;
4731#ifdef DEBUG_PUSH
4732 xmlGenericError(xmlGenericErrorContext,
4733 "HPP: entering CONTENT\n");
4734#endif
4735 break;
4736 }
4737
4738 if (CUR == '>') {
4739 NEXT;
4740 } else {
4741 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4742 ctxt->sax->error(ctxt->userData,
4743 "Couldn't find end of Start Tag %s\n",
4744 name);
4745 ctxt->wellFormed = 0;
4746
4747 /*
4748 * end of parsing of this node.
4749 */
4750 if (xmlStrEqual(name, ctxt->name)) {
4751 nodePop(ctxt);
4752 oldname = htmlnamePop(ctxt);
4753#ifdef DEBUG
4754 xmlGenericError(xmlGenericErrorContext,
4755 "End of start tag problem: popping out %s\n", oldname);
4756#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004757 }
4758
4759 ctxt->instate = XML_PARSER_CONTENT;
4760#ifdef DEBUG_PUSH
4761 xmlGenericError(xmlGenericErrorContext,
4762 "HPP: entering CONTENT\n");
4763#endif
4764 break;
4765 }
4766
4767 /*
4768 * Check for an Empty Element from DTD definition
4769 */
4770 if ((info != NULL) && (info->empty)) {
4771 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4772 ctxt->sax->endElement(ctxt->userData, name);
4773 oldname = htmlnamePop(ctxt);
4774#ifdef DEBUG
4775 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4776#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004777 }
4778 ctxt->instate = XML_PARSER_CONTENT;
4779#ifdef DEBUG_PUSH
4780 xmlGenericError(xmlGenericErrorContext,
4781 "HPP: entering CONTENT\n");
4782#endif
4783 break;
4784 }
4785 case XML_PARSER_CONTENT: {
 /* cons records ctxt->nbChars before parsing so that a step which left it unchanged can be detected at the end of this case */
4786 long cons;
4787 /*
4788 * Handle preparsed entities and charRef
4789 */
4790 if (ctxt->token != 0) {
4791 xmlChar chr[2] = { 0 , 0 } ;
4792
4793 chr[0] = (xmlChar) ctxt->token;
4794 htmlCheckParagraph(ctxt);
4795 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4796 ctxt->sax->characters(ctxt->userData, chr, 1);
4797 ctxt->token = 0;
4798 ctxt->checkIndex = 0;
4799 }
 /* a single trailing byte on the last chunk is delivered directly, since the code below needs two bytes */
4800 if ((avail == 1) && (terminate)) {
4801 cur = in->cur[0];
4802 if ((cur != '<') && (cur != '&')) {
4803 if (ctxt->sax != NULL) {
4804 if (IS_BLANK(cur)) {
4805 if (ctxt->sax->ignorableWhitespace != NULL)
4806 ctxt->sax->ignorableWhitespace(
4807 ctxt->userData, &cur, 1);
4808 } else {
4809 htmlCheckParagraph(ctxt);
4810 if (ctxt->sax->characters != NULL)
4811 ctxt->sax->characters(
4812 ctxt->userData, &cur, 1);
4813 }
4814 }
4815 ctxt->token = 0;
4816 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004817 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004818 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004819 }
Owen Taylor3473f882001-02-23 17:55:21 +00004820 }
4821 if (avail < 2)
4822 goto done;
4823 cur = in->cur[0];
4824 next = in->cur[1];
4825 cons = ctxt->nbChars;
4826 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4827 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4828 /*
4829 * Handle SCRIPT/STYLE separately
4830 */
4831 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004832 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004833 goto done;
4834 htmlParseScript(ctxt);
 /* NOTE(review): cur and next were sampled before htmlParseScript() ran, so this tests the bytes at the old position */
4835 if ((cur == '<') && (next == '/')) {
4836 ctxt->instate = XML_PARSER_END_TAG;
4837 ctxt->checkIndex = 0;
4838#ifdef DEBUG_PUSH
4839 xmlGenericError(xmlGenericErrorContext,
4840 "HPP: entering END_TAG\n");
4841#endif
4842 break;
4843 }
4844 } else {
4845 /*
4846 * Sometimes DOCTYPE arrives in the middle of the document
4847 */
4848 if ((cur == '<') && (next == '!') &&
4849 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4850 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4851 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4852 (UPP(8) == 'E')) {
4853 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004854 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004855 goto done;
4856 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4857 ctxt->sax->error(ctxt->userData,
4858 "Misplaced DOCTYPE declaration\n");
4859 ctxt->wellFormed = 0;
4860 htmlParseDocTypeDecl(ctxt);
4861 } else if ((cur == '<') && (next == '!') &&
4862 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4863 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004864 (htmlParseLookupSequence(
4865 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004866 goto done;
4867#ifdef DEBUG_PUSH
4868 xmlGenericError(xmlGenericErrorContext,
4869 "HPP: Parsing Comment\n");
4870#endif
4871 htmlParseComment(ctxt);
4872 ctxt->instate = XML_PARSER_CONTENT;
4873 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4874 goto done;
4875 } else if ((cur == '<') && (next == '/')) {
4876 ctxt->instate = XML_PARSER_END_TAG;
4877 ctxt->checkIndex = 0;
4878#ifdef DEBUG_PUSH
4879 xmlGenericError(xmlGenericErrorContext,
4880 "HPP: entering END_TAG\n");
4881#endif
4882 break;
4883 } else if (cur == '<') {
4884 ctxt->instate = XML_PARSER_START_TAG;
4885 ctxt->checkIndex = 0;
4886#ifdef DEBUG_PUSH
4887 xmlGenericError(xmlGenericErrorContext,
4888 "HPP: entering START_TAG\n");
4889#endif
4890 break;
4891 } else if (cur == '&') {
 /* a reference is only parsed once its terminating ';' is buffered */
4892 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004893 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004894 goto done;
4895#ifdef DEBUG_PUSH
4896 xmlGenericError(xmlGenericErrorContext,
4897 "HPP: Parsing Reference\n");
4898#endif
4899 /* TODO: check generation of subtrees if noent !!! */
4900 htmlParseReference(ctxt);
4901 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004902 /*
4903 * check that the text sequence is complete
4904 * before handing out the data to the parser
4905 * to avoid problems with erroneous end of
4906 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004907 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004908 if ((!terminate) &&
4909 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4910 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004911 ctxt->checkIndex = 0;
4912#ifdef DEBUG_PUSH
4913 xmlGenericError(xmlGenericErrorContext,
4914 "HPP: Parsing char data\n");
4915#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004916 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004917 }
4918 }
 /* nbChars unchanged since the snapshot: report an error when inside a node, then advance with NEXT (presumably to avoid spinning on the same byte) */
4919 if (cons == ctxt->nbChars) {
4920 if (ctxt->node != NULL) {
4921 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4922 ctxt->sax->error(ctxt->userData,
4923 "detected an error in element content\n");
4924 ctxt->wellFormed = 0;
4925 }
4926 NEXT;
4927 break;
4928 }
4929
4930 break;
4931 }
4932 case XML_PARSER_END_TAG:
4933 if (avail < 2)
4934 goto done;
4935 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004936 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004937 goto done;
4938 htmlParseEndTag(ctxt);
 /* an empty name stack after the end tag means the root was closed */
4939 if (ctxt->nameNr == 0) {
4940 ctxt->instate = XML_PARSER_EPILOG;
4941 } else {
4942 ctxt->instate = XML_PARSER_CONTENT;
4943 }
4944 ctxt->checkIndex = 0;
4945#ifdef DEBUG_PUSH
4946 xmlGenericError(xmlGenericErrorContext,
4947 "HPP: entering CONTENT\n");
4948#endif
4949 break;
 /* The remaining cases are treated as internal errors: each one logs a message and resets the state (to CONTENT, or to START_TAG for ATTRIBUTE_VALUE) */
4950 case XML_PARSER_CDATA_SECTION:
4951 xmlGenericError(xmlGenericErrorContext,
4952 "HPP: internal error, state == CDATA\n");
4953 ctxt->instate = XML_PARSER_CONTENT;
4954 ctxt->checkIndex = 0;
4955#ifdef DEBUG_PUSH
4956 xmlGenericError(xmlGenericErrorContext,
4957 "HPP: entering CONTENT\n");
4958#endif
4959 break;
4960 case XML_PARSER_DTD:
4961 xmlGenericError(xmlGenericErrorContext,
4962 "HPP: internal error, state == DTD\n");
4963 ctxt->instate = XML_PARSER_CONTENT;
4964 ctxt->checkIndex = 0;
4965#ifdef DEBUG_PUSH
4966 xmlGenericError(xmlGenericErrorContext,
4967 "HPP: entering CONTENT\n");
4968#endif
4969 break;
4970 case XML_PARSER_COMMENT:
4971 xmlGenericError(xmlGenericErrorContext,
4972 "HPP: internal error, state == COMMENT\n");
4973 ctxt->instate = XML_PARSER_CONTENT;
4974 ctxt->checkIndex = 0;
4975#ifdef DEBUG_PUSH
4976 xmlGenericError(xmlGenericErrorContext,
4977 "HPP: entering CONTENT\n");
4978#endif
4979 break;
4980 case XML_PARSER_PI:
4981 xmlGenericError(xmlGenericErrorContext,
4982 "HPP: internal error, state == PI\n");
4983 ctxt->instate = XML_PARSER_CONTENT;
4984 ctxt->checkIndex = 0;
4985#ifdef DEBUG_PUSH
4986 xmlGenericError(xmlGenericErrorContext,
4987 "HPP: entering CONTENT\n");
4988#endif
4989 break;
4990 case XML_PARSER_ENTITY_DECL:
4991 xmlGenericError(xmlGenericErrorContext,
4992 "HPP: internal error, state == ENTITY_DECL\n");
4993 ctxt->instate = XML_PARSER_CONTENT;
4994 ctxt->checkIndex = 0;
4995#ifdef DEBUG_PUSH
4996 xmlGenericError(xmlGenericErrorContext,
4997 "HPP: entering CONTENT\n");
4998#endif
4999 break;
5000 case XML_PARSER_ENTITY_VALUE:
5001 xmlGenericError(xmlGenericErrorContext,
5002 "HPP: internal error, state == ENTITY_VALUE\n");
5003 ctxt->instate = XML_PARSER_CONTENT;
5004 ctxt->checkIndex = 0;
 /* NOTE(review): the trace below says DTD although the state set above is CONTENT */
5005#ifdef DEBUG_PUSH
5006 xmlGenericError(xmlGenericErrorContext,
5007 "HPP: entering DTD\n");
5008#endif
5009 break;
5010 case XML_PARSER_ATTRIBUTE_VALUE:
5011 xmlGenericError(xmlGenericErrorContext,
5012 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5013 ctxt->instate = XML_PARSER_START_TAG;
5014 ctxt->checkIndex = 0;
5015#ifdef DEBUG_PUSH
5016 xmlGenericError(xmlGenericErrorContext,
5017 "HPP: entering START_TAG\n");
5018#endif
5019 break;
5020 case XML_PARSER_SYSTEM_LITERAL:
5021 xmlGenericError(xmlGenericErrorContext,
5022 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5023 ctxt->instate = XML_PARSER_CONTENT;
5024 ctxt->checkIndex = 0;
5025#ifdef DEBUG_PUSH
5026 xmlGenericError(xmlGenericErrorContext,
5027 "HPP: entering CONTENT\n");
5028#endif
5029 break;
5030 case XML_PARSER_IGNORE:
5031 xmlGenericError(xmlGenericErrorContext,
5032 "HPP: internal error, state == XML_PARSER_IGNORE\n");
5033 ctxt->instate = XML_PARSER_CONTENT;
5034 ctxt->checkIndex = 0;
5035#ifdef DEBUG_PUSH
5036 xmlGenericError(xmlGenericErrorContext,
5037 "HPP: entering CONTENT\n");
5038#endif
5039 break;
 /* NOTE(review): the message below names XML_PARSER_LITERAL while the case label is XML_PARSER_PUBLIC_LITERAL */
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005040 case XML_PARSER_PUBLIC_LITERAL:
5041 xmlGenericError(xmlGenericErrorContext,
5042 "HPP: internal error, state == XML_PARSER_LITERAL\n");
5043 ctxt->instate = XML_PARSER_CONTENT;
5044 ctxt->checkIndex = 0;
5045#ifdef DEBUG_PUSH
5046 xmlGenericError(xmlGenericErrorContext,
5047 "HPP: entering CONTENT\n");
5048#endif
5049 break;
5050
Owen Taylor3473f882001-02-23 17:55:21 +00005051 }
5052 }
 /* common exit: reached when more data is needed, at EOF, or when the loop breaks because there is no input */
5053done:
 /* same end-of-document handling as at the top of the loop, for paths that consumed the last bytes */
5054 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005055 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005056 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5057 /*
5058 * SAX: end of the document processing.
5059 */
5060 ctxt->instate = XML_PARSER_EOF;
5061 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5062 ctxt->sax->endDocument(ctxt->userData);
5063 }
5064 }
 /* when finishing, give a document that has no internal subset the HTML 4.0 Transitional one */
5065 if ((ctxt->myDoc != NULL) &&
5066 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5067 (ctxt->instate == XML_PARSER_EPILOG))) {
5068 xmlDtdPtr dtd;
5069 dtd = xmlGetIntSubset(ctxt->myDoc);
5070 if (dtd == NULL)
5071 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005072 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005073 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5074 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5075 }
5076#ifdef DEBUG_PUSH
5077 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5078#endif
5079 return(ret);
5080}
5081
5082/**
Owen Taylor3473f882001-02-23 17:55:21 +00005083 * htmlParseChunk:
5084 * @ctxt: an XML parser context
5085 * @chunk: an char array
5086 * @size: the size in byte of the chunk
5087 * @terminate: last chunk indicator
5088 *
5089 * Parse a Chunk of memory
5090 *
5091 * Returns zero if no error, the xmlParserErrors otherwise.
5092 */
5093int
5094htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5095 int terminate) {
5096 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5097 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5098 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5099 int cur = ctxt->input->cur - ctxt->input->base;
5100
5101 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5102 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5103 ctxt->input->cur = ctxt->input->base + cur;
5104#ifdef DEBUG_PUSH
5105 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5106#endif
5107
Daniel Veillard14f752c2003-08-09 11:44:50 +00005108#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005109 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5110 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005111#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005112 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005113 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5114 xmlParserInputBufferPtr in = ctxt->input->buf;
5115 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5116 (in->raw != NULL)) {
5117 int nbchars;
5118
5119 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5120 if (nbchars < 0) {
5121 xmlGenericError(xmlGenericErrorContext,
5122 "htmlParseChunk: encoder error\n");
5123 return(XML_ERR_INVALID_ENCODING);
5124 }
5125 }
5126 }
Owen Taylor3473f882001-02-23 17:55:21 +00005127 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005128 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005129 if (terminate) {
5130 if ((ctxt->instate != XML_PARSER_EOF) &&
5131 (ctxt->instate != XML_PARSER_EPILOG) &&
5132 (ctxt->instate != XML_PARSER_MISC)) {
5133 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005134 ctxt->wellFormed = 0;
5135 }
5136 if (ctxt->instate != XML_PARSER_EOF) {
5137 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5138 ctxt->sax->endDocument(ctxt->userData);
5139 }
5140 ctxt->instate = XML_PARSER_EOF;
5141 }
5142 return((xmlParserErrors) ctxt->errNo);
5143}
5144
5145/************************************************************************
5146 * *
5147 * User entry points *
5148 * *
5149 ************************************************************************/
5150
5151/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005152 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005153 * @sax: a SAX handler
5154 * @user_data: The user data returned on SAX callbacks
5155 * @chunk: a pointer to an array of chars
5156 * @size: number of chars in the array
5157 * @filename: an optional file name or URI
5158 * @enc: an optional encoding
5159 *
5160 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005161 * The value of @filename is used for fetching external entities
5162 * and error/warning reports.
5163 *
5164 * Returns the new parser context or NULL
5165 */
5166htmlParserCtxtPtr
5167htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5168 const char *chunk, int size, const char *filename,
5169 xmlCharEncoding enc) {
5170 htmlParserCtxtPtr ctxt;
5171 htmlParserInputPtr inputStream;
5172 xmlParserInputBufferPtr buf;
5173
Daniel Veillardd0463562001-10-13 09:15:48 +00005174 xmlInitParser();
5175
Owen Taylor3473f882001-02-23 17:55:21 +00005176 buf = xmlAllocParserInputBuffer(enc);
5177 if (buf == NULL) return(NULL);
5178
5179 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5180 if (ctxt == NULL) {
5181 xmlFree(buf);
5182 return(NULL);
5183 }
5184 memset(ctxt, 0, sizeof(htmlParserCtxt));
5185 htmlInitParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005186 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5187 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005188 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005189 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005190 xmlFree(ctxt->sax);
5191 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5192 if (ctxt->sax == NULL) {
5193 xmlFree(buf);
5194 xmlFree(ctxt);
5195 return(NULL);
5196 }
5197 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5198 if (user_data != NULL)
5199 ctxt->userData = user_data;
5200 }
5201 if (filename == NULL) {
5202 ctxt->directory = NULL;
5203 } else {
5204 ctxt->directory = xmlParserGetDirectory(filename);
5205 }
5206
5207 inputStream = htmlNewInputStream(ctxt);
5208 if (inputStream == NULL) {
5209 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005210 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005211 return(NULL);
5212 }
5213
5214 if (filename == NULL)
5215 inputStream->filename = NULL;
5216 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005217 inputStream->filename = (char *)
5218 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005219 inputStream->buf = buf;
5220 inputStream->base = inputStream->buf->buffer->content;
5221 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005222 inputStream->end =
5223 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005224
5225 inputPush(ctxt, inputStream);
5226
5227 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5228 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005229 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5230 int cur = ctxt->input->cur - ctxt->input->base;
5231
Owen Taylor3473f882001-02-23 17:55:21 +00005232 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005233
5234 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5235 ctxt->input->cur = ctxt->input->base + cur;
5236 ctxt->input->end =
5237 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005238#ifdef DEBUG_PUSH
5239 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5240#endif
5241 }
5242
5243 return(ctxt);
5244}
5245
5246/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005247 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005248 * @cur: a pointer to an array of xmlChar
5249 * @encoding: a free form C string describing the HTML document encoding, or NULL
5250 * @sax: the SAX handler block
5251 * @userData: if using SAX, this pointer will be provided on callbacks.
5252 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005253 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5254 * to handle parse events. If sax is NULL, fallback to the default DOM
5255 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005256 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005257 * Returns the resulting document tree unless SAX is NULL or the document is
5258 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005259 */
5260
5261htmlDocPtr
5262htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5263 htmlDocPtr ret;
5264 htmlParserCtxtPtr ctxt;
5265
Daniel Veillardd0463562001-10-13 09:15:48 +00005266 xmlInitParser();
5267
Owen Taylor3473f882001-02-23 17:55:21 +00005268 if (cur == NULL) return(NULL);
5269
5270
5271 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5272 if (ctxt == NULL) return(NULL);
5273 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005274 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005275 ctxt->sax = sax;
5276 ctxt->userData = userData;
5277 }
5278
5279 htmlParseDocument(ctxt);
5280 ret = ctxt->myDoc;
5281 if (sax != NULL) {
5282 ctxt->sax = NULL;
5283 ctxt->userData = NULL;
5284 }
5285 htmlFreeParserCtxt(ctxt);
5286
5287 return(ret);
5288}
5289
5290/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005291 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005292 * @cur: a pointer to an array of xmlChar
5293 * @encoding: a free form C string describing the HTML document encoding, or NULL
5294 *
5295 * parse an HTML in-memory document and build a tree.
5296 *
5297 * Returns the resulting document tree
5298 */
5299
5300htmlDocPtr
5301htmlParseDoc(xmlChar *cur, const char *encoding) {
5302 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5303}
5304
5305
5306/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005307 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005308 * @filename: the filename
5309 * @encoding: a free form C string describing the HTML document encoding, or NULL
5310 *
5311 * Create a parser context for a file content.
5312 * Automatic support for ZLIB/Compress compressed document is provided
5313 * by default if found at compile-time.
5314 *
5315 * Returns the new parser context or NULL
5316 */
5317htmlParserCtxtPtr
5318htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5319{
5320 htmlParserCtxtPtr ctxt;
5321 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005322 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005323 /* htmlCharEncoding enc; */
5324 xmlChar *content, *content_line = (xmlChar *) "charset=";
5325
Owen Taylor3473f882001-02-23 17:55:21 +00005326 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5327 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005328 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005329 return(NULL);
5330 }
5331 memset(ctxt, 0, sizeof(htmlParserCtxt));
5332 htmlInitParserCtxt(ctxt);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005333 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5334 if (canonicFilename == NULL) {
5335 if (xmlDefaultSAXHandler.error != NULL) {
5336 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5337 }
Daniel Veillard104caa32003-05-13 22:54:05 +00005338 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005339 return(NULL);
5340 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005341
5342 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5343 xmlFree(canonicFilename);
5344 if (inputStream == NULL) {
5345 xmlFreeParserCtxt(ctxt);
5346 return(NULL);
5347 }
Owen Taylor3473f882001-02-23 17:55:21 +00005348
5349 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005350
Owen Taylor3473f882001-02-23 17:55:21 +00005351 /* set encoding */
5352 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005353 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005354 if (content) {
5355 strcpy ((char *)content, (char *)content_line);
5356 strcat ((char *)content, (char *)encoding);
5357 htmlCheckEncoding (ctxt, content);
5358 xmlFree (content);
5359 }
5360 }
5361
5362 return(ctxt);
5363}
5364
5365/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005366 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005367 * @filename: the filename
5368 * @encoding: a free form C string describing the HTML document encoding, or NULL
5369 * @sax: the SAX handler block
5370 * @userData: if using SAX, this pointer will be provided on callbacks.
5371 *
5372 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5373 * compressed document is provided by default if found at compile-time.
5374 * It use the given SAX function block to handle the parsing callback.
5375 * If sax is NULL, fallback to the default DOM tree building routines.
5376 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005377 * Returns the resulting document tree unless SAX is NULL or the document is
5378 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005379 */
5380
5381htmlDocPtr
5382htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5383 void *userData) {
5384 htmlDocPtr ret;
5385 htmlParserCtxtPtr ctxt;
5386 htmlSAXHandlerPtr oldsax = NULL;
5387
Daniel Veillardd0463562001-10-13 09:15:48 +00005388 xmlInitParser();
5389
Owen Taylor3473f882001-02-23 17:55:21 +00005390 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5391 if (ctxt == NULL) return(NULL);
5392 if (sax != NULL) {
5393 oldsax = ctxt->sax;
5394 ctxt->sax = sax;
5395 ctxt->userData = userData;
5396 }
5397
5398 htmlParseDocument(ctxt);
5399
5400 ret = ctxt->myDoc;
5401 if (sax != NULL) {
5402 ctxt->sax = oldsax;
5403 ctxt->userData = NULL;
5404 }
5405 htmlFreeParserCtxt(ctxt);
5406
5407 return(ret);
5408}
5409
5410/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005411 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005412 * @filename: the filename
5413 * @encoding: a free form C string describing the HTML document encoding, or NULL
5414 *
5415 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5416 * compressed document is provided by default if found at compile-time.
5417 *
5418 * Returns the resulting document tree
5419 */
5420
5421htmlDocPtr
5422htmlParseFile(const char *filename, const char *encoding) {
5423 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5424}
5425
5426/**
5427 * htmlHandleOmittedElem:
5428 * @val: int 0 or 1
5429 *
5430 * Set and return the previous value for handling HTML omitted tags.
5431 *
5432 * Returns the last value for 0 for no handling, 1 for auto insertion.
5433 */
5434
5435int
5436htmlHandleOmittedElem(int val) {
5437 int old = htmlOmittedDefaultValue;
5438
5439 htmlOmittedDefaultValue = val;
5440 return(old);
5441}
5442
Daniel Veillard930dfb62003-02-05 10:17:38 +00005443/**
5444 * htmlElementAllowedHere:
5445 * @parent: HTML parent element
5446 * @elt: HTML element
5447 *
5448 * Checks whether an HTML element may be a direct child of a parent element.
5449 * Note - doesn't check for deprecated elements
5450 *
5451 * Returns 1 if allowed; 0 otherwise.
5452 */
5453int
5454htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5455 const char** p ;
5456
5457 if ( ! elt || ! parent || ! parent->subelts )
5458 return 0 ;
5459
5460 for ( p = parent->subelts; *p; ++p )
5461 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5462 return 1 ;
5463
5464 return 0 ;
5465}
5466/**
5467 * htmlElementStatusHere:
5468 * @parent: HTML parent element
5469 * @elt: HTML element
5470 *
5471 * Checks whether an HTML element may be a direct child of a parent element.
5472 * and if so whether it is valid or deprecated.
5473 *
5474 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5475 */
5476htmlStatus
5477htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5478 if ( ! parent || ! elt )
5479 return HTML_INVALID ;
5480 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5481 return HTML_INVALID ;
5482
5483 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5484}
5485/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005486 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005487 * @elt: HTML element
5488 * @attr: HTML attribute
5489 * @legacy: whether to allow deprecated attributes
5490 *
5491 * Checks whether an attribute is valid for an element
5492 * Has full knowledge of Required and Deprecated attributes
5493 *
5494 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5495 */
5496htmlStatus
5497htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5498 const char** p ;
5499
5500 if ( !elt || ! attr )
5501 return HTML_INVALID ;
5502
5503 if ( elt->attrs_req )
5504 for ( p = elt->attrs_req; *p; ++p)
5505 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5506 return HTML_REQUIRED ;
5507
5508 if ( elt->attrs_opt )
5509 for ( p = elt->attrs_opt; *p; ++p)
5510 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5511 return HTML_VALID ;
5512
5513 if ( legacy && elt->attrs_depr )
5514 for ( p = elt->attrs_depr; *p; ++p)
5515 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5516 return HTML_DEPRECATED ;
5517
5518 return HTML_INVALID ;
5519}
5520/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005521 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005522 * @node: an htmlNodePtr in a tree
5523 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005524 * for Element nodes)
5525 *
5526 * Checks whether the tree node is valid. Experimental (the author
5527 * only uses the HTML enhancements in a SAX parser)
5528 *
5529 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5530 * legacy allowed) or htmlElementStatusHere (otherwise).
5531 * for Attribute nodes, a return from htmlAttrAllowed
5532 * for other nodes, HTML_NA (no checks performed)
5533 */
5534htmlStatus
5535htmlNodeStatus(const htmlNodePtr node, int legacy) {
5536 if ( ! node )
5537 return HTML_INVALID ;
5538
5539 switch ( node->type ) {
5540 case XML_ELEMENT_NODE:
5541 return legacy
5542 ? ( htmlElementAllowedHere (
5543 htmlTagLookup(node->parent->name) , node->name
5544 ) ? HTML_VALID : HTML_INVALID )
5545 : htmlElementStatusHere(
5546 htmlTagLookup(node->parent->name) ,
5547 htmlTagLookup(node->name) )
5548 ;
5549 case XML_ATTRIBUTE_NODE:
5550 return htmlAttrAllowed(
5551 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5552 default: return HTML_NA ;
5553 }
5554}
Daniel Veillard9475a352003-09-26 12:47:50 +00005555/************************************************************************
5556 * *
5557 * New set (2.6.0) of simpler and more flexible APIs *
5558 * *
5559 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded. The body is wrapped in do { } while (0) so the macro
 * behaves as a single statement, including inside an if/else.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
5571
5572/**
5573 * htmlCtxtReset:
5574 * @ctxt: an XML parser context
5575 *
5576 * Reset a parser context
5577 */
5578void
5579htmlCtxtReset(htmlParserCtxtPtr ctxt)
5580{
5581 xmlParserInputPtr input;
5582 xmlDictPtr dict = ctxt->dict;
5583
5584 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5585 xmlFreeInputStream(input);
5586 }
5587 ctxt->inputNr = 0;
5588 ctxt->input = NULL;
5589
5590 ctxt->spaceNr = 0;
5591 ctxt->spaceTab[0] = -1;
5592 ctxt->space = &ctxt->spaceTab[0];
5593
5594
5595 ctxt->nodeNr = 0;
5596 ctxt->node = NULL;
5597
5598 ctxt->nameNr = 0;
5599 ctxt->name = NULL;
5600
5601 DICT_FREE(ctxt->version);
5602 ctxt->version = NULL;
5603 DICT_FREE(ctxt->encoding);
5604 ctxt->encoding = NULL;
5605 DICT_FREE(ctxt->directory);
5606 ctxt->directory = NULL;
5607 DICT_FREE(ctxt->extSubURI);
5608 ctxt->extSubURI = NULL;
5609 DICT_FREE(ctxt->extSubSystem);
5610 ctxt->extSubSystem = NULL;
5611 if (ctxt->myDoc != NULL)
5612 xmlFreeDoc(ctxt->myDoc);
5613 ctxt->myDoc = NULL;
5614
5615 ctxt->standalone = -1;
5616 ctxt->hasExternalSubset = 0;
5617 ctxt->hasPErefs = 0;
5618 ctxt->html = 1;
5619 ctxt->external = 0;
5620 ctxt->instate = XML_PARSER_START;
5621 ctxt->token = 0;
5622
5623 ctxt->wellFormed = 1;
5624 ctxt->nsWellFormed = 1;
5625 ctxt->valid = 1;
5626 ctxt->vctxt.userData = ctxt;
5627 ctxt->vctxt.error = xmlParserValidityError;
5628 ctxt->vctxt.warning = xmlParserValidityWarning;
5629 ctxt->record_info = 0;
5630 ctxt->nbChars = 0;
5631 ctxt->checkIndex = 0;
5632 ctxt->inSubset = 0;
5633 ctxt->errNo = XML_ERR_OK;
5634 ctxt->depth = 0;
5635 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5636 ctxt->catalogs = NULL;
5637 xmlInitNodeInfoSeq(&ctxt->node_seq);
5638
5639 if (ctxt->attsDefault != NULL) {
5640 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5641 ctxt->attsDefault = NULL;
5642 }
5643 if (ctxt->attsSpecial != NULL) {
5644 xmlHashFree(ctxt->attsSpecial, NULL);
5645 ctxt->attsSpecial = NULL;
5646 }
5647}
5648
5649/**
5650 * htmlCtxtUseOptions:
5651 * @ctxt: an HTML parser context
5652 * @options: a combination of htmlParserOption(s)
5653 *
5654 * Applies the options to the parser context
5655 *
5656 * Returns 0 in case of success, the set of unknown or unimplemented options
5657 * in case of error.
5658 */
5659int
5660htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5661{
5662 if (options & HTML_PARSE_NOWARNING) {
5663 ctxt->sax->warning = NULL;
5664 options -= XML_PARSE_NOWARNING;
5665 }
5666 if (options & HTML_PARSE_NOERROR) {
5667 ctxt->sax->error = NULL;
5668 ctxt->sax->fatalError = NULL;
5669 options -= XML_PARSE_NOERROR;
5670 }
5671 if (options & HTML_PARSE_PEDANTIC) {
5672 ctxt->pedantic = 1;
5673 options -= XML_PARSE_PEDANTIC;
5674 } else
5675 ctxt->pedantic = 0;
5676 if (options & XML_PARSE_NOBLANKS) {
5677 ctxt->keepBlanks = 0;
5678 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5679 options -= XML_PARSE_NOBLANKS;
5680 } else
5681 ctxt->keepBlanks = 1;
5682 ctxt->dictNames = 0;
5683 return (options);
5684}
5685
5686/**
5687 * htmlDoRead:
5688 * @ctxt: an HTML parser context
5689 * @URL: the base URL to use for the document
5690 * @encoding: the document encoding, or NULL
5691 * @options: a combination of htmlParserOption(s)
5692 * @reuse: keep the context for reuse
5693 *
5694 * Common front-end for the htmlRead functions
5695 *
5696 * Returns the resulting document tree or NULL
5697 */
5698static htmlDocPtr
5699htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5700 int options, int reuse)
5701{
5702 htmlDocPtr ret;
5703
5704 htmlCtxtUseOptions(ctxt, options);
5705 ctxt->html = 1;
5706 if (encoding != NULL) {
5707 xmlCharEncodingHandlerPtr hdlr;
5708
5709 hdlr = xmlFindCharEncodingHandler(encoding);
5710 if (hdlr != NULL)
5711 xmlSwitchToEncoding(ctxt, hdlr);
5712 }
5713 if ((URL != NULL) && (ctxt->input != NULL) &&
5714 (ctxt->input->filename == NULL))
5715 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5716 htmlParseDocument(ctxt);
5717 ret = ctxt->myDoc;
5718 ctxt->myDoc = NULL;
5719 if (!reuse) {
5720 if ((ctxt->dictNames) &&
5721 (ret != NULL) &&
5722 (ret->dict == ctxt->dict))
5723 ctxt->dict = NULL;
5724 xmlFreeParserCtxt(ctxt);
5725 } else {
5726 /* Must duplicate the reference to the dictionary */
5727 if ((ctxt->dictNames) &&
5728 (ret != NULL) &&
5729 (ret->dict == ctxt->dict))
5730 xmlDictReference(ctxt->dict);
5731 }
5732 return (ret);
5733}
5734
5735/**
5736 * htmlReadDoc:
5737 * @cur: a pointer to a zero terminated string
5738 * @URL: the base URL to use for the document
5739 * @encoding: the document encoding, or NULL
5740 * @options: a combination of htmlParserOption(s)
5741 *
5742 * parse an XML in-memory document and build a tree.
5743 *
5744 * Returns the resulting document tree
5745 */
5746htmlDocPtr
5747htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5748{
5749 htmlParserCtxtPtr ctxt;
5750
5751 if (cur == NULL)
5752 return (NULL);
5753
5754 ctxt = xmlCreateDocParserCtxt(cur);
5755 if (ctxt == NULL)
5756 return (NULL);
5757 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5758}
5759
5760/**
5761 * htmlReadFile:
5762 * @filename: a file or URL
5763 * @encoding: the document encoding, or NULL
5764 * @options: a combination of htmlParserOption(s)
5765 *
5766 * parse an XML file from the filesystem or the network.
5767 *
5768 * Returns the resulting document tree
5769 */
5770htmlDocPtr
5771htmlReadFile(const char *filename, const char *encoding, int options)
5772{
5773 htmlParserCtxtPtr ctxt;
5774
5775 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5776 if (ctxt == NULL)
5777 return (NULL);
5778 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5779}
5780
5781/**
5782 * htmlReadMemory:
5783 * @buffer: a pointer to a char array
5784 * @size: the size of the array
5785 * @URL: the base URL to use for the document
5786 * @encoding: the document encoding, or NULL
5787 * @options: a combination of htmlParserOption(s)
5788 *
5789 * parse an XML in-memory document and build a tree.
5790 *
5791 * Returns the resulting document tree
5792 */
5793htmlDocPtr
5794htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5795{
5796 htmlParserCtxtPtr ctxt;
5797
5798 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5799 if (ctxt == NULL)
5800 return (NULL);
5801 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5802}
5803
5804/**
5805 * htmlReadFd:
5806 * @fd: an open file descriptor
5807 * @URL: the base URL to use for the document
5808 * @encoding: the document encoding, or NULL
5809 * @options: a combination of htmlParserOption(s)
5810 *
5811 * parse an XML from a file descriptor and build a tree.
5812 *
5813 * Returns the resulting document tree
5814 */
5815htmlDocPtr
5816htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5817{
5818 htmlParserCtxtPtr ctxt;
5819 xmlParserInputBufferPtr input;
5820 xmlParserInputPtr stream;
5821
5822 if (fd < 0)
5823 return (NULL);
5824
5825 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5826 if (input == NULL)
5827 return (NULL);
5828 ctxt = xmlNewParserCtxt();
5829 if (ctxt == NULL) {
5830 xmlFreeParserInputBuffer(input);
5831 return (NULL);
5832 }
5833 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5834 if (stream == NULL) {
5835 xmlFreeParserInputBuffer(input);
5836 xmlFreeParserCtxt(ctxt);
5837 return (NULL);
5838 }
5839 inputPush(ctxt, stream);
5840 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5841}
5842
5843/**
5844 * htmlReadIO:
5845 * @ioread: an I/O read function
5846 * @ioclose: an I/O close function
5847 * @ioctx: an I/O handler
5848 * @URL: the base URL to use for the document
5849 * @encoding: the document encoding, or NULL
5850 * @options: a combination of htmlParserOption(s)
5851 *
5852 * parse an HTML document from I/O functions and source and build a tree.
5853 *
5854 * Returns the resulting document tree
5855 */
5856htmlDocPtr
5857htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5858 void *ioctx, const char *URL, const char *encoding, int options)
5859{
5860 htmlParserCtxtPtr ctxt;
5861 xmlParserInputBufferPtr input;
5862 xmlParserInputPtr stream;
5863
5864 if (ioread == NULL)
5865 return (NULL);
5866
5867 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5868 XML_CHAR_ENCODING_NONE);
5869 if (input == NULL)
5870 return (NULL);
5871 ctxt = xmlNewParserCtxt();
5872 if (ctxt == NULL) {
5873 xmlFreeParserInputBuffer(input);
5874 return (NULL);
5875 }
5876 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5877 if (stream == NULL) {
5878 xmlFreeParserInputBuffer(input);
5879 xmlFreeParserCtxt(ctxt);
5880 return (NULL);
5881 }
5882 inputPush(ctxt, stream);
5883 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5884}
5885
5886/**
5887 * htmlCtxtReadDoc:
5888 * @ctxt: an HTML parser context
5889 * @cur: a pointer to a zero terminated string
5890 * @URL: the base URL to use for the document
5891 * @encoding: the document encoding, or NULL
5892 * @options: a combination of htmlParserOption(s)
5893 *
5894 * parse an XML in-memory document and build a tree.
5895 * This reuses the existing @ctxt parser context
5896 *
5897 * Returns the resulting document tree
5898 */
5899htmlDocPtr
5900htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5901 const char *URL, const char *encoding, int options)
5902{
5903 xmlParserInputPtr stream;
5904
5905 if (cur == NULL)
5906 return (NULL);
5907 if (ctxt == NULL)
5908 return (NULL);
5909
5910 htmlCtxtReset(ctxt);
5911
5912 stream = xmlNewStringInputStream(ctxt, cur);
5913 if (stream == NULL) {
5914 return (NULL);
5915 }
5916 inputPush(ctxt, stream);
5917 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5918}
5919
5920/**
5921 * htmlCtxtReadFile:
5922 * @ctxt: an HTML parser context
5923 * @filename: a file or URL
5924 * @encoding: the document encoding, or NULL
5925 * @options: a combination of htmlParserOption(s)
5926 *
5927 * parse an XML file from the filesystem or the network.
5928 * This reuses the existing @ctxt parser context
5929 *
5930 * Returns the resulting document tree
5931 */
5932htmlDocPtr
5933htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5934 const char *encoding, int options)
5935{
5936 xmlParserInputPtr stream;
5937
5938 if (filename == NULL)
5939 return (NULL);
5940 if (ctxt == NULL)
5941 return (NULL);
5942
5943 htmlCtxtReset(ctxt);
5944
5945 stream = xmlNewInputFromFile(ctxt, filename);
5946 if (stream == NULL) {
5947 return (NULL);
5948 }
5949 inputPush(ctxt, stream);
5950 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5951}
5952
5953/**
5954 * htmlCtxtReadMemory:
5955 * @ctxt: an HTML parser context
5956 * @buffer: a pointer to a char array
5957 * @size: the size of the array
5958 * @URL: the base URL to use for the document
5959 * @encoding: the document encoding, or NULL
5960 * @options: a combination of htmlParserOption(s)
5961 *
5962 * parse an XML in-memory document and build a tree.
5963 * This reuses the existing @ctxt parser context
5964 *
5965 * Returns the resulting document tree
5966 */
5967htmlDocPtr
5968htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5969 const char *URL, const char *encoding, int options)
5970{
5971 xmlParserInputBufferPtr input;
5972 xmlParserInputPtr stream;
5973
5974 if (ctxt == NULL)
5975 return (NULL);
5976 if (buffer == NULL)
5977 return (NULL);
5978
5979 htmlCtxtReset(ctxt);
5980
5981 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5982 if (input == NULL) {
5983 return(NULL);
5984 }
5985
5986 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5987 if (stream == NULL) {
5988 xmlFreeParserInputBuffer(input);
5989 return(NULL);
5990 }
5991
5992 inputPush(ctxt, stream);
5993 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5994}
5995
5996/**
5997 * htmlCtxtReadFd:
5998 * @ctxt: an HTML parser context
5999 * @fd: an open file descriptor
6000 * @URL: the base URL to use for the document
6001 * @encoding: the document encoding, or NULL
6002 * @options: a combination of htmlParserOption(s)
6003 *
6004 * parse an XML from a file descriptor and build a tree.
6005 * This reuses the existing @ctxt parser context
6006 *
6007 * Returns the resulting document tree
6008 */
6009htmlDocPtr
6010htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6011 const char *URL, const char *encoding, int options)
6012{
6013 xmlParserInputBufferPtr input;
6014 xmlParserInputPtr stream;
6015
6016 if (fd < 0)
6017 return (NULL);
6018 if (ctxt == NULL)
6019 return (NULL);
6020
6021 htmlCtxtReset(ctxt);
6022
6023
6024 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6025 if (input == NULL)
6026 return (NULL);
6027 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6028 if (stream == NULL) {
6029 xmlFreeParserInputBuffer(input);
6030 return (NULL);
6031 }
6032 inputPush(ctxt, stream);
6033 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6034}
6035
6036/**
6037 * htmlCtxtReadIO:
6038 * @ctxt: an HTML parser context
6039 * @ioread: an I/O read function
6040 * @ioclose: an I/O close function
6041 * @ioctx: an I/O handler
6042 * @URL: the base URL to use for the document
6043 * @encoding: the document encoding, or NULL
6044 * @options: a combination of htmlParserOption(s)
6045 *
6046 * parse an HTML document from I/O functions and source and build a tree.
6047 * This reuses the existing @ctxt parser context
6048 *
6049 * Returns the resulting document tree
6050 */
6051htmlDocPtr
6052htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6053 xmlInputCloseCallback ioclose, void *ioctx,
6054 const char *URL,
6055 const char *encoding, int options)
6056{
6057 xmlParserInputBufferPtr input;
6058 xmlParserInputPtr stream;
6059
6060 if (ioread == NULL)
6061 return (NULL);
6062 if (ctxt == NULL)
6063 return (NULL);
6064
6065 htmlCtxtReset(ctxt);
6066
6067 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6068 XML_CHAR_ENCODING_NONE);
6069 if (input == NULL)
6070 return (NULL);
6071 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6072 if (stream == NULL) {
6073 xmlFreeParserInputBuffer(input);
6074 return (NULL);
6075 }
6076 inputPush(ctxt, stream);
6077 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6078}
6079
Owen Taylor3473f882001-02-23 17:55:21 +00006080#endif /* LIBXML_HTML_ENABLED */