blob: e7dcb344d430d26f01e7bc659c55ce0b6e201ebd [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Owen Taylor3473f882001-02-23 17:55:21 +000062 * Parser stacks related functions and macros *
63 * *
64 ************************************************************************/
65
Daniel Veillard1c732d22002-11-30 11:22:59 +000066/**
67 * htmlnamePush:
68 * @ctxt: an HTML parser context
69 * @value: the element name
70 *
71 * Pushes a new element name on top of the name stack
72 *
73 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000074 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000075static int
76htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
77{
78 if (ctxt->nameNr >= ctxt->nameMax) {
79 ctxt->nameMax *= 2;
80 ctxt->nameTab =
81 (xmlChar * *)xmlRealloc(ctxt->nameTab,
82 ctxt->nameMax *
83 sizeof(ctxt->nameTab[0]));
84 if (ctxt->nameTab == NULL) {
85 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
86 return (0);
87 }
88 }
89 ctxt->nameTab[ctxt->nameNr] = value;
90 ctxt->name = value;
91 return (ctxt->nameNr++);
92}
93/**
94 * htmlnamePop:
95 * @ctxt: an HTML parser context
96 *
97 * Pops the top element name from the name stack
98 *
99 * Returns the name just removed
100 */
101static xmlChar *
102htmlnamePop(htmlParserCtxtPtr ctxt)
103{
104 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000105
Daniel Veillard1c732d22002-11-30 11:22:59 +0000106 if (ctxt->nameNr <= 0)
107 return (0);
108 ctxt->nameNr--;
109 if (ctxt->nameNr < 0)
110 return (0);
111 if (ctxt->nameNr > 0)
112 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
113 else
114 ctxt->name = NULL;
115 ret = ctxt->nameTab[ctxt->nameNr];
116 ctxt->nameTab[ctxt->nameNr] = 0;
117 return (ret);
118}
Owen Taylor3473f882001-02-23 17:55:21 +0000119
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named "ctxt" (the parser context) to be
 * in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they look at raw bytes and should only be used to compare against
 * ASCII values:
 *
 *   CUR_PTR    the current input pointer (ctxt->input->cur).
 *   CUR        the byte at the current input position, as an int.  It reads
 *              the input directly and does not consult ctxt->token.
 *   CURRENT    same value as CUR.
 *   RAW        -1 when ctxt->token is set, the current byte otherwise.
 *   NXT(n)     the n'th next byte.  Same as CUR it should be used only
 *              to compare on ASCII based substrings.
 *   UPPER      the current byte converted to uppercase.
 *   UPP(n)     the n'th next byte converted to uppercase.
 *   SKIP(n)    advance the input pointer, the column and the character
 *              count by n.  The line number is not touched, so this must
 *              only be used to skip ASCII strings without newlines.
 *
 * Buffer management and helpers:
 *
 *   SHRINK       calls xmlParserInputShrink() on the current input.
 *   GROW         calls xmlParserInputGrow() with INPUT_CHUNK.
 *   SKIP_BLANKS  calls htmlSkipBlankChars().
 *
 * Character level macros:
 *
 *   NEXT             advance by one character through xmlNextChar().
 *   NEXTL(l)         step over the current character, which is l bytes
 *                    long: updates line/column, clears ctxt->token and
 *                    counts one character.
 *   CUR_CHAR(l)      htmlCurrentChar(); the byte length is stored in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on string s; length in l.
 *   COPY_BUF(l,b,i,v) append character v (l bytes long) to buffer b at
 *                    index i and advance i; a one-byte character is
 *                    stored directly, anything else via xmlCopyChar().
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Kept as a single parenthesised expression so it is safe anywhere. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* Wrapped in do/while so the macro behaves as one statement. */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
197
198/**
199 * htmlCurrentChar:
200 * @ctxt: the HTML parser context
201 * @len: pointer to the length of the char read
202 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * bytes in the input buffer. Implement the end of line normalization:
205 * 2.11 End-of-Line Handling
206 * If the encoding is unspecified, in the case we find an ISO-Latin-1
207 * char, then the encoding converter is plugged in automatically.
208 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000209 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000210 */
211
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000212static int
Owen Taylor3473f882001-02-23 17:55:21 +0000213htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
214 if (ctxt->instate == XML_PARSER_EOF)
215 return(0);
216
217 if (ctxt->token != 0) {
218 *len = 0;
219 return(ctxt->token);
220 }
221 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
222 /*
223 * We are supposed to handle UTF8, check it's valid
224 * From rfc2044: encoding of the Unicode values on UTF-8:
225 *
226 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
227 * 0000 0000-0000 007F 0xxxxxxx
228 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
229 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
230 *
231 * Check for the 0x110000 limit too
232 */
233 const unsigned char *cur = ctxt->input->cur;
234 unsigned char c;
235 unsigned int val;
236
237 c = *cur;
238 if (c & 0x80) {
239 if (cur[1] == 0)
240 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
241 if ((cur[1] & 0xc0) != 0x80)
242 goto encoding_error;
243 if ((c & 0xe0) == 0xe0) {
244
245 if (cur[2] == 0)
246 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
247 if ((cur[2] & 0xc0) != 0x80)
248 goto encoding_error;
249 if ((c & 0xf0) == 0xf0) {
250 if (cur[3] == 0)
251 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
252 if (((c & 0xf8) != 0xf0) ||
253 ((cur[3] & 0xc0) != 0x80))
254 goto encoding_error;
255 /* 4-byte code */
256 *len = 4;
257 val = (cur[0] & 0x7) << 18;
258 val |= (cur[1] & 0x3f) << 12;
259 val |= (cur[2] & 0x3f) << 6;
260 val |= cur[3] & 0x3f;
261 } else {
262 /* 3-byte code */
263 *len = 3;
264 val = (cur[0] & 0xf) << 12;
265 val |= (cur[1] & 0x3f) << 6;
266 val |= cur[2] & 0x3f;
267 }
268 } else {
269 /* 2-byte code */
270 *len = 2;
271 val = (cur[0] & 0x1f) << 6;
272 val |= cur[1] & 0x3f;
273 }
274 if (!IS_CHAR(val)) {
275 ctxt->errNo = XML_ERR_INVALID_ENCODING;
276 if ((ctxt->sax != NULL) &&
277 (ctxt->sax->error != NULL))
278 ctxt->sax->error(ctxt->userData,
279 "Char 0x%X out of allowed range\n", val);
280 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000281 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000282 }
283 return(val);
284 } else {
285 /* 1-byte code */
286 *len = 1;
287 return((int) *ctxt->input->cur);
288 }
289 }
290 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000291 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000292 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000293 * XML constructs only use < 128 chars
294 */
295 *len = 1;
296 if ((int) *ctxt->input->cur < 0x80)
297 return((int) *ctxt->input->cur);
298
299 /*
300 * Humm this is bad, do an automatic flow conversion
301 */
302 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
303 ctxt->charset = XML_CHAR_ENCODING_UTF8;
304 return(xmlCurrentChar(ctxt, len));
305
306encoding_error:
307 /*
308 * If we detect an UTF8 error that probably mean that the
309 * input encoding didn't get properly advertized in the
310 * declaration header. Report the error and switch the encoding
311 * to ISO-Latin-1 (if you don't like this policy, just declare the
312 * encoding !)
313 */
314 ctxt->errNo = XML_ERR_INVALID_ENCODING;
315 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
316 ctxt->sax->error(ctxt->userData,
317 "Input is not proper UTF-8, indicate encoding !\n");
318 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
319 ctxt->input->cur[0], ctxt->input->cur[1],
320 ctxt->input->cur[2], ctxt->input->cur[3]);
321 }
322
323 ctxt->charset = XML_CHAR_ENCODING_8859_1;
324 *len = 1;
325 return((int) *ctxt->input->cur);
326}
327
328/**
Owen Taylor3473f882001-02-23 17:55:21 +0000329 * htmlSkipBlankChars:
330 * @ctxt: the HTML parser context
331 *
332 * skip all blanks character found at that point in the input streams.
333 *
334 * Returns the number of space chars skipped
335 */
336
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000337static int
Owen Taylor3473f882001-02-23 17:55:21 +0000338htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
339 int res = 0;
340
341 while (IS_BLANK(*(ctxt->input->cur))) {
342 if ((*ctxt->input->cur == 0) &&
343 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
344 xmlPopInput(ctxt);
345 } else {
346 if (*(ctxt->input->cur) == '\n') {
347 ctxt->input->line++; ctxt->input->col = 1;
348 } else ctxt->input->col++;
349 ctxt->input->cur++;
350 ctxt->nbChars++;
351 if (*ctxt->input->cur == 0)
352 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
353 }
354 res++;
355 }
356 return(res);
357}
358
359
360
361/************************************************************************
362 * *
363 * The list of HTML elements and their properties *
364 * *
365 ************************************************************************/
366
367/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag: 1 means the end tag can be omitted
370 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000371 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000372 * Depr: this element is deprecated
373 * DTD: 1 means that this element is valid only in the Loose DTD
374 * 2 means that this element is valid only in the Frameset DTD
375 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000376 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000377 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000378 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000379
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each macro below expands to a comma separated run of element names
 * WITHOUT a trailing comma, so two of them must always be joined with
 * an explicit comma: adjacent string literals would otherwise be
 * silently concatenated into a single bogus name.  PCDATA and MODIFIER
 * expand to nothing and therefore take no comma.
 *
 * Every array built from them is terminated by a NULL entry.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
/* NULL terminated like every other list in this section */
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
/* the comma after ATTRS keeps "onkeyup" and "summary" two separate names */
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;

/* Cast used when storing the lists above in the element table. */
#define DECL (const char**)
512
Daniel Veillard22090732001-07-16 00:06:07 +0000513static const htmlElemDesc
514html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000515{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
516 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
517},
518{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
519 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
520},
521{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
522 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
523},
524{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
525 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
526},
527{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
528 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
529},
530{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
531 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
532},
533{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
534 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
535},
536{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
537 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
538},
539{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
540 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
541},
542{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
543 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
544},
545{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
546 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
547},
548{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
549 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
550},
551{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
552 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
553},
554{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
555 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
556},
557{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
558 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
559},
560{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
561 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
562},
563{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
564 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
565},
566{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
567 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
568},
569{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
570 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
571},
572{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
573 EMPTY , NULL , DECL col_attrs , NULL, NULL
574},
575{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
576 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
577},
578{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
579 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
580},
581{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
582 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
583},
584{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
585 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
586},
587{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
588 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
589},
590{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
591 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
592},
593{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
594 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
595},
596{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
597 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
598},
599{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
600 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
601},
602{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
603 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
604},
605{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
606 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
607},
608{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
609 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
610},
611{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
612 EMPTY, NULL, NULL, DECL frame_attrs, NULL
613},
614{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
615 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
616},
617{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
618 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
619},
620{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
621 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
622},
623{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
624 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
625},
626{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
627 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
628},
629{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
630 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
631},
632{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
633 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
634},
635{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
636 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
637},
638{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
639 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
640},
641{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
642 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
643},
644{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
645 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
646},
647{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
648 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
649},
650{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
651 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
652},
653{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
654 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
655},
656{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
657 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
658},
659{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
660 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
661},
662{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
663 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
664},
665{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
666 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
667},
668{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
669 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
670},
671{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
672 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
673},
674{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
675 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
676},
677{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
678 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
679},
680{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
681 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
682},
683{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
684 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
685},
686{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
687 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
688},
689{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
690 DECL html_flow, "div", DECL html_attrs, NULL, NULL
691},
692{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
693 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
694},
695{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
696 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
697},
698{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
699 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
700},
701{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
702 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
703},
704{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
705 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
706},
707{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
708 EMPTY, NULL, DECL param_attrs, NULL, name_attr
709},
710{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
711 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
712},
713{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
714 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
715},
716{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
717 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
718},
719{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
720 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
721},
722{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
723 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
724},
725{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
726 DECL select_content, NULL, DECL select_attrs, NULL, NULL
727},
728{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
729 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
730},
731{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733},
734{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
735 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
736},
737{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
738 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
739},
740{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
741 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
742},
743{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
744 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
745},
746{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
747 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
748},
749{ "table", 0, 0, 0, 0, 0, 0, 0, "",
750 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
751},
752{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
753 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
754},
755{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
756 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
757},
758{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
759 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
760},
761{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
762 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
763},
764{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
765 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
766},
767{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
768 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
769},
770{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
771 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
772},
773{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
774 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
775},
776{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
777 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
778},
779{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
780 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
781},
782{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
783 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
784},
785{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
786 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
787}
Owen Taylor3473f882001-02-23 17:55:21 +0000788};
789
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a flat sequence of NULL-terminated groups.  The first
 * string of a group is the tag being opened, the strings that follow are
 * the names of the open elements which that start tag closes implicitly.
 * A lone NULL after the last group terminates the whole table.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL    /* end of table */
};
849
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly; when character data is about to be
 * inserted in one of them a p element is implied instead.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
863
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic events).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
889
/*
 * End tag priorities, used by the HTML parser to cope with broken pages.
 * Giving elements different priorities lets the parser decide what to do
 * with a stray end tag: an end tag is only allowed to close elements
 * whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* the "endtag" priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000917
/* one pointer per group of htmlStartClose[], set up by htmlInitAutoClose() */
static const char **htmlStartCloseIndex[100];
/* non-zero once htmlStartCloseIndex[] has been filled */
static int htmlStartCloseIndexinitialized = 0;
920
921/************************************************************************
922 * *
923 * functions to handle HTML specific data *
924 * *
925 ************************************************************************/
926
927/**
928 * htmlInitAutoClose:
929 *
930 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
931 * This is not reentrant. Call xmlInitParser() once before processing in
932 * case of use in multithreaded programs.
933 */
934void
935htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000936 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000937
938 if (htmlStartCloseIndexinitialized) return;
939
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000940 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
941 indx = 0;
942 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
943 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000944 while (htmlStartClose[i] != NULL) i++;
945 i++;
946 }
947 htmlStartCloseIndexinitialized = 1;
948}
949
950/**
951 * htmlTagLookup:
952 * @tag: The tag name in lowercase
953 *
954 * Lookup the HTML tag in the ElementTable
955 *
956 * Returns the related htmlElemDescPtr or NULL if not found.
957 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000958const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000959htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000960 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000961
962 for (i = 0; i < (sizeof(html40ElementTable) /
963 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000964 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000965 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000966 }
967 return(NULL);
968}
969
970/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000971 * htmlGetEndPriority:
972 * @name: The name of the element to look up the priority for.
973 *
974 * Return value: The "endtag" priority.
975 **/
976static int
977htmlGetEndPriority (const xmlChar *name) {
978 int i = 0;
979
980 while ((htmlEndPriority[i].name != NULL) &&
981 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
982 i++;
983
984 return(htmlEndPriority[i].priority);
985}
986
987/**
Owen Taylor3473f882001-02-23 17:55:21 +0000988 * htmlCheckAutoClose:
989 * @newtag: The new tag name
990 * @oldtag: The old tag name
991 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000992 * Checks whether the new tag is one of the registered valid tags for
993 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000994 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
995 *
996 * Returns 0 if no, 1 if yes.
997 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000998static int
Owen Taylor3473f882001-02-23 17:55:21 +0000999htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001000 int i, indx;
1001 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001002
1003 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
1004
1005 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001006 for (indx = 0; indx < 100;indx++) {
1007 closed = htmlStartCloseIndex[indx];
1008 if (closed == NULL) return(0);
1009 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001010 }
1011
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001012 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001013 i++;
1014 while (htmlStartClose[i] != NULL) {
1015 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1016 return(1);
1017 }
1018 i++;
1019 }
1020 return(0);
1021}
1022
1023/**
1024 * htmlAutoCloseOnClose:
1025 * @ctxt: an HTML parser context
1026 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001027 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001028 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001029 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001030 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001031static void
Owen Taylor3473f882001-02-23 17:55:21 +00001032htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +00001033 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00001034 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001035 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001036
1037#ifdef DEBUG
1038 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
1039 for (i = 0;i < ctxt->nameNr;i++)
1040 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1041#endif
1042
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001043 priority = htmlGetEndPriority (newtag);
1044
Owen Taylor3473f882001-02-23 17:55:21 +00001045 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001046
Owen Taylor3473f882001-02-23 17:55:21 +00001047 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001048 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001049 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001050 * or equal priority, so if we find an element with higher
1051 * priority before we find an element with
1052 * matching name, we just ignore this endtag
1053 */
1054 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +00001055 }
1056 if (i < 0) return;
1057
1058 while (!xmlStrEqual(newtag, ctxt->name)) {
1059 info = htmlTagLookup(ctxt->name);
1060 if ((info == NULL) || (info->endTag == 1)) {
1061#ifdef DEBUG
1062 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
1063#endif
Daniel Veillard56098d42001-04-24 12:51:09 +00001064 } else if (info->endTag == 3) {
1065#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001066 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +00001067
Daniel Veillard56098d42001-04-24 12:51:09 +00001068#endif
1069 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1070 ctxt->sax->error(ctxt->userData,
1071 "Opening and ending tag mismatch: %s and %s\n",
1072 newtag, ctxt->name);
1073 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001074 }
1075 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1076 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1077 oldname = htmlnamePop(ctxt);
1078 if (oldname != NULL) {
1079#ifdef DEBUG
1080 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
1081#endif
1082 xmlFree(oldname);
1083 }
1084 }
1085}
1086
1087/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001088 * htmlAutoCloseOnEnd:
1089 * @ctxt: an HTML parser context
1090 *
1091 * Close all remaining tags at the end of the stream
1092 */
1093static void
1094htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
1095 xmlChar *oldname;
1096 int i;
1097
1098 if (ctxt->nameNr == 0)
1099 return;
1100#ifdef DEBUG
1101 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
1102#endif
1103
1104 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1105#ifdef DEBUG
1106 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1107#endif
1108 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1109 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1110 oldname = htmlnamePop(ctxt);
1111 if (oldname != NULL) {
1112#ifdef DEBUG
1113 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
1114#endif
1115 xmlFree(oldname);
1116 }
1117 }
1118}
1119
1120/**
Owen Taylor3473f882001-02-23 17:55:21 +00001121 * htmlAutoClose:
1122 * @ctxt: an HTML parser context
1123 * @newtag: The new tag name or NULL
1124 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001125 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001126 * The list is kept in htmlStartClose array. This function is
1127 * called when a new tag has been detected and generates the
1128 * appropriates closes if possible/needed.
1129 * If newtag is NULL this mean we are at the end of the resource
1130 * and we should check
1131 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001132static void
Owen Taylor3473f882001-02-23 17:55:21 +00001133htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1134 xmlChar *oldname;
1135 while ((newtag != NULL) && (ctxt->name != NULL) &&
1136 (htmlCheckAutoClose(newtag, ctxt->name))) {
1137#ifdef DEBUG
1138 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1139#endif
1140 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1141 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1142 oldname = htmlnamePop(ctxt);
1143 if (oldname != NULL) {
1144#ifdef DEBUG
1145 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1146#endif
1147 xmlFree(oldname);
1148 }
1149 }
1150 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001151 htmlAutoCloseOnEnd(ctxt);
1152 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154 while ((newtag == NULL) && (ctxt->name != NULL) &&
1155 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
1156 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
1157 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
1158#ifdef DEBUG
1159 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
1160#endif
1161 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1162 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1163 oldname = htmlnamePop(ctxt);
1164 if (oldname != NULL) {
1165#ifdef DEBUG
1166 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1167#endif
1168 xmlFree(oldname);
1169 }
1170 }
1171
1172}
1173
1174/**
1175 * htmlAutoCloseTag:
1176 * @doc: the HTML document
1177 * @name: The tag name
1178 * @elem: the HTML element
1179 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001180 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001181 * The list is kept in htmlStartClose array. This function checks
1182 * if the element or one of it's children would autoclose the
1183 * given tag.
1184 *
1185 * Returns 1 if autoclose, 0 otherwise
1186 */
1187int
1188htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1189 htmlNodePtr child;
1190
1191 if (elem == NULL) return(1);
1192 if (xmlStrEqual(name, elem->name)) return(0);
1193 if (htmlCheckAutoClose(elem->name, name)) return(1);
1194 child = elem->children;
1195 while (child != NULL) {
1196 if (htmlAutoCloseTag(doc, name, child)) return(1);
1197 child = child->next;
1198 }
1199 return(0);
1200}
1201
1202/**
1203 * htmlIsAutoClosed:
1204 * @doc: the HTML document
1205 * @elem: the HTML element
1206 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001207 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001208 * The list is kept in htmlStartClose array. This function checks
1209 * if a tag is autoclosed by one of it's child
1210 *
1211 * Returns 1 if autoclosed, 0 otherwise
1212 */
1213int
1214htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1215 htmlNodePtr child;
1216
1217 if (elem == NULL) return(1);
1218 child = elem->children;
1219 while (child != NULL) {
1220 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1221 child = child->next;
1222 }
1223 return(0);
1224}
1225
1226/**
1227 * htmlCheckImplied:
1228 * @ctxt: an HTML parser context
1229 * @newtag: The new tag name
1230 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001231 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001232 * called when a new tag has been detected and generates the
1233 * appropriates implicit tags if missing
1234 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001235static void
Owen Taylor3473f882001-02-23 17:55:21 +00001236htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1237 if (!htmlOmittedDefaultValue)
1238 return;
1239 if (xmlStrEqual(newtag, BAD_CAST"html"))
1240 return;
1241 if (ctxt->nameNr <= 0) {
1242#ifdef DEBUG
1243 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
1244#endif
1245 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
1246 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1247 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1248 }
1249 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1250 return;
1251 if ((ctxt->nameNr <= 1) &&
1252 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1253 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1254 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1255 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1256 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1257 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1258 /*
1259 * dropped OBJECT ... i you put it first BODY will be
1260 * assumed !
1261 */
1262#ifdef DEBUG
1263 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
1264#endif
1265 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
1266 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1267 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1268 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1269 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1270 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1271 int i;
1272 for (i = 0;i < ctxt->nameNr;i++) {
1273 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1274 return;
1275 }
1276 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1277 return;
1278 }
1279 }
1280
1281#ifdef DEBUG
1282 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
1283#endif
1284 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
1285 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1286 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1287 }
1288}
1289
1290/**
1291 * htmlCheckParagraph
1292 * @ctxt: an HTML parser context
1293 *
1294 * Check whether a p element need to be implied before inserting
1295 * characters in the current element.
1296 *
1297 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1298 * in case of error.
1299 */
1300
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001301static int
Owen Taylor3473f882001-02-23 17:55:21 +00001302htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1303 const xmlChar *tag;
1304 int i;
1305
1306 if (ctxt == NULL)
1307 return(-1);
1308 tag = ctxt->name;
1309 if (tag == NULL) {
1310 htmlAutoClose(ctxt, BAD_CAST"p");
1311 htmlCheckImplied(ctxt, BAD_CAST"p");
1312 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1313 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1314 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1315 return(1);
1316 }
1317 if (!htmlOmittedDefaultValue)
1318 return(0);
1319 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1320 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1321#ifdef DEBUG
1322 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1323#endif
1324 htmlAutoClose(ctxt, BAD_CAST"p");
1325 htmlCheckImplied(ctxt, BAD_CAST"p");
1326 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1327 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1328 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1329 return(1);
1330 }
1331 }
1332 return(0);
1333}
1334
1335/**
1336 * htmlIsScriptAttribute:
1337 * @name: an attribute name
1338 *
1339 * Check if an attribute is of content type Script
1340 *
1341 * Returns 1 is the attribute is a script 0 otherwise
1342 */
1343int
1344htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001345 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001346
1347 if (name == NULL)
1348 return(0);
1349 /*
1350 * all script attributes start with 'on'
1351 */
1352 if ((name[0] != 'o') || (name[1] != 'n'))
1353 return(0);
1354 for (i = 0;
1355 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1356 i++) {
1357 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1358 return(1);
1359 }
1360 return(0);
1361}
1362
1363/************************************************************************
1364 * *
1365 * The list of HTML predefined entities *
1366 * *
1367 ************************************************************************/
1368
1369
Daniel Veillard22090732001-07-16 00:06:07 +00001370static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001371/*
1372 * the 4 absolute ones, plus apostrophe.
1373 */
1374{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1375{ 38, "amp", "ampersand, U+0026 ISOnum" },
1376{ 39, "apos", "single quote" },
1377{ 60, "lt", "less-than sign, U+003C ISOnum" },
1378{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1379
1380/*
1381 * A bunch still in the 128-255 range
1382 * Replacing them depend really on the charset used.
1383 */
1384{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1385{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1386{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1387{ 163, "pound","pound sign, U+00A3 ISOnum" },
1388{ 164, "curren","currency sign, U+00A4 ISOnum" },
1389{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1390{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1391{ 167, "sect", "section sign, U+00A7 ISOnum" },
1392{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1393{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1394{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1395{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1396{ 172, "not", "not sign, U+00AC ISOnum" },
1397{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1398{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1399{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1400{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1401{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1402{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1403{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1404{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1405{ 181, "micro","micro sign, U+00B5 ISOnum" },
1406{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1407{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1408{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1409{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1410{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1411{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1412{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1413{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1414{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1415{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1416{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1417{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1418{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1419{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1420{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1421{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1422{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1423{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1424{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1425{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1426{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1427{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1428{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1429{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1430{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1431{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1432{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1433{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1434{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1435{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1436{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1437{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1438{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1439{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1440{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1441{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1442{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1443{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1444{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1445{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1446{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1447{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1448{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1449{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1450{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1451{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1452{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1453{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1454{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1455{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1456{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1457{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1458{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1459{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1460{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1461{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1462{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1463{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1464{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1465{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1466{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1467{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1468{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1469{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1470{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1471{ 247, "divide","division sign, U+00F7 ISOnum" },
1472{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1473{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1474{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1475{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1476{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1477{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1478{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1479{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1480
1481{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1482{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1483{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1484{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1485{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1486
1487/*
1488 * Anything below should really be kept as entities references
1489 */
1490{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1491
1492{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1493{ 732, "tilde","small tilde, U+02DC ISOdia" },
1494
1495{ 913, "Alpha","greek capital letter alpha, U+0391" },
1496{ 914, "Beta", "greek capital letter beta, U+0392" },
1497{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1498{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1499{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1500{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1501{ 919, "Eta", "greek capital letter eta, U+0397" },
1502{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1503{ 921, "Iota", "greek capital letter iota, U+0399" },
1504{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001505{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001506{ 924, "Mu", "greek capital letter mu, U+039C" },
1507{ 925, "Nu", "greek capital letter nu, U+039D" },
1508{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1509{ 927, "Omicron","greek capital letter omicron, U+039F" },
1510{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1511{ 929, "Rho", "greek capital letter rho, U+03A1" },
1512{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1513{ 932, "Tau", "greek capital letter tau, U+03A4" },
1514{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1515{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1516{ 935, "Chi", "greek capital letter chi, U+03A7" },
1517{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1518{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1519
1520{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1521{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1522{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1523{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1524{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1525{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1526{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1527{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1528{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1529{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1530{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1531{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1532{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1533{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1534{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1535{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1536{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1537{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1538{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1539{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1540{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1541{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1542{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1543{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1544{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1545{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1546{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1547{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1548
1549{ 8194, "ensp", "en space, U+2002 ISOpub" },
1550{ 8195, "emsp", "em space, U+2003 ISOpub" },
1551{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1552{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1553{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1554{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1555{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1556{ 8211, "ndash","en dash, U+2013 ISOpub" },
1557{ 8212, "mdash","em dash, U+2014 ISOpub" },
1558{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1559{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1560{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1561{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1562{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1563{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1564{ 8224, "dagger","dagger, U+2020 ISOpub" },
1565{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1566
1567{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1568{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1569
1570{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1571
1572{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1573{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1574
1575{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1576{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1577
1578{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1579{ 8260, "frasl","fraction slash, U+2044 NEW" },
1580
1581{ 8364, "euro", "euro sign, U+20AC NEW" },
1582
1583{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1584{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1585{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1586{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1587{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1588{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1589{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1590{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1591{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1592{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1593{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1594{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1595{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1596{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1597{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1598{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1599
1600{ 8704, "forall","for all, U+2200 ISOtech" },
1601{ 8706, "part", "partial differential, U+2202 ISOtech" },
1602{ 8707, "exist","there exists, U+2203 ISOtech" },
1603{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1604{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1605{ 8712, "isin", "element of, U+2208 ISOtech" },
1606{ 8713, "notin","not an element of, U+2209 ISOtech" },
1607{ 8715, "ni", "contains as member, U+220B ISOtech" },
1608{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001609{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001610{ 8722, "minus","minus sign, U+2212 ISOtech" },
1611{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1612{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1613{ 8733, "prop", "proportional to, U+221D ISOtech" },
1614{ 8734, "infin","infinity, U+221E ISOtech" },
1615{ 8736, "ang", "angle, U+2220 ISOamso" },
1616{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1617{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1618{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1619{ 8746, "cup", "union = cup, U+222A ISOtech" },
1620{ 8747, "int", "integral, U+222B ISOtech" },
1621{ 8756, "there4","therefore, U+2234 ISOtech" },
1622{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1623{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1624{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1625{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1626{ 8801, "equiv","identical to, U+2261 ISOtech" },
1627{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1628{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1629{ 8834, "sub", "subset of, U+2282 ISOtech" },
1630{ 8835, "sup", "superset of, U+2283 ISOtech" },
1631{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1632{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1633{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1634{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1635{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1636{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1637{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1638{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1639{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1640{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1641{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1642{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1643{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1644{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1645
1646{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1647{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1648{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1649{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1650
1651};
1652
1653/************************************************************************
1654 * *
1655 * Commodity functions to handle entities *
1656 * *
1657 ************************************************************************/
1658
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc is first stored in a temporary: assigning it
 * straight to @buffer would lose the only pointer to the old block when
 * the reallocation fails, and the block would leak.
 *
 * On failure the old block is freed, an error is reported and the
 * ENCLOSING FUNCTION returns NULL, so the macro may only be used inside
 * functions returning a pointer which have a `buffer##_size' variable in
 * scope.  Any pointer into the old block (e.g. a write cursor) has to be
 * recomputed by the caller after the macro has run.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                                 buffer##_size * sizeof(xmlChar));      \
    if (growBuffer_tmp == NULL) {                                       \
        xmlGenericError(xmlGenericErrorContext, "realloc failed\n");    \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1670
1671/**
1672 * htmlEntityLookup:
1673 * @name: the entity name
1674 *
1675 * Lookup the given entity in EntitiesTable
1676 *
1677 * TODO: the linear scan is really ugly, an hash table is really needed.
1678 *
1679 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1680 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001681const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001682htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001683 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001684
1685 for (i = 0;i < (sizeof(html40EntitiesTable)/
1686 sizeof(html40EntitiesTable[0]));i++) {
1687 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1688#ifdef DEBUG
1689 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1690#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001691 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001692 }
1693 }
1694 return(NULL);
1695}
1696
1697/**
1698 * htmlEntityValueLookup:
1699 * @value: the entity's unicode value
1700 *
1701 * Lookup the given entity in EntitiesTable
1702 *
1703 * TODO: the linear scan is really ugly, an hash table is really needed.
1704 *
1705 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1706 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001707const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001708htmlEntityValueLookup(unsigned int value) {
1709 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001710#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001711 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001712#endif
1713
1714 for (i = 0;i < (sizeof(html40EntitiesTable)/
1715 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001716 if (html40EntitiesTable[i].value >= value) {
1717 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001718 break;
1719#ifdef DEBUG
1720 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1721#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001722 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001723 }
1724#ifdef DEBUG
1725 if (lv > html40EntitiesTable[i].value) {
1726 xmlGenericError(xmlGenericErrorContext,
1727 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1728 lv, html40EntitiesTable[i].value);
1729 }
1730 lv = html40EntitiesTable[i].value;
1731#endif
1732 }
1733 return(NULL);
1734}
1735
1736/**
1737 * UTF8ToHtml:
1738 * @out: a pointer to an array of bytes to store the result
1739 * @outlen: the length of @out
1740 * @in: a pointer to an array of UTF-8 chars
1741 * @inlen: the length of @in
1742 *
1743 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1744 * plus HTML entities block of chars out.
1745 *
1746 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1747 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001748 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001749 * The value of @outlen after return is the number of octets consumed.
1750 */
1751int
1752UTF8ToHtml(unsigned char* out, int *outlen,
1753 const unsigned char* in, int *inlen) {
1754 const unsigned char* processed = in;
1755 const unsigned char* outend;
1756 const unsigned char* outstart = out;
1757 const unsigned char* instart = in;
1758 const unsigned char* inend;
1759 unsigned int c, d;
1760 int trailing;
1761
1762 if (in == NULL) {
1763 /*
1764 * initialization nothing to do
1765 */
1766 *outlen = 0;
1767 *inlen = 0;
1768 return(0);
1769 }
1770 inend = in + (*inlen);
1771 outend = out + (*outlen);
1772 while (in < inend) {
1773 d = *in++;
1774 if (d < 0x80) { c= d; trailing= 0; }
1775 else if (d < 0xC0) {
1776 /* trailing byte in leading position */
1777 *outlen = out - outstart;
1778 *inlen = processed - instart;
1779 return(-2);
1780 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1781 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1782 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1783 else {
1784 /* no chance for this in Ascii */
1785 *outlen = out - outstart;
1786 *inlen = processed - instart;
1787 return(-2);
1788 }
1789
1790 if (inend - in < trailing) {
1791 break;
1792 }
1793
1794 for ( ; trailing; trailing--) {
1795 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1796 break;
1797 c <<= 6;
1798 c |= d & 0x3F;
1799 }
1800
1801 /* assertion: c is a single UTF-4 value */
1802 if (c < 0x80) {
1803 if (out + 1 >= outend)
1804 break;
1805 *out++ = c;
1806 } else {
1807 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001808 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001809
1810 /*
1811 * Try to lookup a predefined HTML entity for it
1812 */
1813
1814 ent = htmlEntityValueLookup(c);
1815 if (ent == NULL) {
1816 /* no chance for this in Ascii */
1817 *outlen = out - outstart;
1818 *inlen = processed - instart;
1819 return(-2);
1820 }
1821 len = strlen(ent->name);
1822 if (out + 2 + len >= outend)
1823 break;
1824 *out++ = '&';
1825 memcpy(out, ent->name, len);
1826 out += len;
1827 *out++ = ';';
1828 }
1829 processed = in;
1830 }
1831 *outlen = out - outstart;
1832 *inlen = processed - instart;
1833 return(0);
1834}
1835
1836/**
1837 * htmlEncodeEntities:
1838 * @out: a pointer to an array of bytes to store the result
1839 * @outlen: the length of @out
1840 * @in: a pointer to an array of UTF-8 chars
1841 * @inlen: the length of @in
1842 * @quoteChar: the quote character to escape (' or ") or zero.
1843 *
1844 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1845 * plus HTML entities block of chars out.
1846 *
1847 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1848 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001849 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001850 * The value of @outlen after return is the number of octets consumed.
1851 */
1852int
1853htmlEncodeEntities(unsigned char* out, int *outlen,
1854 const unsigned char* in, int *inlen, int quoteChar) {
1855 const unsigned char* processed = in;
1856 const unsigned char* outend = out + (*outlen);
1857 const unsigned char* outstart = out;
1858 const unsigned char* instart = in;
1859 const unsigned char* inend = in + (*inlen);
1860 unsigned int c, d;
1861 int trailing;
1862
1863 while (in < inend) {
1864 d = *in++;
1865 if (d < 0x80) { c= d; trailing= 0; }
1866 else if (d < 0xC0) {
1867 /* trailing byte in leading position */
1868 *outlen = out - outstart;
1869 *inlen = processed - instart;
1870 return(-2);
1871 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1872 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1873 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1874 else {
1875 /* no chance for this in Ascii */
1876 *outlen = out - outstart;
1877 *inlen = processed - instart;
1878 return(-2);
1879 }
1880
1881 if (inend - in < trailing)
1882 break;
1883
1884 while (trailing--) {
1885 if (((d= *in++) & 0xC0) != 0x80) {
1886 *outlen = out - outstart;
1887 *inlen = processed - instart;
1888 return(-2);
1889 }
1890 c <<= 6;
1891 c |= d & 0x3F;
1892 }
1893
1894 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001895 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1896 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001897 if (out >= outend)
1898 break;
1899 *out++ = c;
1900 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001901 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001902 const char *cp;
1903 char nbuf[16];
1904 int len;
1905
1906 /*
1907 * Try to lookup a predefined HTML entity for it
1908 */
1909 ent = htmlEntityValueLookup(c);
1910 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001911 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001912 cp = nbuf;
1913 }
1914 else
1915 cp = ent->name;
1916 len = strlen(cp);
1917 if (out + 2 + len > outend)
1918 break;
1919 *out++ = '&';
1920 memcpy(out, cp, len);
1921 out += len;
1922 *out++ = ';';
1923 }
1924 processed = in;
1925 }
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(0);
1929}
1930
1931/**
1932 * htmlDecodeEntities:
1933 * @ctxt: the parser context
1934 * @len: the len to decode (in bytes !), -1 for no size limit
1935 * @end: an end marker xmlChar, 0 if none
1936 * @end2: an end marker xmlChar, 0 if none
1937 * @end3: an end marker xmlChar, 0 if none
1938 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001939 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001940 *
1941 * DEPRECATED !!!!
1942 *
1943 * Returns A newly allocated string with the substitution done. The caller
1944 * must deallocate it !
1945 */
1946xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001947htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1948 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949 static int deprecated = 0;
1950 if (!deprecated) {
1951 xmlGenericError(xmlGenericErrorContext,
1952 "htmlDecodeEntities() deprecated function reached\n");
1953 deprecated = 1;
1954 }
1955 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001956}
1957
1958/************************************************************************
1959 * *
1960 * Commodity functions to handle streams *
1961 * *
1962 ************************************************************************/
1963
1964/**
Owen Taylor3473f882001-02-23 17:55:21 +00001965 * htmlNewInputStream:
1966 * @ctxt: an HTML parser context
1967 *
1968 * Create a new input stream structure
1969 * Returns the new input stream or NULL
1970 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001971static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001972htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1973 htmlParserInputPtr input;
1974
1975 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1976 if (input == NULL) {
1977 ctxt->errNo = XML_ERR_NO_MEMORY;
1978 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1979 ctxt->sax->error(ctxt->userData,
1980 "malloc: couldn't allocate a new input stream\n");
1981 return(NULL);
1982 }
1983 memset(input, 0, sizeof(htmlParserInput));
1984 input->filename = NULL;
1985 input->directory = NULL;
1986 input->base = NULL;
1987 input->cur = NULL;
1988 input->buf = NULL;
1989 input->line = 1;
1990 input->col = 1;
1991 input->buf = NULL;
1992 input->free = NULL;
1993 input->version = NULL;
1994 input->consumed = 0;
1995 input->length = 0;
1996 return(input);
1997}
1998
1999
2000/************************************************************************
2001 * *
2002 * Commodity functions, cleanup needed ? *
2003 * *
2004 ************************************************************************/
/*
 * All the tags allowing PCDATA in the HTML 4.01 loose DTD, one line
 * per initial letter.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002019
2020/**
2021 * areBlanks:
2022 * @ctxt: an HTML parser context
2023 * @str: a xmlChar *
2024 * @len: the size of @str
2025 *
2026 * Is this a sequence of blank chars that one can ignore ?
2027 *
2028 * Returns 1 if ignorable 0 otherwise.
2029 */
2030
2031static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002032 unsigned int i;
2033 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002034 xmlNodePtr lastChild;
2035
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002036 for (j = 0;j < len;j++)
2037 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002038
2039 if (CUR == 0) return(1);
2040 if (CUR != '<') return(0);
2041 if (ctxt->name == NULL)
2042 return(1);
2043 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2044 return(1);
2045 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2046 return(1);
2047 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2048 return(1);
2049 if (ctxt->node == NULL) return(0);
2050 lastChild = xmlGetLastChild(ctxt->node);
2051 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002052 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2053 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002054 /* keep ws in constructs like ...<b> </b>...
2055 for all tags "b" allowing PCDATA */
2056 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2057 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2058 return(0);
2059 }
2060 }
Owen Taylor3473f882001-02-23 17:55:21 +00002061 } else if (xmlNodeIsText(lastChild)) {
2062 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002063 } else {
2064 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2065 for all tags "p" allowing PCDATA */
2066 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2067 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2068 return(0);
2069 }
2070 }
Owen Taylor3473f882001-02-23 17:55:21 +00002071 }
2072 return(1);
2073}
2074
2075/**
Owen Taylor3473f882001-02-23 17:55:21 +00002076 * htmlNewDocNoDtD:
2077 * @URI: URI for the dtd, or NULL
2078 * @ExternalID: the external ID of the DTD, or NULL
2079 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002080 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2081 * are NULL
2082 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002083 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002084 */
2085htmlDocPtr
2086htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2087 xmlDocPtr cur;
2088
2089 /*
2090 * Allocate a new document and fill the fields.
2091 */
2092 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2093 if (cur == NULL) {
2094 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002095 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002096 return(NULL);
2097 }
2098 memset(cur, 0, sizeof(xmlDoc));
2099
2100 cur->type = XML_HTML_DOCUMENT_NODE;
2101 cur->version = NULL;
2102 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002103 cur->doc = cur;
2104 cur->name = NULL;
2105 cur->children = NULL;
2106 cur->extSubset = NULL;
2107 cur->oldNs = NULL;
2108 cur->encoding = NULL;
2109 cur->standalone = 1;
2110 cur->compression = 0;
2111 cur->ids = NULL;
2112 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002113 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002114 if ((ExternalID != NULL) ||
2115 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00002116 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002117 return(cur);
2118}
2119
2120/**
2121 * htmlNewDoc:
2122 * @URI: URI for the dtd, or NULL
2123 * @ExternalID: the external ID of the DTD, or NULL
2124 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002125 * Creates a new HTML document
2126 *
Owen Taylor3473f882001-02-23 17:55:21 +00002127 * Returns a new document
2128 */
2129htmlDocPtr
2130htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2131 if ((URI == NULL) && (ExternalID == NULL))
2132 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002133 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2134 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002135
2136 return(htmlNewDocNoDtD(URI, ExternalID));
2137}
2138
2139
2140/************************************************************************
2141 * *
2142 * The parser itself *
2143 * Relates to http://www.w3.org/TR/html40 *
2144 * *
2145 ************************************************************************/
2146
2147/************************************************************************
2148 * *
2149 * The parser itself *
2150 * *
2151 ************************************************************************/
2152
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002153static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2154
Owen Taylor3473f882001-02-23 17:55:21 +00002155/**
2156 * htmlParseHTMLName:
2157 * @ctxt: an HTML parser context
2158 *
2159 * parse an HTML tag or attribute name, note that we convert it to lowercase
2160 * since HTML names are not case-sensitive.
2161 *
2162 * Returns the Tag Name parsed or NULL
2163 */
2164
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002165static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002166htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2167 xmlChar *ret = NULL;
2168 int i = 0;
2169 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2170
2171 if (!IS_LETTER(CUR) && (CUR != '_') &&
2172 (CUR != ':')) return(NULL);
2173
2174 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2175 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2176 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2177 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2178 else loc[i] = CUR;
2179 i++;
2180
2181 NEXT;
2182 }
2183
2184 ret = xmlStrndup(loc, i);
2185
2186 return(ret);
2187}
2188
2189/**
2190 * htmlParseName:
2191 * @ctxt: an HTML parser context
2192 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002193 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002194 *
2195 * Returns the Name parsed or NULL
2196 */
2197
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002198static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002199htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002200 const xmlChar *in;
2201 xmlChar *ret;
2202 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002203
2204 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002205
2206 /*
2207 * Accelerator for simple ASCII names
2208 */
2209 in = ctxt->input->cur;
2210 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2211 ((*in >= 0x41) && (*in <= 0x5A)) ||
2212 (*in == '_') || (*in == ':')) {
2213 in++;
2214 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2215 ((*in >= 0x41) && (*in <= 0x5A)) ||
2216 ((*in >= 0x30) && (*in <= 0x39)) ||
2217 (*in == '_') || (*in == '-') ||
2218 (*in == ':') || (*in == '.'))
2219 in++;
2220 if ((*in > 0) && (*in < 0x80)) {
2221 count = in - ctxt->input->cur;
2222 ret = xmlStrndup(ctxt->input->cur, count);
2223 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002224 ctxt->nbChars += count;
2225 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002226 return(ret);
2227 }
2228 }
2229 return(htmlParseNameComplex(ctxt));
2230}
2231
2232static xmlChar *
2233htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2234 xmlChar buf[XML_MAX_NAMELEN + 5];
2235 int len = 0, l;
2236 int c;
2237 int count = 0;
2238
2239 /*
2240 * Handler for more complex cases
2241 */
2242 GROW;
2243 c = CUR_CHAR(l);
2244 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2245 (!IS_LETTER(c) && (c != '_') &&
2246 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002247 return(NULL);
2248 }
2249
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002250 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2251 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2252 (c == '.') || (c == '-') ||
2253 (c == '_') || (c == ':') ||
2254 (IS_COMBINING(c)) ||
2255 (IS_EXTENDER(c)))) {
2256 if (count++ > 100) {
2257 count = 0;
2258 GROW;
2259 }
2260 COPY_BUF(l,buf,len,c);
2261 NEXTL(l);
2262 c = CUR_CHAR(l);
2263 if (len >= XML_MAX_NAMELEN) {
2264 /*
2265 * Okay someone managed to make a huge name, so he's ready to pay
2266 * for the processing speed.
2267 */
2268 xmlChar *buffer;
2269 int max = len * 2;
2270
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002271 buffer = (xmlChar *) xmlMallocAtomic(max * sizeof(xmlChar));
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002272 if (buffer == NULL) {
2273 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2274 ctxt->sax->error(ctxt->userData,
2275 "htmlParseNameComplex: out of memory\n");
2276 return(NULL);
2277 }
2278 memcpy(buffer, buf, len);
2279 while ((IS_LETTER(c)) || (IS_DIGIT(c)) || /* test bigname.xml */
2280 (c == '.') || (c == '-') ||
2281 (c == '_') || (c == ':') ||
2282 (IS_COMBINING(c)) ||
2283 (IS_EXTENDER(c))) {
2284 if (count++ > 100) {
2285 count = 0;
2286 GROW;
2287 }
2288 if (len + 10 > max) {
2289 max *= 2;
2290 buffer = (xmlChar *) xmlRealloc(buffer,
2291 max * sizeof(xmlChar));
2292 if (buffer == NULL) {
2293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2294 ctxt->sax->error(ctxt->userData,
2295 "htmlParseNameComplex: out of memory\n");
2296 return(NULL);
2297 }
2298 }
2299 COPY_BUF(l,buffer,len,c);
2300 NEXTL(l);
2301 c = CUR_CHAR(l);
2302 }
2303 buffer[len] = 0;
2304 return(buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00002305 }
2306 }
2307 return(xmlStrndup(buf, len));
2308}
2309
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002310
Owen Taylor3473f882001-02-23 17:55:21 +00002311/**
2312 * htmlParseHTMLAttribute:
2313 * @ctxt: an HTML parser context
2314 * @stop: a char stop value
2315 *
2316 * parse an HTML attribute value till the stop (quote), if
2317 * stop is 0 then it stops at the first space
2318 *
2319 * Returns the attribute parsed or NULL
2320 */
2321
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002322static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002323htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2324 xmlChar *buffer = NULL;
2325 int buffer_size = 0;
2326 xmlChar *out = NULL;
2327 xmlChar *name = NULL;
2328
2329 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002330 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002331
2332 /*
2333 * allocate a translation buffer.
2334 */
2335 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002336 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002337 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002338 xmlGenericError(xmlGenericErrorContext,
2339 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002340 return(NULL);
2341 }
2342 out = buffer;
2343
2344 /*
2345 * Ok loop until we reach one of the ending chars
2346 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002347 while ((CUR != 0) && (CUR != stop)) {
2348 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002349 if ((stop == 0) && (IS_BLANK(CUR))) break;
2350 if (CUR == '&') {
2351 if (NXT(1) == '#') {
2352 unsigned int c;
2353 int bits;
2354
2355 c = htmlParseCharRef(ctxt);
2356 if (c < 0x80)
2357 { *out++ = c; bits= -6; }
2358 else if (c < 0x800)
2359 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2360 else if (c < 0x10000)
2361 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2362 else
2363 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2364
2365 for ( ; bits >= 0; bits-= 6) {
2366 *out++ = ((c >> bits) & 0x3F) | 0x80;
2367 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002368
2369 if (out - buffer > buffer_size - 100) {
2370 int indx = out - buffer;
2371
2372 growBuffer(buffer);
2373 out = &buffer[indx];
2374 }
Owen Taylor3473f882001-02-23 17:55:21 +00002375 } else {
2376 ent = htmlParseEntityRef(ctxt, &name);
2377 if (name == NULL) {
2378 *out++ = '&';
2379 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002380 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002381
2382 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002383 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002384 }
2385 } else if (ent == NULL) {
2386 *out++ = '&';
2387 cur = name;
2388 while (*cur != 0) {
2389 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002390 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002391
2392 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002393 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002394 }
2395 *out++ = *cur++;
2396 }
2397 xmlFree(name);
2398 } else {
2399 unsigned int c;
2400 int bits;
2401
2402 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002403 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002404
2405 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002406 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002407 }
2408 c = (xmlChar)ent->value;
2409 if (c < 0x80)
2410 { *out++ = c; bits= -6; }
2411 else if (c < 0x800)
2412 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2413 else if (c < 0x10000)
2414 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2415 else
2416 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2417
2418 for ( ; bits >= 0; bits-= 6) {
2419 *out++ = ((c >> bits) & 0x3F) | 0x80;
2420 }
2421 xmlFree(name);
2422 }
2423 }
2424 } else {
2425 unsigned int c;
2426 int bits, l;
2427
2428 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002429 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002430
2431 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002432 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002433 }
2434 c = CUR_CHAR(l);
2435 if (c < 0x80)
2436 { *out++ = c; bits= -6; }
2437 else if (c < 0x800)
2438 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2439 else if (c < 0x10000)
2440 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2441 else
2442 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2443
2444 for ( ; bits >= 0; bits-= 6) {
2445 *out++ = ((c >> bits) & 0x3F) | 0x80;
2446 }
2447 NEXT;
2448 }
2449 }
2450 *out++ = 0;
2451 return(buffer);
2452}
2453
2454/**
Owen Taylor3473f882001-02-23 17:55:21 +00002455 * htmlParseEntityRef:
2456 * @ctxt: an HTML parser context
2457 * @str: location to store the entity name
2458 *
2459 * parse an HTML ENTITY references
2460 *
2461 * [68] EntityRef ::= '&' Name ';'
2462 *
2463 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2464 * if non-NULL *str will have to be freed by the caller.
2465 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002466const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002467htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2468 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002469 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002470 *str = NULL;
2471
2472 if (CUR == '&') {
2473 NEXT;
2474 name = htmlParseName(ctxt);
2475 if (name == NULL) {
2476 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2477 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2478 ctxt->wellFormed = 0;
2479 } else {
2480 GROW;
2481 if (CUR == ';') {
2482 *str = name;
2483
2484 /*
2485 * Lookup the entity in the table.
2486 */
2487 ent = htmlEntityLookup(name);
2488 if (ent != NULL) /* OK that's ugly !!! */
2489 NEXT;
2490 } else {
2491 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2492 ctxt->sax->error(ctxt->userData,
2493 "htmlParseEntityRef: expecting ';'\n");
2494 *str = name;
2495 }
2496 }
2497 }
2498 return(ent);
2499}
2500
2501/**
2502 * htmlParseAttValue:
2503 * @ctxt: an HTML parser context
2504 *
2505 * parse a value for an attribute
2506 * Note: the parser won't do substitution of entities here, this
2507 * will be handled later in xmlStringGetNodeList, unless it was
2508 * asked for ctxt->replaceEntities != 0
2509 *
2510 * Returns the AttValue parsed or NULL.
2511 */
2512
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002513static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002514htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2515 xmlChar *ret = NULL;
2516
2517 if (CUR == '"') {
2518 NEXT;
2519 ret = htmlParseHTMLAttribute(ctxt, '"');
2520 if (CUR != '"') {
2521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2522 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2523 ctxt->wellFormed = 0;
2524 } else
2525 NEXT;
2526 } else if (CUR == '\'') {
2527 NEXT;
2528 ret = htmlParseHTMLAttribute(ctxt, '\'');
2529 if (CUR != '\'') {
2530 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2531 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2532 ctxt->wellFormed = 0;
2533 } else
2534 NEXT;
2535 } else {
2536 /*
2537 * That's an HTMLism, the attribute value may not be quoted
2538 */
2539 ret = htmlParseHTMLAttribute(ctxt, 0);
2540 if (ret == NULL) {
2541 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2542 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2543 ctxt->wellFormed = 0;
2544 }
2545 }
2546 return(ret);
2547}
2548
2549/**
2550 * htmlParseSystemLiteral:
2551 * @ctxt: an HTML parser context
2552 *
2553 * parse an HTML Literal
2554 *
2555 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2556 *
2557 * Returns the SystemLiteral parsed or NULL
2558 */
2559
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002560static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002561htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2562 const xmlChar *q;
2563 xmlChar *ret = NULL;
2564
2565 if (CUR == '"') {
2566 NEXT;
2567 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002568 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002569 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002570 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002571 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2572 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2573 ctxt->wellFormed = 0;
2574 } else {
2575 ret = xmlStrndup(q, CUR_PTR - q);
2576 NEXT;
2577 }
2578 } else if (CUR == '\'') {
2579 NEXT;
2580 q = CUR_PTR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002581 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002582 NEXT;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002583 if (!IS_CHAR((unsigned int) CUR)) {
Owen Taylor3473f882001-02-23 17:55:21 +00002584 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2585 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2586 ctxt->wellFormed = 0;
2587 } else {
2588 ret = xmlStrndup(q, CUR_PTR - q);
2589 NEXT;
2590 }
2591 } else {
2592 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2593 ctxt->sax->error(ctxt->userData,
2594 "SystemLiteral \" or ' expected\n");
2595 ctxt->wellFormed = 0;
2596 }
2597
2598 return(ret);
2599}
2600
2601/**
2602 * htmlParsePubidLiteral:
2603 * @ctxt: an HTML parser context
2604 *
2605 * parse an HTML public literal
2606 *
2607 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2608 *
2609 * Returns the PubidLiteral parsed or NULL.
2610 */
2611
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002612static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002613htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2614 const xmlChar *q;
2615 xmlChar *ret = NULL;
2616 /*
2617 * Name ::= (Letter | '_') (NameChar)*
2618 */
2619 if (CUR == '"') {
2620 NEXT;
2621 q = CUR_PTR;
2622 while (IS_PUBIDCHAR(CUR)) NEXT;
2623 if (CUR != '"') {
2624 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2625 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2626 ctxt->wellFormed = 0;
2627 } else {
2628 ret = xmlStrndup(q, CUR_PTR - q);
2629 NEXT;
2630 }
2631 } else if (CUR == '\'') {
2632 NEXT;
2633 q = CUR_PTR;
Daniel Veillard6560a422003-03-27 21:25:38 +00002634 while ((IS_PUBIDCHAR(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002635 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002636 if (CUR != '\'') {
Owen Taylor3473f882001-02-23 17:55:21 +00002637 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2638 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2639 ctxt->wellFormed = 0;
2640 } else {
2641 ret = xmlStrndup(q, CUR_PTR - q);
2642 NEXT;
2643 }
2644 } else {
2645 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2646 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2647 ctxt->wellFormed = 0;
2648 }
2649
2650 return(ret);
2651}
2652
2653/**
2654 * htmlParseScript:
2655 * @ctxt: an HTML parser context
2656 *
2657 * parse the content of an HTML SCRIPT or STYLE element
2658 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2659 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2660 * http://www.w3.org/TR/html4/types.html#type-script
2661 * http://www.w3.org/TR/html4/types.html#h-6.15
2662 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2663 *
2664 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2665 * element and the value of intrinsic event attributes. User agents must
2666 * not evaluate script data as HTML markup but instead must pass it on as
2667 * data to a script engine.
2668 * NOTES:
2669 * - The content is passed like CDATA
2670 * - the attributes for style and scripting "onXXX" are also described
2671 * as CDATA but SGML allows entities references in attributes so their
2672 * processing is identical as other attributes
2673 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002674static void
Owen Taylor3473f882001-02-23 17:55:21 +00002675htmlParseScript(htmlParserCtxtPtr ctxt) {
2676 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2677 int nbchar = 0;
2678 xmlChar cur;
2679
2680 SHRINK;
2681 cur = CUR;
Daniel Veillard34ba3872003-07-15 13:34:05 +00002682 while (IS_CHAR((unsigned int) cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002683 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2684 (NXT(3) == '-')) {
2685 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2686 if (ctxt->sax->cdataBlock!= NULL) {
2687 /*
2688 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2689 */
2690 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002691 } else if (ctxt->sax->characters != NULL) {
2692 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002693 }
2694 }
2695 nbchar = 0;
2696 htmlParseComment(ctxt);
2697 cur = CUR;
2698 continue;
2699 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002700 /*
2701 * One should break here, the specification is clear:
2702 * Authors should therefore escape "</" within the content.
2703 * Escape mechanisms are specific to each scripting or
2704 * style sheet language.
2705 */
2706 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2707 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2708 break; /* while */
2709 }
2710 buf[nbchar++] = cur;
2711 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2712 if (ctxt->sax->cdataBlock!= NULL) {
2713 /*
2714 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2715 */
2716 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002717 } else if (ctxt->sax->characters != NULL) {
2718 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002719 }
2720 nbchar = 0;
2721 }
2722 NEXT;
2723 cur = CUR;
2724 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00002725 if (!(IS_CHAR((unsigned int) cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002726 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2727 ctxt->sax->error(ctxt->userData,
2728 "Invalid char in CDATA 0x%X\n", cur);
2729 ctxt->wellFormed = 0;
2730 NEXT;
2731 }
2732
2733 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2734 if (ctxt->sax->cdataBlock!= NULL) {
2735 /*
2736 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2737 */
2738 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002739 } else if (ctxt->sax->characters != NULL) {
2740 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002741 }
2742 }
2743}
2744
2745
2746/**
2747 * htmlParseCharData:
2748 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002749 *
2750 * parse a CharData section.
2751 * if we are within a CDATA section ']]>' marks an end of section.
2752 *
2753 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2754 */
2755
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002756static void
2757htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002758 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2759 int nbchar = 0;
2760 int cur, l;
2761
2762 SHRINK;
2763 cur = CUR_CHAR(l);
2764 while (((cur != '<') || (ctxt->token == '<')) &&
2765 ((cur != '&') || (ctxt->token == '&')) &&
2766 (IS_CHAR(cur))) {
2767 COPY_BUF(l,buf,nbchar,cur);
2768 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2769 /*
2770 * Ok the segment is to be consumed as chars.
2771 */
2772 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2773 if (areBlanks(ctxt, buf, nbchar)) {
2774 if (ctxt->sax->ignorableWhitespace != NULL)
2775 ctxt->sax->ignorableWhitespace(ctxt->userData,
2776 buf, nbchar);
2777 } else {
2778 htmlCheckParagraph(ctxt);
2779 if (ctxt->sax->characters != NULL)
2780 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2781 }
2782 }
2783 nbchar = 0;
2784 }
2785 NEXTL(l);
2786 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002787 if (cur == 0) {
2788 SHRINK;
2789 GROW;
2790 cur = CUR_CHAR(l);
2791 }
Owen Taylor3473f882001-02-23 17:55:21 +00002792 }
2793 if (nbchar != 0) {
2794 /*
2795 * Ok the segment is to be consumed as chars.
2796 */
2797 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2798 if (areBlanks(ctxt, buf, nbchar)) {
2799 if (ctxt->sax->ignorableWhitespace != NULL)
2800 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2801 } else {
2802 htmlCheckParagraph(ctxt);
2803 if (ctxt->sax->characters != NULL)
2804 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2805 }
2806 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002807 } else {
2808 /*
2809 * Loop detection
2810 */
2811 if (cur == 0)
2812 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002813 }
2814}
2815
2816/**
2817 * htmlParseExternalID:
2818 * @ctxt: an HTML parser context
2819 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002820 *
2821 * Parse an External ID or a Public ID
2822 *
Owen Taylor3473f882001-02-23 17:55:21 +00002823 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2824 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2825 *
2826 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2827 *
2828 * Returns the function returns SystemLiteral and in the second
2829 * case publicID receives PubidLiteral, is strict is off
2830 * it is possible to return NULL and have publicID set.
2831 */
2832
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002833static xmlChar *
2834htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002835 xmlChar *URI = NULL;
2836
2837 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2838 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2839 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2840 SKIP(6);
2841 if (!IS_BLANK(CUR)) {
2842 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2843 ctxt->sax->error(ctxt->userData,
2844 "Space required after 'SYSTEM'\n");
2845 ctxt->wellFormed = 0;
2846 }
2847 SKIP_BLANKS;
2848 URI = htmlParseSystemLiteral(ctxt);
2849 if (URI == NULL) {
2850 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2851 ctxt->sax->error(ctxt->userData,
2852 "htmlParseExternalID: SYSTEM, no URI\n");
2853 ctxt->wellFormed = 0;
2854 }
2855 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2856 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2857 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2858 SKIP(6);
2859 if (!IS_BLANK(CUR)) {
2860 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2861 ctxt->sax->error(ctxt->userData,
2862 "Space required after 'PUBLIC'\n");
2863 ctxt->wellFormed = 0;
2864 }
2865 SKIP_BLANKS;
2866 *publicID = htmlParsePubidLiteral(ctxt);
2867 if (*publicID == NULL) {
2868 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2869 ctxt->sax->error(ctxt->userData,
2870 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2871 ctxt->wellFormed = 0;
2872 }
2873 SKIP_BLANKS;
2874 if ((CUR == '"') || (CUR == '\'')) {
2875 URI = htmlParseSystemLiteral(ctxt);
2876 }
2877 }
2878 return(URI);
2879}
2880
2881/**
2882 * htmlParseComment:
2883 * @ctxt: an HTML parser context
2884 *
2885 * Parse an XML (SGML) comment <!-- .... -->
2886 *
2887 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2888 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002889static void
Owen Taylor3473f882001-02-23 17:55:21 +00002890htmlParseComment(htmlParserCtxtPtr ctxt) {
2891 xmlChar *buf = NULL;
2892 int len;
2893 int size = HTML_PARSER_BUFFER_SIZE;
2894 int q, ql;
2895 int r, rl;
2896 int cur, l;
2897 xmlParserInputState state;
2898
2899 /*
2900 * Check that there is a comment right here.
2901 */
2902 if ((RAW != '<') || (NXT(1) != '!') ||
2903 (NXT(2) != '-') || (NXT(3) != '-')) return;
2904
2905 state = ctxt->instate;
2906 ctxt->instate = XML_PARSER_COMMENT;
2907 SHRINK;
2908 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002909 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002910 if (buf == NULL) {
2911 xmlGenericError(xmlGenericErrorContext,
2912 "malloc of %d byte failed\n", size);
2913 ctxt->instate = state;
2914 return;
2915 }
2916 q = CUR_CHAR(ql);
2917 NEXTL(ql);
2918 r = CUR_CHAR(rl);
2919 NEXTL(rl);
2920 cur = CUR_CHAR(l);
2921 len = 0;
2922 while (IS_CHAR(cur) &&
2923 ((cur != '>') ||
2924 (r != '-') || (q != '-'))) {
2925 if (len + 5 >= size) {
2926 size *= 2;
2927 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2928 if (buf == NULL) {
2929 xmlGenericError(xmlGenericErrorContext,
2930 "realloc of %d byte failed\n", size);
2931 ctxt->instate = state;
2932 return;
2933 }
2934 }
2935 COPY_BUF(ql,buf,len,q);
2936 q = r;
2937 ql = rl;
2938 r = cur;
2939 rl = l;
2940 NEXTL(l);
2941 cur = CUR_CHAR(l);
2942 if (cur == 0) {
2943 SHRINK;
2944 GROW;
2945 cur = CUR_CHAR(l);
2946 }
2947 }
2948 buf[len] = 0;
2949 if (!IS_CHAR(cur)) {
2950 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2951 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2952 ctxt->sax->error(ctxt->userData,
2953 "Comment not terminated \n<!--%.50s\n", buf);
2954 ctxt->wellFormed = 0;
2955 xmlFree(buf);
2956 } else {
2957 NEXT;
2958 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2959 (!ctxt->disableSAX))
2960 ctxt->sax->comment(ctxt->userData, buf);
2961 xmlFree(buf);
2962 }
2963 ctxt->instate = state;
2964}
2965
2966/**
2967 * htmlParseCharRef:
2968 * @ctxt: an HTML parser context
2969 *
2970 * parse Reference declarations
2971 *
2972 * [66] CharRef ::= '&#' [0-9]+ ';' |
2973 * '&#x' [0-9a-fA-F]+ ';'
2974 *
2975 * Returns the value parsed (as an int)
2976 */
2977int
2978htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2979 int val = 0;
2980
2981 if ((CUR == '&') && (NXT(1) == '#') &&
2982 (NXT(2) == 'x')) {
2983 SKIP(3);
2984 while (CUR != ';') {
2985 if ((CUR >= '0') && (CUR <= '9'))
2986 val = val * 16 + (CUR - '0');
2987 else if ((CUR >= 'a') && (CUR <= 'f'))
2988 val = val * 16 + (CUR - 'a') + 10;
2989 else if ((CUR >= 'A') && (CUR <= 'F'))
2990 val = val * 16 + (CUR - 'A') + 10;
2991 else {
2992 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2993 ctxt->sax->error(ctxt->userData,
2994 "htmlParseCharRef: invalid hexadecimal value\n");
2995 ctxt->wellFormed = 0;
2996 return(0);
2997 }
2998 NEXT;
2999 }
3000 if (CUR == ';')
3001 NEXT;
3002 } else if ((CUR == '&') && (NXT(1) == '#')) {
3003 SKIP(2);
3004 while (CUR != ';') {
3005 if ((CUR >= '0') && (CUR <= '9'))
3006 val = val * 10 + (CUR - '0');
3007 else {
3008 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3009 ctxt->sax->error(ctxt->userData,
3010 "htmlParseCharRef: invalid decimal value\n");
3011 ctxt->wellFormed = 0;
3012 return(0);
3013 }
3014 NEXT;
3015 }
3016 if (CUR == ';')
3017 NEXT;
3018 } else {
3019 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3020 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
3021 ctxt->wellFormed = 0;
3022 }
3023 /*
3024 * Check the value IS_CHAR ...
3025 */
3026 if (IS_CHAR(val)) {
3027 return(val);
3028 } else {
3029 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3030 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
3031 val);
3032 ctxt->wellFormed = 0;
3033 }
3034 return(0);
3035}
3036
3037
3038/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003039 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003040 * @ctxt: an HTML parser context
3041 *
3042 * parse a DOCTYPE declaration
3043 *
3044 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3045 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3046 */
3047
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003048static void
Owen Taylor3473f882001-02-23 17:55:21 +00003049htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3050 xmlChar *name;
3051 xmlChar *ExternalID = NULL;
3052 xmlChar *URI = NULL;
3053
3054 /*
3055 * We know that '<!DOCTYPE' has been detected.
3056 */
3057 SKIP(9);
3058
3059 SKIP_BLANKS;
3060
3061 /*
3062 * Parse the DOCTYPE name.
3063 */
3064 name = htmlParseName(ctxt);
3065 if (name == NULL) {
3066 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3067 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
3068 ctxt->wellFormed = 0;
3069 }
3070 /*
3071 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3072 */
3073
3074 SKIP_BLANKS;
3075
3076 /*
3077 * Check for SystemID and ExternalID
3078 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003079 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003080 SKIP_BLANKS;
3081
3082 /*
3083 * We should be at the end of the DOCTYPE declaration.
3084 */
3085 if (CUR != '>') {
3086 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00003087 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003088 ctxt->wellFormed = 0;
3089 /* We shouldn't try to resynchronize ... */
3090 }
3091 NEXT;
3092
3093 /*
3094 * Create or update the document accordingly to the DOCTYPE
3095 */
3096 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3097 (!ctxt->disableSAX))
3098 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3099
3100 /*
3101 * Cleanup, since we don't use all those identifiers
3102 */
3103 if (URI != NULL) xmlFree(URI);
3104 if (ExternalID != NULL) xmlFree(ExternalID);
3105 if (name != NULL) xmlFree(name);
3106}
3107
3108/**
3109 * htmlParseAttribute:
3110 * @ctxt: an HTML parser context
3111 * @value: a xmlChar ** used to store the value of the attribute
3112 *
3113 * parse an attribute
3114 *
3115 * [41] Attribute ::= Name Eq AttValue
3116 *
3117 * [25] Eq ::= S? '=' S?
3118 *
3119 * With namespace:
3120 *
3121 * [NS 11] Attribute ::= QName Eq AttValue
3122 *
3123 * Also the case QName == xmlns:??? is handled independently as a namespace
3124 * definition.
3125 *
3126 * Returns the attribute name, and the value in *value.
3127 */
3128
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003129static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003130htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3131 xmlChar *name, *val = NULL;
3132
3133 *value = NULL;
3134 name = htmlParseHTMLName(ctxt);
3135 if (name == NULL) {
3136 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3137 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3138 ctxt->wellFormed = 0;
3139 return(NULL);
3140 }
3141
3142 /*
3143 * read the value
3144 */
3145 SKIP_BLANKS;
3146 if (CUR == '=') {
3147 NEXT;
3148 SKIP_BLANKS;
3149 val = htmlParseAttValue(ctxt);
3150 /******
3151 } else {
3152 * TODO : some attribute must have values, some may not
3153 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3154 ctxt->sax->warning(ctxt->userData,
3155 "No value for attribute %s\n", name); */
3156 }
3157
3158 *value = val;
3159 return(name);
3160}
3161
3162/**
3163 * htmlCheckEncoding:
3164 * @ctxt: an HTML parser context
3165 * @attvalue: the attribute value
3166 *
3167 * Checks an http-equiv attribute from a Meta tag to detect
3168 * the encoding
3169 * If a new encoding is detected the parser is switched to decode
3170 * it and pass UTF8
3171 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003172static void
Owen Taylor3473f882001-02-23 17:55:21 +00003173htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3174 const xmlChar *encoding;
3175
3176 if ((ctxt == NULL) || (attvalue == NULL))
3177 return;
3178
3179 /* do not change encoding */
3180 if (ctxt->input->encoding != NULL)
3181 return;
3182
3183 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3184 if (encoding != NULL) {
3185 encoding += 8;
3186 } else {
3187 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3188 if (encoding != NULL)
3189 encoding += 9;
3190 }
3191 if (encoding != NULL) {
3192 xmlCharEncoding enc;
3193 xmlCharEncodingHandlerPtr handler;
3194
3195 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3196
3197 if (ctxt->input->encoding != NULL)
3198 xmlFree((xmlChar *) ctxt->input->encoding);
3199 ctxt->input->encoding = xmlStrdup(encoding);
3200
3201 enc = xmlParseCharEncoding((const char *) encoding);
3202 /*
3203 * registered set of known encodings
3204 */
3205 if (enc != XML_CHAR_ENCODING_ERROR) {
3206 xmlSwitchEncoding(ctxt, enc);
3207 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3208 } else {
3209 /*
3210 * fallback for unknown encodings
3211 */
3212 handler = xmlFindCharEncodingHandler((const char *) encoding);
3213 if (handler != NULL) {
3214 xmlSwitchToEncoding(ctxt, handler);
3215 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3216 } else {
3217 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3218 }
3219 }
3220
3221 if ((ctxt->input->buf != NULL) &&
3222 (ctxt->input->buf->encoder != NULL) &&
3223 (ctxt->input->buf->raw != NULL) &&
3224 (ctxt->input->buf->buffer != NULL)) {
3225 int nbchars;
3226 int processed;
3227
3228 /*
3229 * convert as much as possible to the parser reading buffer.
3230 */
3231 processed = ctxt->input->cur - ctxt->input->base;
3232 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3233 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3234 ctxt->input->buf->buffer,
3235 ctxt->input->buf->raw);
3236 if (nbchars < 0) {
3237 ctxt->errNo = XML_ERR_INVALID_ENCODING;
3238 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3239 ctxt->sax->error(ctxt->userData,
3240 "htmlCheckEncoding: encoder error\n");
3241 }
3242 ctxt->input->base =
3243 ctxt->input->cur = ctxt->input->buf->buffer->content;
3244 }
3245 }
3246}
3247
3248/**
3249 * htmlCheckMeta:
3250 * @ctxt: an HTML parser context
3251 * @atts: the attributes values
3252 *
3253 * Checks an attributes from a Meta tag
3254 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003255static void
Owen Taylor3473f882001-02-23 17:55:21 +00003256htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3257 int i;
3258 const xmlChar *att, *value;
3259 int http = 0;
3260 const xmlChar *content = NULL;
3261
3262 if ((ctxt == NULL) || (atts == NULL))
3263 return;
3264
3265 i = 0;
3266 att = atts[i++];
3267 while (att != NULL) {
3268 value = atts[i++];
3269 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3270 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3271 http = 1;
3272 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3273 content = value;
3274 att = atts[i++];
3275 }
3276 if ((http) && (content != NULL))
3277 htmlCheckEncoding(ctxt, content);
3278
3279}
3280
3281/**
3282 * htmlParseStartTag:
3283 * @ctxt: an HTML parser context
3284 *
3285 * parse a start of tag either for rule element or
3286 * EmptyElement. In both case we don't parse the tag closing chars.
3287 *
3288 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3289 *
3290 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3291 *
3292 * With namespace:
3293 *
3294 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3295 *
3296 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3297 *
3298 */
3299
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003300static void
Owen Taylor3473f882001-02-23 17:55:21 +00003301htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3302 xmlChar *name;
3303 xmlChar *attname;
3304 xmlChar *attvalue;
3305 const xmlChar **atts = NULL;
3306 int nbatts = 0;
3307 int maxatts = 0;
3308 int meta = 0;
3309 int i;
3310
3311 if (CUR != '<') return;
3312 NEXT;
3313
3314 GROW;
3315 name = htmlParseHTMLName(ctxt);
3316 if (name == NULL) {
3317 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3318 ctxt->sax->error(ctxt->userData,
3319 "htmlParseStartTag: invalid element name\n");
3320 ctxt->wellFormed = 0;
3321 /* Dump the bogus tag like browsers do */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003322 while ((IS_CHAR((unsigned int) CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003323 NEXT;
3324 return;
3325 }
3326 if (xmlStrEqual(name, BAD_CAST"meta"))
3327 meta = 1;
3328
3329 /*
3330 * Check for auto-closure of HTML elements.
3331 */
3332 htmlAutoClose(ctxt, name);
3333
3334 /*
3335 * Check for implied HTML elements.
3336 */
3337 htmlCheckImplied(ctxt, name);
3338
3339 /*
3340 * Avoid html at any level > 0, head at any level != 1
3341 * or any attempt to recurse body
3342 */
3343 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3344 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3345 ctxt->sax->error(ctxt->userData,
3346 "htmlParseStartTag: misplaced <html> tag\n");
3347 ctxt->wellFormed = 0;
3348 xmlFree(name);
3349 return;
3350 }
3351 if ((ctxt->nameNr != 1) &&
3352 (xmlStrEqual(name, BAD_CAST"head"))) {
3353 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3354 ctxt->sax->error(ctxt->userData,
3355 "htmlParseStartTag: misplaced <head> tag\n");
3356 ctxt->wellFormed = 0;
3357 xmlFree(name);
3358 return;
3359 }
3360 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003361 int indx;
3362 for (indx = 0;indx < ctxt->nameNr;indx++) {
3363 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003364 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3365 ctxt->sax->error(ctxt->userData,
3366 "htmlParseStartTag: misplaced <body> tag\n");
3367 ctxt->wellFormed = 0;
3368 xmlFree(name);
3369 return;
3370 }
3371 }
3372 }
3373
3374 /*
3375 * Now parse the attributes, it ends up with the ending
3376 *
3377 * (S Attribute)* S?
3378 */
3379 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003380 while ((IS_CHAR((unsigned int) CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003381 (CUR != '>') &&
3382 ((CUR != '/') || (NXT(1) != '>'))) {
3383 long cons = ctxt->nbChars;
3384
3385 GROW;
3386 attname = htmlParseAttribute(ctxt, &attvalue);
3387 if (attname != NULL) {
3388
3389 /*
3390 * Well formedness requires at most one declaration of an attribute
3391 */
3392 for (i = 0; i < nbatts;i += 2) {
3393 if (xmlStrEqual(atts[i], attname)) {
3394 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3395 ctxt->sax->error(ctxt->userData,
3396 "Attribute %s redefined\n",
3397 attname);
3398 ctxt->wellFormed = 0;
3399 xmlFree(attname);
3400 if (attvalue != NULL)
3401 xmlFree(attvalue);
3402 goto failed;
3403 }
3404 }
3405
3406 /*
3407 * Add the pair to atts
3408 */
3409 if (atts == NULL) {
3410 maxatts = 10;
3411 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3412 if (atts == NULL) {
3413 xmlGenericError(xmlGenericErrorContext,
3414 "malloc of %ld byte failed\n",
3415 maxatts * (long)sizeof(xmlChar *));
3416 if (name != NULL) xmlFree(name);
3417 return;
3418 }
3419 } else if (nbatts + 4 > maxatts) {
3420 maxatts *= 2;
3421 atts = (const xmlChar **) xmlRealloc((void *) atts,
3422 maxatts * sizeof(xmlChar *));
3423 if (atts == NULL) {
3424 xmlGenericError(xmlGenericErrorContext,
3425 "realloc of %ld byte failed\n",
3426 maxatts * (long)sizeof(xmlChar *));
3427 if (name != NULL) xmlFree(name);
3428 return;
3429 }
3430 }
3431 atts[nbatts++] = attname;
3432 atts[nbatts++] = attvalue;
3433 atts[nbatts] = NULL;
3434 atts[nbatts + 1] = NULL;
3435 }
3436 else {
3437 /* Dump the bogus attribute string up to the next blank or
3438 * the end of the tag. */
Daniel Veillard34ba3872003-07-15 13:34:05 +00003439 while ((IS_CHAR((unsigned int) CUR)) &&
3440 !(IS_BLANK(CUR)) && (CUR != '>') &&
3441 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003442 NEXT;
3443 }
3444
3445failed:
3446 SKIP_BLANKS;
3447 if (cons == ctxt->nbChars) {
3448 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3449 ctxt->sax->error(ctxt->userData,
3450 "htmlParseStartTag: problem parsing attributes\n");
3451 ctxt->wellFormed = 0;
3452 break;
3453 }
3454 }
3455
3456 /*
3457 * Handle specific association to the META tag
3458 */
3459 if (meta)
3460 htmlCheckMeta(ctxt, atts);
3461
3462 /*
3463 * SAX: Start of Element !
3464 */
3465 htmlnamePush(ctxt, xmlStrdup(name));
3466#ifdef DEBUG
3467 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3468#endif
3469 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3470 ctxt->sax->startElement(ctxt->userData, name, atts);
3471
3472 if (atts != NULL) {
3473 for (i = 0;i < nbatts;i++) {
3474 if (atts[i] != NULL)
3475 xmlFree((xmlChar *) atts[i]);
3476 }
3477 xmlFree((void *) atts);
3478 }
3479 if (name != NULL) xmlFree(name);
3480}
3481
3482/**
3483 * htmlParseEndTag:
3484 * @ctxt: an HTML parser context
3485 *
3486 * parse an end of tag
3487 *
3488 * [42] ETag ::= '</' Name S? '>'
3489 *
3490 * With namespace
3491 *
3492 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003493 *
3494 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003495 */
3496
Daniel Veillardf420ac52001-07-04 16:04:09 +00003497static int
Owen Taylor3473f882001-02-23 17:55:21 +00003498htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3499 xmlChar *name;
3500 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003501 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003502
3503 if ((CUR != '<') || (NXT(1) != '/')) {
3504 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3505 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3506 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003507 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003508 }
3509 SKIP(2);
3510
3511 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003512 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003513
3514 /*
3515 * We should definitely be at the ending "S? '>'" part
3516 */
3517 SKIP_BLANKS;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003518 if ((!IS_CHAR((unsigned int) CUR)) || (CUR != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003519 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3520 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3521 ctxt->wellFormed = 0;
3522 } else
3523 NEXT;
3524
3525 /*
3526 * If the name read is not one of the element in the parsing stack
3527 * then return, it's just an error.
3528 */
3529 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3530 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3531 }
3532 if (i < 0) {
3533 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3534 ctxt->sax->error(ctxt->userData,
3535 "Unexpected end tag : %s\n", name);
3536 xmlFree(name);
3537 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003538 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003539 }
3540
3541
3542 /*
3543 * Check for auto-closure of HTML elements.
3544 */
3545
3546 htmlAutoCloseOnClose(ctxt, name);
3547
3548 /*
3549 * Well formedness constraints, opening and closing must match.
3550 * With the exception that the autoclose may have popped stuff out
3551 * of the stack.
3552 */
3553 if (!xmlStrEqual(name, ctxt->name)) {
3554#ifdef DEBUG
3555 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3556#endif
3557 if ((ctxt->name != NULL) &&
3558 (!xmlStrEqual(ctxt->name, name))) {
3559 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3560 ctxt->sax->error(ctxt->userData,
3561 "Opening and ending tag mismatch: %s and %s\n",
3562 name, ctxt->name);
3563 ctxt->wellFormed = 0;
3564 }
3565 }
3566
3567 /*
3568 * SAX: End of Tag
3569 */
3570 oldname = ctxt->name;
3571 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3572 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3573 ctxt->sax->endElement(ctxt->userData, name);
3574 oldname = htmlnamePop(ctxt);
3575 if (oldname != NULL) {
3576#ifdef DEBUG
3577 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3578#endif
3579 xmlFree(oldname);
3580#ifdef DEBUG
3581 } else {
3582 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3583#endif
3584 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003585 ret = 1;
3586 } else {
3587 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003588 }
3589
3590 if (name != NULL)
3591 xmlFree(name);
3592
Daniel Veillardf420ac52001-07-04 16:04:09 +00003593 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003594}
3595
3596
3597/**
3598 * htmlParseReference:
3599 * @ctxt: an HTML parser context
3600 *
3601 * parse and handle entity references in content,
3602 * this will end-up in a call to character() since this is either a
3603 * CharRef, or a predefined entity.
3604 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003605static void
Owen Taylor3473f882001-02-23 17:55:21 +00003606htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003607 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003608 xmlChar out[6];
3609 xmlChar *name;
3610 if (CUR != '&') return;
3611
3612 if (NXT(1) == '#') {
3613 unsigned int c;
3614 int bits, i = 0;
3615
3616 c = htmlParseCharRef(ctxt);
3617 if (c == 0)
3618 return;
3619
3620 if (c < 0x80) { out[i++]= c; bits= -6; }
3621 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3622 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3623 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3624
3625 for ( ; bits >= 0; bits-= 6) {
3626 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3627 }
3628 out[i] = 0;
3629
3630 htmlCheckParagraph(ctxt);
3631 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3632 ctxt->sax->characters(ctxt->userData, out, i);
3633 } else {
3634 ent = htmlParseEntityRef(ctxt, &name);
3635 if (name == NULL) {
3636 htmlCheckParagraph(ctxt);
3637 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3638 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3639 return;
3640 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003641 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003642 htmlCheckParagraph(ctxt);
3643 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3644 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3645 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3646 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3647 }
3648 } else {
3649 unsigned int c;
3650 int bits, i = 0;
3651
3652 c = ent->value;
3653 if (c < 0x80)
3654 { out[i++]= c; bits= -6; }
3655 else if (c < 0x800)
3656 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3657 else if (c < 0x10000)
3658 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3659 else
3660 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3661
3662 for ( ; bits >= 0; bits-= 6) {
3663 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3664 }
3665 out[i] = 0;
3666
3667 htmlCheckParagraph(ctxt);
3668 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3669 ctxt->sax->characters(ctxt->userData, out, i);
3670 }
3671 xmlFree(name);
3672 }
3673}
3674
3675/**
3676 * htmlParseContent:
3677 * @ctxt: an HTML parser context
3678 * @name: the node name
3679 *
3680 * Parse a content: comment, sub-element, reference or text.
3681 *
3682 */
3683
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003684static void
Owen Taylor3473f882001-02-23 17:55:21 +00003685htmlParseContent(htmlParserCtxtPtr ctxt) {
3686 xmlChar *currentNode;
3687 int depth;
3688
3689 currentNode = xmlStrdup(ctxt->name);
3690 depth = ctxt->nameNr;
3691 while (1) {
3692 long cons = ctxt->nbChars;
3693
3694 GROW;
3695 /*
3696 * Our tag or one of it's parent or children is ending.
3697 */
3698 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003699 if (htmlParseEndTag(ctxt) &&
3700 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3701 if (currentNode != NULL)
3702 xmlFree(currentNode);
3703 return;
3704 }
3705 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003706 }
3707
3708 /*
3709 * Has this node been popped out during parsing of
3710 * the next element
3711 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003712 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3713 (!xmlStrEqual(currentNode, ctxt->name)))
3714 {
Owen Taylor3473f882001-02-23 17:55:21 +00003715 if (currentNode != NULL) xmlFree(currentNode);
3716 return;
3717 }
3718
Daniel Veillardf9533d12001-03-03 10:04:57 +00003719 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3720 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003721 /*
3722 * Handle SCRIPT/STYLE separately
3723 */
3724 htmlParseScript(ctxt);
3725 } else {
3726 /*
3727 * Sometimes DOCTYPE arrives in the middle of the document
3728 */
3729 if ((CUR == '<') && (NXT(1) == '!') &&
3730 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3731 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3732 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3733 (UPP(8) == 'E')) {
3734 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3735 ctxt->sax->error(ctxt->userData,
3736 "Misplaced DOCTYPE declaration\n");
3737 ctxt->wellFormed = 0;
3738 htmlParseDocTypeDecl(ctxt);
3739 }
3740
3741 /*
3742 * First case : a comment
3743 */
3744 if ((CUR == '<') && (NXT(1) == '!') &&
3745 (NXT(2) == '-') && (NXT(3) == '-')) {
3746 htmlParseComment(ctxt);
3747 }
3748
3749 /*
3750 * Second case : a sub-element.
3751 */
3752 else if (CUR == '<') {
3753 htmlParseElement(ctxt);
3754 }
3755
3756 /*
3757 * Third case : a reference. If if has not been resolved,
3758 * parsing returns it's Name, create the node
3759 */
3760 else if (CUR == '&') {
3761 htmlParseReference(ctxt);
3762 }
3763
3764 /*
3765 * Fourth : end of the resource
3766 */
3767 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003768 htmlAutoCloseOnEnd(ctxt);
3769 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003770 }
3771
3772 /*
3773 * Last case, text. Note that References are handled directly.
3774 */
3775 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003776 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003777 }
3778
3779 if (cons == ctxt->nbChars) {
3780 if (ctxt->node != NULL) {
3781 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3782 ctxt->sax->error(ctxt->userData,
3783 "detected an error in element content\n");
3784 ctxt->wellFormed = 0;
3785 }
3786 break;
3787 }
3788 }
3789 GROW;
3790 }
3791 if (currentNode != NULL) xmlFree(currentNode);
3792}
3793
3794/**
3795 * htmlParseElement:
3796 * @ctxt: an HTML parser context
3797 *
3798 * parse an HTML element, this is highly recursive
3799 *
3800 * [39] element ::= EmptyElemTag | STag content ETag
3801 *
3802 * [41] Attribute ::= Name Eq AttValue
3803 */
3804
3805void
3806htmlParseElement(htmlParserCtxtPtr ctxt) {
3807 xmlChar *name;
3808 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003809 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003810 htmlParserNodeInfo node_info;
3811 xmlChar *oldname;
3812 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003813 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003814
3815 /* Capture start position */
3816 if (ctxt->record_info) {
3817 node_info.begin_pos = ctxt->input->consumed +
3818 (CUR_PTR - ctxt->input->base);
3819 node_info.begin_line = ctxt->input->line;
3820 }
3821
3822 oldname = xmlStrdup(ctxt->name);
3823 htmlParseStartTag(ctxt);
3824 name = ctxt->name;
3825#ifdef DEBUG
3826 if (oldname == NULL)
3827 xmlGenericError(xmlGenericErrorContext,
3828 "Start of element %s\n", name);
3829 else if (name == NULL)
3830 xmlGenericError(xmlGenericErrorContext,
3831 "Start of element failed, was %s\n", oldname);
3832 else
3833 xmlGenericError(xmlGenericErrorContext,
3834 "Start of element %s, was %s\n", name, oldname);
3835#endif
3836 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3837 (name == NULL)) {
3838 if (CUR == '>')
3839 NEXT;
3840 if (oldname != NULL)
3841 xmlFree(oldname);
3842 return;
3843 }
3844 if (oldname != NULL)
3845 xmlFree(oldname);
3846
3847 /*
3848 * Lookup the info for that element.
3849 */
3850 info = htmlTagLookup(name);
3851 if (info == NULL) {
3852 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3853 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3854 name);
3855 ctxt->wellFormed = 0;
3856 } else if (info->depr) {
3857/***************************
3858 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3859 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3860 name);
3861 ***************************/
3862 }
3863
3864 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003865 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003866 */
3867 if ((CUR == '/') && (NXT(1) == '>')) {
3868 SKIP(2);
3869 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3870 ctxt->sax->endElement(ctxt->userData, name);
3871 oldname = htmlnamePop(ctxt);
3872#ifdef DEBUG
3873 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3874#endif
3875 if (oldname != NULL)
3876 xmlFree(oldname);
3877 return;
3878 }
3879
3880 if (CUR == '>') {
3881 NEXT;
3882 } else {
3883 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3884 ctxt->sax->error(ctxt->userData,
3885 "Couldn't find end of Start Tag %s\n",
3886 name);
3887 ctxt->wellFormed = 0;
3888
3889 /*
3890 * end of parsing of this node.
3891 */
3892 if (xmlStrEqual(name, ctxt->name)) {
3893 nodePop(ctxt);
3894 oldname = htmlnamePop(ctxt);
3895#ifdef DEBUG
3896 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3897#endif
3898 if (oldname != NULL)
3899 xmlFree(oldname);
3900 }
3901
3902 /*
3903 * Capture end position and add node
3904 */
3905 if ( currentNode != NULL && ctxt->record_info ) {
3906 node_info.end_pos = ctxt->input->consumed +
3907 (CUR_PTR - ctxt->input->base);
3908 node_info.end_line = ctxt->input->line;
3909 node_info.node = ctxt->node;
3910 xmlParserAddNodeInfo(ctxt, &node_info);
3911 }
3912 return;
3913 }
3914
3915 /*
3916 * Check for an Empty Element from DTD definition
3917 */
3918 if ((info != NULL) && (info->empty)) {
3919 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3920 ctxt->sax->endElement(ctxt->userData, name);
3921 oldname = htmlnamePop(ctxt);
3922#ifdef DEBUG
3923 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3924#endif
3925 if (oldname != NULL)
3926 xmlFree(oldname);
3927 return;
3928 }
3929
3930 /*
3931 * Parse the content of the element:
3932 */
3933 currentNode = xmlStrdup(ctxt->name);
3934 depth = ctxt->nameNr;
Daniel Veillard34ba3872003-07-15 13:34:05 +00003935 while (IS_CHAR((unsigned int) CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003936 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003937 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003938 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003939 if (ctxt->nameNr < depth) break;
3940 }
3941
Owen Taylor3473f882001-02-23 17:55:21 +00003942 /*
3943 * Capture end position and add node
3944 */
3945 if ( currentNode != NULL && ctxt->record_info ) {
3946 node_info.end_pos = ctxt->input->consumed +
3947 (CUR_PTR - ctxt->input->base);
3948 node_info.end_line = ctxt->input->line;
3949 node_info.node = ctxt->node;
3950 xmlParserAddNodeInfo(ctxt, &node_info);
3951 }
Daniel Veillard34ba3872003-07-15 13:34:05 +00003952 if (!IS_CHAR((unsigned int) CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003953 htmlAutoCloseOnEnd(ctxt);
3954 }
3955
Owen Taylor3473f882001-02-23 17:55:21 +00003956 if (currentNode != NULL)
3957 xmlFree(currentNode);
3958}
3959
3960/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003961 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003962 * @ctxt: an HTML parser context
3963 *
3964 * parse an HTML document (and build a tree if using the standard SAX
3965 * interface).
3966 *
3967 * Returns 0, -1 in case of error. the parser context is augmented
3968 * as a result of the parsing.
3969 */
3970
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003971int
Owen Taylor3473f882001-02-23 17:55:21 +00003972htmlParseDocument(htmlParserCtxtPtr ctxt) {
3973 xmlDtdPtr dtd;
3974
Daniel Veillardd0463562001-10-13 09:15:48 +00003975 xmlInitParser();
3976
Owen Taylor3473f882001-02-23 17:55:21 +00003977 htmlDefaultSAXHandlerInit();
3978 ctxt->html = 1;
3979
3980 GROW;
3981 /*
3982 * SAX: beginning of the document processing.
3983 */
3984 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3985 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3986
3987 /*
3988 * Wipe out everything which is before the first '<'
3989 */
3990 SKIP_BLANKS;
3991 if (CUR == 0) {
3992 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3993 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3994 ctxt->wellFormed = 0;
3995 }
3996
3997 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3998 ctxt->sax->startDocument(ctxt->userData);
3999
4000
4001 /*
4002 * Parse possible comments before any content
4003 */
4004 while ((CUR == '<') && (NXT(1) == '!') &&
4005 (NXT(2) == '-') && (NXT(3) == '-')) {
4006 htmlParseComment(ctxt);
4007 SKIP_BLANKS;
4008 }
4009
4010
4011 /*
4012 * Then possibly doc type declaration(s) and more Misc
4013 * (doctypedecl Misc*)?
4014 */
4015 if ((CUR == '<') && (NXT(1) == '!') &&
4016 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4017 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4018 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4019 (UPP(8) == 'E')) {
4020 htmlParseDocTypeDecl(ctxt);
4021 }
4022 SKIP_BLANKS;
4023
4024 /*
4025 * Parse possible comments before any content
4026 */
4027 while ((CUR == '<') && (NXT(1) == '!') &&
4028 (NXT(2) == '-') && (NXT(3) == '-')) {
4029 htmlParseComment(ctxt);
4030 SKIP_BLANKS;
4031 }
4032
4033 /*
4034 * Time to start parsing the tree itself
4035 */
4036 htmlParseContent(ctxt);
4037
4038 /*
4039 * autoclose
4040 */
4041 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004042 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004043
4044
4045 /*
4046 * SAX: end of the document processing.
4047 */
4048 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4049 ctxt->sax->endDocument(ctxt->userData);
4050
4051 if (ctxt->myDoc != NULL) {
4052 dtd = xmlGetIntSubset(ctxt->myDoc);
4053 if (dtd == NULL)
4054 ctxt->myDoc->intSubset =
4055 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4056 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4057 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4058 }
4059 if (! ctxt->wellFormed) return(-1);
4060 return(0);
4061}
4062
4063
4064/************************************************************************
4065 * *
4066 * Parser contexts handling *
4067 * *
4068 ************************************************************************/
4069
4070/**
4071 * xmlInitParserCtxt:
4072 * @ctxt: an HTML parser context
4073 *
4074 * Initialize a parser context
4075 */
4076
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004077static void
Owen Taylor3473f882001-02-23 17:55:21 +00004078htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4079{
4080 htmlSAXHandler *sax;
4081
4082 if (ctxt == NULL) return;
4083 memset(ctxt, 0, sizeof(htmlParserCtxt));
4084
4085 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4086 if (sax == NULL) {
4087 xmlGenericError(xmlGenericErrorContext,
4088 "htmlInitParserCtxt: out of memory\n");
4089 }
4090 else
4091 memset(sax, 0, sizeof(htmlSAXHandler));
4092
4093 /* Allocate the Input stack */
4094 ctxt->inputTab = (htmlParserInputPtr *)
4095 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4096 if (ctxt->inputTab == NULL) {
4097 xmlGenericError(xmlGenericErrorContext,
4098 "htmlInitParserCtxt: out of memory\n");
4099 ctxt->inputNr = 0;
4100 ctxt->inputMax = 0;
4101 ctxt->input = NULL;
4102 return;
4103 }
4104 ctxt->inputNr = 0;
4105 ctxt->inputMax = 5;
4106 ctxt->input = NULL;
4107 ctxt->version = NULL;
4108 ctxt->encoding = NULL;
4109 ctxt->standalone = -1;
4110 ctxt->instate = XML_PARSER_START;
4111
4112 /* Allocate the Node stack */
4113 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4114 if (ctxt->nodeTab == NULL) {
4115 xmlGenericError(xmlGenericErrorContext,
4116 "htmlInitParserCtxt: out of memory\n");
4117 ctxt->nodeNr = 0;
4118 ctxt->nodeMax = 0;
4119 ctxt->node = NULL;
4120 ctxt->inputNr = 0;
4121 ctxt->inputMax = 0;
4122 ctxt->input = NULL;
4123 return;
4124 }
4125 ctxt->nodeNr = 0;
4126 ctxt->nodeMax = 10;
4127 ctxt->node = NULL;
4128
4129 /* Allocate the Name stack */
4130 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4131 if (ctxt->nameTab == NULL) {
4132 xmlGenericError(xmlGenericErrorContext,
4133 "htmlInitParserCtxt: out of memory\n");
4134 ctxt->nameNr = 0;
4135 ctxt->nameMax = 10;
4136 ctxt->name = NULL;
4137 ctxt->nodeNr = 0;
4138 ctxt->nodeMax = 0;
4139 ctxt->node = NULL;
4140 ctxt->inputNr = 0;
4141 ctxt->inputMax = 0;
4142 ctxt->input = NULL;
4143 return;
4144 }
4145 ctxt->nameNr = 0;
4146 ctxt->nameMax = 10;
4147 ctxt->name = NULL;
4148
4149 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
4150 else {
4151 ctxt->sax = sax;
4152 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
4153 }
4154 ctxt->userData = ctxt;
4155 ctxt->myDoc = NULL;
4156 ctxt->wellFormed = 1;
4157 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004158 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004159 ctxt->html = 1;
4160 ctxt->record_info = 0;
4161 ctxt->validate = 0;
4162 ctxt->nbChars = 0;
4163 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004164 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004165 xmlInitNodeInfoSeq(&ctxt->node_seq);
4166}
4167
4168/**
4169 * htmlFreeParserCtxt:
4170 * @ctxt: an HTML parser context
4171 *
4172 * Free all the memory used by a parser context. However the parsed
4173 * document in ctxt->myDoc is not freed.
4174 */
4175
4176void
4177htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4178{
4179 xmlFreeParserCtxt(ctxt);
4180}
4181
4182/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004183 * htmlNewParserCtxt:
4184 *
4185 * Allocate and initialize a new parser context.
4186 *
4187 * Returns the xmlParserCtxtPtr or NULL
4188 */
4189
4190static htmlParserCtxtPtr
4191htmlNewParserCtxt(void)
4192{
4193 xmlParserCtxtPtr ctxt;
4194
4195 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4196 if (ctxt == NULL) {
4197 xmlGenericError(xmlGenericErrorContext,
4198 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004199 return(NULL);
4200 }
4201 memset(ctxt, 0, sizeof(xmlParserCtxt));
4202 htmlInitParserCtxt(ctxt);
4203 return(ctxt);
4204}
4205
4206/**
4207 * htmlCreateMemoryParserCtxt:
4208 * @buffer: a pointer to a char array
4209 * @size: the size of the array
4210 *
4211 * Create a parser context for an HTML in-memory document.
4212 *
4213 * Returns the new parser context or NULL
4214 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004215htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004216htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4217 xmlParserCtxtPtr ctxt;
4218 xmlParserInputPtr input;
4219 xmlParserInputBufferPtr buf;
4220
4221 if (buffer == NULL)
4222 return(NULL);
4223 if (size <= 0)
4224 return(NULL);
4225
4226 ctxt = htmlNewParserCtxt();
4227 if (ctxt == NULL)
4228 return(NULL);
4229
4230 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4231 if (buf == NULL) return(NULL);
4232
4233 input = xmlNewInputStream(ctxt);
4234 if (input == NULL) {
4235 xmlFreeParserCtxt(ctxt);
4236 return(NULL);
4237 }
4238
4239 input->filename = NULL;
4240 input->buf = buf;
4241 input->base = input->buf->buffer->content;
4242 input->cur = input->buf->buffer->content;
4243 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4244
4245 inputPush(ctxt, input);
4246 return(ctxt);
4247}
4248
4249/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004250 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004251 * @cur: a pointer to an array of xmlChar
4252 * @encoding: a free form C string describing the HTML document encoding, or NULL
4253 *
4254 * Create a parser context for an HTML document.
4255 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004256 * TODO: check the need to add encoding handling there
4257 *
Owen Taylor3473f882001-02-23 17:55:21 +00004258 * Returns the new parser context or NULL
4259 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004260static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004261htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004262 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004263 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004264
Daniel Veillard1d995272002-07-22 16:43:32 +00004265 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004266 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004267 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004268 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4269
4270 if (encoding != NULL) {
4271 xmlCharEncoding enc;
4272 xmlCharEncodingHandlerPtr handler;
4273
4274 if (ctxt->input->encoding != NULL)
4275 xmlFree((xmlChar *) ctxt->input->encoding);
4276 ctxt->input->encoding = (const xmlChar *) encoding;
4277
4278 enc = xmlParseCharEncoding(encoding);
4279 /*
4280 * registered set of known encodings
4281 */
4282 if (enc != XML_CHAR_ENCODING_ERROR) {
4283 xmlSwitchEncoding(ctxt, enc);
4284 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4285 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4286 ctxt->sax->error(ctxt->userData,
4287 "Unsupported encoding %s\n", encoding);
4288 ctxt->input->encoding = NULL;
4289 }
4290 } else {
4291 /*
4292 * fallback for unknown encodings
4293 */
4294 handler = xmlFindCharEncodingHandler((const char *) encoding);
4295 if (handler != NULL) {
4296 xmlSwitchToEncoding(ctxt, handler);
4297 } else {
4298 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
4299 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4300 ctxt->sax->error(ctxt->userData,
4301 "Unsupported encoding %s\n", encoding);
4302 }
4303 }
4304 }
4305 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004306}
4307
4308/************************************************************************
4309 * *
4310 * Progressive parsing interfaces *
4311 * *
4312 ************************************************************************/
4313
4314/**
4315 * htmlParseLookupSequence:
4316 * @ctxt: an HTML parser context
4317 * @first: the first char to lookup
4318 * @next: the next char to lookup or zero
4319 * @third: the next char to lookup or zero
4320 *
4321 * Try to find if a sequence (first, next, third) or just (first next) or
4322 * (first) is available in the input stream.
4323 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4324 * to avoid rescanning sequences of bytes, it DOES change the state of the
4325 * parser, do not use liberally.
4326 * This is basically similar to xmlParseLookupSequence()
4327 *
4328 * Returns the index to the current parsing point if the full sequence
4329 * is available, -1 otherwise.
4330 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004331static int
Owen Taylor3473f882001-02-23 17:55:21 +00004332htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4333 xmlChar next, xmlChar third) {
4334 int base, len;
4335 htmlParserInputPtr in;
4336 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004337 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004338
4339 in = ctxt->input;
4340 if (in == NULL) return(-1);
4341 base = in->cur - in->base;
4342 if (base < 0) return(-1);
4343 if (ctxt->checkIndex > base)
4344 base = ctxt->checkIndex;
4345 if (in->buf == NULL) {
4346 buf = in->base;
4347 len = in->length;
4348 } else {
4349 buf = in->buf->buffer->content;
4350 len = in->buf->buffer->use;
4351 }
4352 /* take into account the sequence length */
4353 if (third) len -= 2;
4354 else if (next) len --;
4355 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004356 if (!incomment && (base + 4 < len)) {
4357 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4358 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4359 incomment = 1;
4360 }
4361 /* do not increment base, some people use <!--> */
4362 }
4363 if (incomment) {
4364 if (base + 3 < len)
4365 return(-1);
4366 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4367 (buf[base + 2] == '>')) {
4368 incomment = 0;
4369 base += 2;
4370 }
4371 continue;
4372 }
Owen Taylor3473f882001-02-23 17:55:21 +00004373 if (buf[base] == first) {
4374 if (third != 0) {
4375 if ((buf[base + 1] != next) ||
4376 (buf[base + 2] != third)) continue;
4377 } else if (next != 0) {
4378 if (buf[base + 1] != next) continue;
4379 }
4380 ctxt->checkIndex = 0;
4381#ifdef DEBUG_PUSH
4382 if (next == 0)
4383 xmlGenericError(xmlGenericErrorContext,
4384 "HPP: lookup '%c' found at %d\n",
4385 first, base);
4386 else if (third == 0)
4387 xmlGenericError(xmlGenericErrorContext,
4388 "HPP: lookup '%c%c' found at %d\n",
4389 first, next, base);
4390 else
4391 xmlGenericError(xmlGenericErrorContext,
4392 "HPP: lookup '%c%c%c' found at %d\n",
4393 first, next, third, base);
4394#endif
4395 return(base - (in->cur - in->base));
4396 }
4397 }
4398 ctxt->checkIndex = base;
4399#ifdef DEBUG_PUSH
4400 if (next == 0)
4401 xmlGenericError(xmlGenericErrorContext,
4402 "HPP: lookup '%c' failed\n", first);
4403 else if (third == 0)
4404 xmlGenericError(xmlGenericErrorContext,
4405 "HPP: lookup '%c%c' failed\n", first, next);
4406 else
4407 xmlGenericError(xmlGenericErrorContext,
4408 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4409#endif
4410 return(-1);
4411}
4412
4413/**
4414 * htmlParseTryOrFinish:
4415 * @ctxt: an HTML parser context
4416 * @terminate: last chunk indicator
4417 *
4418 * Try to progress on parsing
 *
 * Core of the push parser: loops over the data buffered in ctxt->input and
 * dispatches on ctxt->instate. Whenever the construct at the current
 * position is not yet complete in the buffer (checked with
 * htmlParseLookupSequence()) the function jumps to "done" and returns so
 * that more data can be pushed, unless @terminate is set in which case
 * parsing goes ahead with what is available.
4419 *
4420 * Returns zero if no parsing was possible
 *
 * NOTE(review): the local "ret" is initialised to 0 and never assigned
 * again, so as written this function always returns 0.
4421 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004422static int
Owen Taylor3473f882001-02-23 17:55:21 +00004423htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4424 int ret = 0;
4425 htmlParserInputPtr in;
4426 int avail = 0;
4427 xmlChar cur, next;
4428
 /* Debug trace of the state the parser is in on entry. */
4429#ifdef DEBUG_PUSH
4430 switch (ctxt->instate) {
4431 case XML_PARSER_EOF:
4432 xmlGenericError(xmlGenericErrorContext,
4433 "HPP: try EOF\n"); break;
4434 case XML_PARSER_START:
4435 xmlGenericError(xmlGenericErrorContext,
4436 "HPP: try START\n"); break;
4437 case XML_PARSER_MISC:
4438 xmlGenericError(xmlGenericErrorContext,
4439 "HPP: try MISC\n");break;
4440 case XML_PARSER_COMMENT:
4441 xmlGenericError(xmlGenericErrorContext,
4442 "HPP: try COMMENT\n");break;
4443 case XML_PARSER_PROLOG:
4444 xmlGenericError(xmlGenericErrorContext,
4445 "HPP: try PROLOG\n");break;
4446 case XML_PARSER_START_TAG:
4447 xmlGenericError(xmlGenericErrorContext,
4448 "HPP: try START_TAG\n");break;
4449 case XML_PARSER_CONTENT:
4450 xmlGenericError(xmlGenericErrorContext,
4451 "HPP: try CONTENT\n");break;
4452 case XML_PARSER_CDATA_SECTION:
4453 xmlGenericError(xmlGenericErrorContext,
4454 "HPP: try CDATA_SECTION\n");break;
4455 case XML_PARSER_END_TAG:
4456 xmlGenericError(xmlGenericErrorContext,
4457 "HPP: try END_TAG\n");break;
4458 case XML_PARSER_ENTITY_DECL:
4459 xmlGenericError(xmlGenericErrorContext,
4460 "HPP: try ENTITY_DECL\n");break;
4461 case XML_PARSER_ENTITY_VALUE:
4462 xmlGenericError(xmlGenericErrorContext,
4463 "HPP: try ENTITY_VALUE\n");break;
4464 case XML_PARSER_ATTRIBUTE_VALUE:
4465 xmlGenericError(xmlGenericErrorContext,
4466 "HPP: try ATTRIBUTE_VALUE\n");break;
4467 case XML_PARSER_DTD:
4468 xmlGenericError(xmlGenericErrorContext,
4469 "HPP: try DTD\n");break;
4470 case XML_PARSER_EPILOG:
4471 xmlGenericError(xmlGenericErrorContext,
4472 "HPP: try EPILOG\n");break;
4473 case XML_PARSER_PI:
4474 xmlGenericError(xmlGenericErrorContext,
4475 "HPP: try PI\n");break;
4476 case XML_PARSER_SYSTEM_LITERAL:
4477 xmlGenericError(xmlGenericErrorContext,
4478 "HPP: try SYSTEM_LITERAL\n");break;
4479 }
4480#endif
4481
4482 while (1) {
4483
4484 in = ctxt->input;
4485 if (in == NULL) break;
 /* Bytes not yet consumed: total buffered minus the offset of in->cur. */
4486 if (in->buf == NULL)
4487 avail = in->length - (in->cur - in->base);
4488 else
4489 avail = in->buf->buffer->use - (in->cur - in->base);
 /*
  * Last chunk and nothing left to read: htmlAutoCloseOnEnd() is run and,
  * if the element name stack is empty, endDocument is sent exactly once
  * (the EOF state guards against a second call).
  */
4490 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004491 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004492 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4493 /*
4494 * SAX: end of the document processing.
4495 */
4496 ctxt->instate = XML_PARSER_EOF;
4497 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4498 ctxt->sax->endDocument(ctxt->userData);
4499 }
4500 }
4501 if (avail < 1)
4502 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004503 cur = in->cur[0];
 /* A zero byte is stepped over with SKIP(1) and never reaches a state handler. */
4504 if (cur == 0) {
4505 SKIP(1);
4506 continue;
4507 }
4508
Owen Taylor3473f882001-02-23 17:55:21 +00004509 switch (ctxt->instate) {
4510 case XML_PARSER_EOF:
4511 /*
4512 * Document parsing is done !
4513 */
4514 goto done;
4515 case XML_PARSER_START:
4516 /*
4517 * Very first chars read from the document flow.
4518 */
4519 cur = in->cur[0];
4520 if (IS_BLANK(cur)) {
4521 SKIP_BLANKS;
4522 if (in->buf == NULL)
4523 avail = in->length - (in->cur - in->base);
4524 else
4525 avail = in->buf->buffer->use - (in->cur - in->base);
4526 }
 /* Announce the start of the document to the SAX layer. */
4527 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4528 ctxt->sax->setDocumentLocator(ctxt->userData,
4529 &xmlDefaultSAXLocator);
4530 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4531 (!ctxt->disableSAX))
4532 ctxt->sax->startDocument(ctxt->userData);
4533
 /*
  * NOTE(review): avail is not checked before the look-ahead below; the
  * test relies on in->cur[1] and UPP(2)..UPP(8) (presumably upper-cased
  * in->cur[n]) being readable - confirm the buffer is always padded.
  */
4534 cur = in->cur[0];
4535 next = in->cur[1];
4536 if ((cur == '<') && (next == '!') &&
4537 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4538 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4539 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4540 (UPP(8) == 'E')) {
 /* Wait for the closing '>' of the DOCTYPE unless this is the last chunk. */
4541 if ((!terminate) &&
4542 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4543 goto done;
4544#ifdef DEBUG_PUSH
4545 xmlGenericError(xmlGenericErrorContext,
4546 "HPP: Parsing internal subset\n");
4547#endif
4548 htmlParseDocTypeDecl(ctxt);
4549 ctxt->instate = XML_PARSER_PROLOG;
4550#ifdef DEBUG_PUSH
4551 xmlGenericError(xmlGenericErrorContext,
4552 "HPP: entering PROLOG\n");
4553#endif
4554 } else {
4555 ctxt->instate = XML_PARSER_MISC;
4556 }
 /*
  * NOTE(review): this trace is emitted after both branches above, i.e.
  * also when the state was just set to XML_PARSER_PROLOG.
  */
4557#ifdef DEBUG_PUSH
4558 xmlGenericError(xmlGenericErrorContext,
4559 "HPP: entering MISC\n");
4560#endif
4561 break;
4562 case XML_PARSER_MISC:
 /* Before any DOCTYPE: only blanks, comments or a DOCTYPE are handled here. */
4563 SKIP_BLANKS;
4564 if (in->buf == NULL)
4565 avail = in->length - (in->cur - in->base);
4566 else
4567 avail = in->buf->buffer->use - (in->cur - in->base);
4568 if (avail < 2)
4569 goto done;
4570 cur = in->cur[0];
4571 next = in->cur[1];
 /*
  * NOTE(review): only avail >= 2 is guaranteed here, yet in->cur[2] and
  * in->cur[3] (and UPP(2)..UPP(8) below) are examined before the
  * "avail < 9" test further down - confirm this cannot read stale bytes.
  */
4572 if ((cur == '<') && (next == '!') &&
4573 (in->cur[2] == '-') && (in->cur[3] == '-')) {
 /* Wait for the end of the comment unless this is the last chunk. */
4574 if ((!terminate) &&
4575 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4576 goto done;
4577#ifdef DEBUG_PUSH
4578 xmlGenericError(xmlGenericErrorContext,
4579 "HPP: Parsing Comment\n");
4580#endif
4581 htmlParseComment(ctxt);
4582 ctxt->instate = XML_PARSER_MISC;
4583 } else if ((cur == '<') && (next == '!') &&
4584 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4585 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4586 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4587 (UPP(8) == 'E')) {
4588 if ((!terminate) &&
4589 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4590 goto done;
4591#ifdef DEBUG_PUSH
4592 xmlGenericError(xmlGenericErrorContext,
4593 "HPP: Parsing internal subset\n");
4594#endif
4595 htmlParseDocTypeDecl(ctxt);
4596 ctxt->instate = XML_PARSER_PROLOG;
4597#ifdef DEBUG_PUSH
4598 xmlGenericError(xmlGenericErrorContext,
4599 "HPP: entering PROLOG\n");
4600#endif
 /* "<!" seen but fewer than 9 bytes buffered: wait for more data. */
4601 } else if ((cur == '<') && (next == '!') &&
4602 (avail < 9)) {
4603 goto done;
4604 } else {
4605 ctxt->instate = XML_PARSER_START_TAG;
4606#ifdef DEBUG_PUSH
4607 xmlGenericError(xmlGenericErrorContext,
4608 "HPP: entering START_TAG\n");
4609#endif
4610 }
4611 break;
4612 case XML_PARSER_PROLOG:
 /* After the DOCTYPE: blanks and comments are handled, anything else starts the content. */
4613 SKIP_BLANKS;
4614 if (in->buf == NULL)
4615 avail = in->length - (in->cur - in->base);
4616 else
4617 avail = in->buf->buffer->use - (in->cur - in->base);
4618 if (avail < 2)
4619 goto done;
4620 cur = in->cur[0];
4621 next = in->cur[1];
4622 if ((cur == '<') && (next == '!') &&
4623 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4624 if ((!terminate) &&
4625 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4626 goto done;
4627#ifdef DEBUG_PUSH
4628 xmlGenericError(xmlGenericErrorContext,
4629 "HPP: Parsing Comment\n");
4630#endif
4631 htmlParseComment(ctxt);
4632 ctxt->instate = XML_PARSER_PROLOG;
 /* "<!" seen but fewer than 4 bytes buffered: wait for more data. */
4633 } else if ((cur == '<') && (next == '!') &&
4634 (avail < 4)) {
4635 goto done;
4636 } else {
4637 ctxt->instate = XML_PARSER_START_TAG;
4638#ifdef DEBUG_PUSH
4639 xmlGenericError(xmlGenericErrorContext,
4640 "HPP: entering START_TAG\n");
4641#endif
4642 }
4643 break;
4644 case XML_PARSER_EPILOG:
 /* Reached from END_TAG once the element name stack is empty. */
4645 if (in->buf == NULL)
4646 avail = in->length - (in->cur - in->base);
4647 else
4648 avail = in->buf->buffer->use - (in->cur - in->base);
4649 if (avail < 1)
4650 goto done;
4651 cur = in->cur[0];
 /* Blank content is handed to htmlParseCharData(), then this call returns. */
4652 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004653 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004654 goto done;
4655 }
4656 if (avail < 2)
4657 goto done;
4658 next = in->cur[1];
4659 if ((cur == '<') && (next == '!') &&
4660 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4661 if ((!terminate) &&
4662 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4663 goto done;
4664#ifdef DEBUG_PUSH
4665 xmlGenericError(xmlGenericErrorContext,
4666 "HPP: Parsing Comment\n");
4667#endif
4668 htmlParseComment(ctxt);
4669 ctxt->instate = XML_PARSER_EPILOG;
4670 } else if ((cur == '<') && (next == '!') &&
4671 (avail < 4)) {
4672 goto done;
4673 } else {
 /*
  * Anything else in the epilog: record XML_ERR_DOCUMENT_END, mark the
  * document as not well formed, switch to EOF and send endDocument.
  */
4674 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004675 ctxt->wellFormed = 0;
4676 ctxt->instate = XML_PARSER_EOF;
4677#ifdef DEBUG_PUSH
4678 xmlGenericError(xmlGenericErrorContext,
4679 "HPP: entering EOF\n");
4680#endif
4681 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4682 ctxt->sax->endDocument(ctxt->userData);
4683 goto done;
4684 }
4685 break;
4686 case XML_PARSER_START_TAG: {
4687 xmlChar *name, *oldname;
4688 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004689 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004690
4691 if (avail < 2)
4692 goto done;
4693 cur = in->cur[0];
 /* Not markup at all: let the CONTENT state deal with it. */
4694 if (cur != '<') {
4695 ctxt->instate = XML_PARSER_CONTENT;
4696#ifdef DEBUG_PUSH
4697 xmlGenericError(xmlGenericErrorContext,
4698 "HPP: entering CONTENT\n");
4699#endif
4700 break;
4701 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004702 if (in->cur[1] == '/') {
4703 ctxt->instate = XML_PARSER_END_TAG;
4704 ctxt->checkIndex = 0;
4705#ifdef DEBUG_PUSH
4706 xmlGenericError(xmlGenericErrorContext,
4707 "HPP: entering END_TAG\n");
4708#endif
4709 break;
4710 }
 /* Wait for the closing '>' of the tag unless this is the last chunk. */
Owen Taylor3473f882001-02-23 17:55:21 +00004711 if ((!terminate) &&
4712 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4713 goto done;
4714
 /*
  * Keep a copy of the current element name; together with "depth" it is
  * compared below to tell whether htmlParseStartTag() changed the stack.
  */
4715 oldname = xmlStrdup(ctxt->name);
4716 htmlParseStartTag(ctxt);
4717 name = ctxt->name;
4718#ifdef DEBUG
4719 if (oldname == NULL)
4720 xmlGenericError(xmlGenericErrorContext,
4721 "Start of element %s\n", name);
4722 else if (name == NULL)
4723 xmlGenericError(xmlGenericErrorContext,
4724 "Start of element failed, was %s\n",
4725 oldname);
4726 else
4727 xmlGenericError(xmlGenericErrorContext,
4728 "Start of element %s, was %s\n",
4729 name, oldname);
4730#endif
 /*
  * Same depth and same top name, or no current name: nothing was opened.
  * A pending '>' is consumed and the state is left unchanged.
  */
4731 if (((depth == ctxt->nameNr) &&
4732 (xmlStrEqual(oldname, ctxt->name))) ||
4733 (name == NULL)) {
4734 if (CUR == '>')
4735 NEXT;
4736 if (oldname != NULL)
4737 xmlFree(oldname);
4738 break;
4739 }
4740 if (oldname != NULL)
4741 xmlFree(oldname);
4742
4743 /*
4744 * Lookup the info for that element.
4745 */
4746 info = htmlTagLookup(name);
4747 if (info == NULL) {
4748 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4749 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4750 name);
4751 ctxt->wellFormed = 0;
4752 } else if (info->depr) {
4753 /***************************
4754 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4755 ctxt->sax->warning(ctxt->userData,
4756 "Tag %s is deprecated\n",
4757 name);
4758 ***************************/
4759 }
4760
4761 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004762 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004763 */
4764 if ((CUR == '/') && (NXT(1) == '>')) {
4765 SKIP(2);
4766 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4767 ctxt->sax->endElement(ctxt->userData, name);
4768 oldname = htmlnamePop(ctxt);
4769#ifdef DEBUG
4770 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4771 oldname);
4772#endif
4773 if (oldname != NULL)
4774 xmlFree(oldname);
4775 ctxt->instate = XML_PARSER_CONTENT;
4776#ifdef DEBUG_PUSH
4777 xmlGenericError(xmlGenericErrorContext,
4778 "HPP: entering CONTENT\n");
4779#endif
4780 break;
4781 }
4782
4783 if (CUR == '>') {
4784 NEXT;
4785 } else {
 /* The tag is not terminated by '>': report it and drop the element if it is still on top of the stack. */
4786 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4787 ctxt->sax->error(ctxt->userData,
4788 "Couldn't find end of Start Tag %s\n",
4789 name);
4790 ctxt->wellFormed = 0;
4791
4792 /*
4793 * end of parsing of this node.
4794 */
4795 if (xmlStrEqual(name, ctxt->name)) {
4796 nodePop(ctxt);
4797 oldname = htmlnamePop(ctxt);
4798#ifdef DEBUG
4799 xmlGenericError(xmlGenericErrorContext,
4800 "End of start tag problem: popping out %s\n", oldname);
4801#endif
4802 if (oldname != NULL)
4803 xmlFree(oldname);
4804 }
4805
4806 ctxt->instate = XML_PARSER_CONTENT;
4807#ifdef DEBUG_PUSH
4808 xmlGenericError(xmlGenericErrorContext,
4809 "HPP: entering CONTENT\n");
4810#endif
4811 break;
4812 }
4813
4814 /*
4815 * Check for an Empty Element from DTD definition
4816 */
4817 if ((info != NULL) && (info->empty)) {
4818 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4819 ctxt->sax->endElement(ctxt->userData, name);
4820 oldname = htmlnamePop(ctxt);
4821#ifdef DEBUG
4822 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4823#endif
4824 if (oldname != NULL)
4825 xmlFree(oldname);
4826 }
4827 ctxt->instate = XML_PARSER_CONTENT;
4828#ifdef DEBUG_PUSH
4829 xmlGenericError(xmlGenericErrorContext,
4830 "HPP: entering CONTENT\n");
4831#endif
4832 break;
4833 }
4834 case XML_PARSER_CONTENT: {
4835 long cons;
4836 /*
4837 * Handle preparsed entities and charRef
4838 */
4839 if (ctxt->token != 0) {
4840 xmlChar chr[2] = { 0 , 0 } ;
4841
4842 chr[0] = (xmlChar) ctxt->token;
4843 htmlCheckParagraph(ctxt);
4844 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4845 ctxt->sax->characters(ctxt->userData, chr, 1);
4846 ctxt->token = 0;
4847 ctxt->checkIndex = 0;
4848 }
 /*
  * Exactly one byte left on the last chunk: unless it starts markup or a
  * reference it is delivered directly, as ignorable whitespace when blank
  * and as character data otherwise, and then consumed.
  */
4849 if ((avail == 1) && (terminate)) {
4850 cur = in->cur[0];
4851 if ((cur != '<') && (cur != '&')) {
4852 if (ctxt->sax != NULL) {
4853 if (IS_BLANK(cur)) {
4854 if (ctxt->sax->ignorableWhitespace != NULL)
4855 ctxt->sax->ignorableWhitespace(
4856 ctxt->userData, &cur, 1);
4857 } else {
4858 htmlCheckParagraph(ctxt);
4859 if (ctxt->sax->characters != NULL)
4860 ctxt->sax->characters(
4861 ctxt->userData, &cur, 1);
4862 }
4863 }
4864 ctxt->token = 0;
4865 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004866 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004867 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004868 }
Owen Taylor3473f882001-02-23 17:55:21 +00004869 }
4870 if (avail < 2)
4871 goto done;
4872 cur = in->cur[0];
4873 next = in->cur[1];
 /* Snapshot of ctxt->nbChars, compared after parsing to detect that nothing was consumed. */
4874 cons = ctxt->nbChars;
4875 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4876 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4877 /*
4878 * Handle SCRIPT/STYLE separately
4879 */
4880 if ((!terminate) &&
4881 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4882 goto done;
4883 htmlParseScript(ctxt);
 /*
  * NOTE(review): cur and next were sampled before htmlParseScript() ran,
  * so this tests the bytes at the position before the call.
  */
4884 if ((cur == '<') && (next == '/')) {
4885 ctxt->instate = XML_PARSER_END_TAG;
4886 ctxt->checkIndex = 0;
4887#ifdef DEBUG_PUSH
4888 xmlGenericError(xmlGenericErrorContext,
4889 "HPP: entering END_TAG\n");
4890#endif
4891 break;
4892 }
4893 } else {
4894 /*
4895 * Sometimes DOCTYPE arrives in the middle of the document
4896 */
4897 if ((cur == '<') && (next == '!') &&
4898 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4899 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4900 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4901 (UPP(8) == 'E')) {
4902 if ((!terminate) &&
4903 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4904 goto done;
4905 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4906 ctxt->sax->error(ctxt->userData,
4907 "Misplaced DOCTYPE declaration\n");
4908 ctxt->wellFormed = 0;
4909 htmlParseDocTypeDecl(ctxt);
4910 } else if ((cur == '<') && (next == '!') &&
4911 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4912 if ((!terminate) &&
4913 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4914 goto done;
4915#ifdef DEBUG_PUSH
4916 xmlGenericError(xmlGenericErrorContext,
4917 "HPP: Parsing Comment\n");
4918#endif
4919 htmlParseComment(ctxt);
4920 ctxt->instate = XML_PARSER_CONTENT;
 /* "<!" seen but fewer than 4 bytes buffered: wait for more data. */
4921 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4922 goto done;
4923 } else if ((cur == '<') && (next == '/')) {
4924 ctxt->instate = XML_PARSER_END_TAG;
4925 ctxt->checkIndex = 0;
4926#ifdef DEBUG_PUSH
4927 xmlGenericError(xmlGenericErrorContext,
4928 "HPP: entering END_TAG\n");
4929#endif
4930 break;
4931 } else if (cur == '<') {
4932 ctxt->instate = XML_PARSER_START_TAG;
4933 ctxt->checkIndex = 0;
4934#ifdef DEBUG_PUSH
4935 xmlGenericError(xmlGenericErrorContext,
4936 "HPP: entering START_TAG\n");
4937#endif
4938 break;
4939 } else if (cur == '&') {
 /* Wait for the ';' ending the reference unless this is the last chunk. */
4940 if ((!terminate) &&
4941 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4942 goto done;
4943#ifdef DEBUG_PUSH
4944 xmlGenericError(xmlGenericErrorContext,
4945 "HPP: Parsing Reference\n");
4946#endif
4947 /* TODO: check generation of subtrees if noent !!! */
4948 htmlParseReference(ctxt);
4949 } else {
4950 /* TODO Avoid the extra copy, handle directly !!!!!! */
4951 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004952 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004953 * - minimize calls to the SAX 'character' callback
4954 * when they are mergeable
4955 */
 /*
  * With a single input and less than HTML_PARSER_BIG_BUFFER_SIZE bytes
  * buffered, text is only parsed once the next '<' is in sight.
  */
4956 if ((ctxt->inputNr == 1) &&
4957 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4958 if ((!terminate) &&
4959 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4960 goto done;
4961 }
4962 ctxt->checkIndex = 0;
4963#ifdef DEBUG_PUSH
4964 xmlGenericError(xmlGenericErrorContext,
4965 "HPP: Parsing char data\n");
4966#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004967 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004968 }
4969 }
 /*
  * ctxt->nbChars did not move: an error is reported when inside a node,
  * and one character is stepped over with NEXT (presumably so that the
  * loop always makes progress).
  */
4970 if (cons == ctxt->nbChars) {
4971 if (ctxt->node != NULL) {
4972 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4973 ctxt->sax->error(ctxt->userData,
4974 "detected an error in element content\n");
4975 ctxt->wellFormed = 0;
4976 }
4977 NEXT;
4978 break;
4979 }
4980
4981 break;
4982 }
4983 case XML_PARSER_END_TAG:
4984 if (avail < 2)
4985 goto done;
4986 if ((!terminate) &&
4987 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4988 goto done;
4989 htmlParseEndTag(ctxt);
 /* No element name left on the stack: the rest of the input is epilog. */
4990 if (ctxt->nameNr == 0) {
4991 ctxt->instate = XML_PARSER_EPILOG;
4992 } else {
4993 ctxt->instate = XML_PARSER_CONTENT;
4994 }
4995 ctxt->checkIndex = 0;
 /* NOTE(review): the trace below says CONTENT even when EPILOG was selected. */
4996#ifdef DEBUG_PUSH
4997 xmlGenericError(xmlGenericErrorContext,
4998 "HPP: entering CONTENT\n");
4999#endif
5000 break;
 /*
  * None of the remaining states is entered by the code in this function.
  * Each handler reports an internal error through xmlGenericError(),
  * resets checkIndex and falls back to CONTENT (START_TAG for
  * ATTRIBUTE_VALUE).
  */
5001 case XML_PARSER_CDATA_SECTION:
5002 xmlGenericError(xmlGenericErrorContext,
5003 "HPP: internal error, state == CDATA\n");
5004 ctxt->instate = XML_PARSER_CONTENT;
5005 ctxt->checkIndex = 0;
5006#ifdef DEBUG_PUSH
5007 xmlGenericError(xmlGenericErrorContext,
5008 "HPP: entering CONTENT\n");
5009#endif
5010 break;
5011 case XML_PARSER_DTD:
5012 xmlGenericError(xmlGenericErrorContext,
5013 "HPP: internal error, state == DTD\n");
5014 ctxt->instate = XML_PARSER_CONTENT;
5015 ctxt->checkIndex = 0;
5016#ifdef DEBUG_PUSH
5017 xmlGenericError(xmlGenericErrorContext,
5018 "HPP: entering CONTENT\n");
5019#endif
5020 break;
5021 case XML_PARSER_COMMENT:
5022 xmlGenericError(xmlGenericErrorContext,
5023 "HPP: internal error, state == COMMENT\n");
5024 ctxt->instate = XML_PARSER_CONTENT;
5025 ctxt->checkIndex = 0;
5026#ifdef DEBUG_PUSH
5027 xmlGenericError(xmlGenericErrorContext,
5028 "HPP: entering CONTENT\n");
5029#endif
5030 break;
5031 case XML_PARSER_PI:
5032 xmlGenericError(xmlGenericErrorContext,
5033 "HPP: internal error, state == PI\n");
5034 ctxt->instate = XML_PARSER_CONTENT;
5035 ctxt->checkIndex = 0;
5036#ifdef DEBUG_PUSH
5037 xmlGenericError(xmlGenericErrorContext,
5038 "HPP: entering CONTENT\n");
5039#endif
5040 break;
5041 case XML_PARSER_ENTITY_DECL:
5042 xmlGenericError(xmlGenericErrorContext,
5043 "HPP: internal error, state == ENTITY_DECL\n");
5044 ctxt->instate = XML_PARSER_CONTENT;
5045 ctxt->checkIndex = 0;
5046#ifdef DEBUG_PUSH
5047 xmlGenericError(xmlGenericErrorContext,
5048 "HPP: entering CONTENT\n");
5049#endif
5050 break;
5051 case XML_PARSER_ENTITY_VALUE:
5052 xmlGenericError(xmlGenericErrorContext,
5053 "HPP: internal error, state == ENTITY_VALUE\n");
5054 ctxt->instate = XML_PARSER_CONTENT;
5055 ctxt->checkIndex = 0;
 /* NOTE(review): the trace below says DTD although the state set above is CONTENT. */
5056#ifdef DEBUG_PUSH
5057 xmlGenericError(xmlGenericErrorContext,
5058 "HPP: entering DTD\n");
5059#endif
5060 break;
5061 case XML_PARSER_ATTRIBUTE_VALUE:
5062 xmlGenericError(xmlGenericErrorContext,
5063 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5064 ctxt->instate = XML_PARSER_START_TAG;
5065 ctxt->checkIndex = 0;
5066#ifdef DEBUG_PUSH
5067 xmlGenericError(xmlGenericErrorContext,
5068 "HPP: entering START_TAG\n");
5069#endif
5070 break;
5071 case XML_PARSER_SYSTEM_LITERAL:
5072 xmlGenericError(xmlGenericErrorContext,
5073 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5074 ctxt->instate = XML_PARSER_CONTENT;
5075 ctxt->checkIndex = 0;
5076#ifdef DEBUG_PUSH
5077 xmlGenericError(xmlGenericErrorContext,
5078 "HPP: entering CONTENT\n");
5079#endif
5080 break;
5081 case XML_PARSER_IGNORE:
5082 xmlGenericError(xmlGenericErrorContext,
5083 "HPP: internal error, state == XML_PARSER_IGNORE\n");
5084 ctxt->instate = XML_PARSER_CONTENT;
5085 ctxt->checkIndex = 0;
5086#ifdef DEBUG_PUSH
5087 xmlGenericError(xmlGenericErrorContext,
5088 "HPP: entering CONTENT\n");
5089#endif
5090 break;
 /* NOTE(review): the message below names XML_PARSER_LITERAL while the case is XML_PARSER_PUBLIC_LITERAL. */
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005091 case XML_PARSER_PUBLIC_LITERAL:
5092 xmlGenericError(xmlGenericErrorContext,
5093 "HPP: internal error, state == XML_PARSER_LITERAL\n");
5094 ctxt->instate = XML_PARSER_CONTENT;
5095 ctxt->checkIndex = 0;
5096#ifdef DEBUG_PUSH
5097 xmlGenericError(xmlGenericErrorContext,
5098 "HPP: entering CONTENT\n");
5099#endif
5100 break;
5101
Owen Taylor3473f882001-02-23 17:55:21 +00005102 }
5103 }
5104done:
 /* Same end-of-input handling as at the top of the loop, for the paths that jumped or broke out here. */
5105 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005106 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005107 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5108 /*
5109 * SAX: end of the document processing.
5110 */
5111 ctxt->instate = XML_PARSER_EOF;
5112 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5113 ctxt->sax->endDocument(ctxt->userData);
5114 }
5115 }
 /*
  * When a tree exists and parsing is terminating (or the state is EOF or
  * EPILOG), make sure the document has an internal subset; the HTML 4.0
  * Transitional identifiers are used when none was declared.
  */
5116 if ((ctxt->myDoc != NULL) &&
5117 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5118 (ctxt->instate == XML_PARSER_EPILOG))) {
5119 xmlDtdPtr dtd;
5120 dtd = xmlGetIntSubset(ctxt->myDoc);
5121 if (dtd == NULL)
5122 ctxt->myDoc->intSubset =
5123 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
5124 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5125 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5126 }
5127#ifdef DEBUG_PUSH
5128 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5129#endif
5130 return(ret);
5131}
5132
5133/**
Owen Taylor3473f882001-02-23 17:55:21 +00005134 * htmlParseChunk:
5135 * @ctxt: an XML parser context
5136 * @chunk: an char array
5137 * @size: the size in byte of the chunk
5138 * @terminate: last chunk indicator
5139 *
5140 * Parse a Chunk of memory
5141 *
5142 * Returns zero if no error, the xmlParserErrors otherwise.
5143 */
5144int
5145htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5146 int terminate) {
5147 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5148 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5149 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5150 int cur = ctxt->input->cur - ctxt->input->base;
5151
5152 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5153 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5154 ctxt->input->cur = ctxt->input->base + cur;
5155#ifdef DEBUG_PUSH
5156 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5157#endif
5158
5159 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5160 htmlParseTryOrFinish(ctxt, terminate);
5161 } else if (ctxt->instate != XML_PARSER_EOF) {
5162 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
5163 htmlParseTryOrFinish(ctxt, terminate);
5164 }
5165 if (terminate) {
5166 if ((ctxt->instate != XML_PARSER_EOF) &&
5167 (ctxt->instate != XML_PARSER_EPILOG) &&
5168 (ctxt->instate != XML_PARSER_MISC)) {
5169 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005170 ctxt->wellFormed = 0;
5171 }
5172 if (ctxt->instate != XML_PARSER_EOF) {
5173 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5174 ctxt->sax->endDocument(ctxt->userData);
5175 }
5176 ctxt->instate = XML_PARSER_EOF;
5177 }
5178 return((xmlParserErrors) ctxt->errNo);
5179}
5180
5181/************************************************************************
5182 * *
5183 * User entry points *
5184 * *
5185 ************************************************************************/
5186
5187/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005188 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005189 * @sax: a SAX handler
5190 * @user_data: The user data returned on SAX callbacks
5191 * @chunk: a pointer to an array of chars
5192 * @size: number of chars in the array
5193 * @filename: an optional file name or URI
5194 * @enc: an optional encoding
5195 *
5196 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005197 * The value of @filename is used for fetching external entities
5198 * and error/warning reports.
5199 *
5200 * Returns the new parser context or NULL
5201 */
5202htmlParserCtxtPtr
5203htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5204 const char *chunk, int size, const char *filename,
5205 xmlCharEncoding enc) {
5206 htmlParserCtxtPtr ctxt;
5207 htmlParserInputPtr inputStream;
5208 xmlParserInputBufferPtr buf;
5209
Daniel Veillardd0463562001-10-13 09:15:48 +00005210 xmlInitParser();
5211
Owen Taylor3473f882001-02-23 17:55:21 +00005212 buf = xmlAllocParserInputBuffer(enc);
5213 if (buf == NULL) return(NULL);
5214
5215 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5216 if (ctxt == NULL) {
5217 xmlFree(buf);
5218 return(NULL);
5219 }
5220 memset(ctxt, 0, sizeof(htmlParserCtxt));
5221 htmlInitParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005222 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5223 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005224 if (sax != NULL) {
5225 if (ctxt->sax != &htmlDefaultSAXHandler)
5226 xmlFree(ctxt->sax);
5227 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5228 if (ctxt->sax == NULL) {
5229 xmlFree(buf);
5230 xmlFree(ctxt);
5231 return(NULL);
5232 }
5233 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5234 if (user_data != NULL)
5235 ctxt->userData = user_data;
5236 }
5237 if (filename == NULL) {
5238 ctxt->directory = NULL;
5239 } else {
5240 ctxt->directory = xmlParserGetDirectory(filename);
5241 }
5242
5243 inputStream = htmlNewInputStream(ctxt);
5244 if (inputStream == NULL) {
5245 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005246 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005247 return(NULL);
5248 }
5249
5250 if (filename == NULL)
5251 inputStream->filename = NULL;
5252 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005253 inputStream->filename = (char *)
5254 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005255 inputStream->buf = buf;
5256 inputStream->base = inputStream->buf->buffer->content;
5257 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005258 inputStream->end =
5259 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005260
5261 inputPush(ctxt, inputStream);
5262
5263 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5264 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005265 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5266 int cur = ctxt->input->cur - ctxt->input->base;
5267
Owen Taylor3473f882001-02-23 17:55:21 +00005268 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005269
5270 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5271 ctxt->input->cur = ctxt->input->base + cur;
5272 ctxt->input->end =
5273 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005274#ifdef DEBUG_PUSH
5275 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5276#endif
5277 }
5278
5279 return(ctxt);
5280}
5281
5282/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005283 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005284 * @cur: a pointer to an array of xmlChar
5285 * @encoding: a free form C string describing the HTML document encoding, or NULL
5286 * @sax: the SAX handler block
5287 * @userData: if using SAX, this pointer will be provided on callbacks.
5288 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005289 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5290 * to handle parse events. If sax is NULL, fallback to the default DOM
5291 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005292 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005293 * Returns the resulting document tree unless SAX is NULL or the document is
5294 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005295 */
5296
5297htmlDocPtr
5298htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5299 htmlDocPtr ret;
5300 htmlParserCtxtPtr ctxt;
5301
Daniel Veillardd0463562001-10-13 09:15:48 +00005302 xmlInitParser();
5303
Owen Taylor3473f882001-02-23 17:55:21 +00005304 if (cur == NULL) return(NULL);
5305
5306
5307 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5308 if (ctxt == NULL) return(NULL);
5309 if (sax != NULL) {
5310 ctxt->sax = sax;
5311 ctxt->userData = userData;
5312 }
5313
5314 htmlParseDocument(ctxt);
5315 ret = ctxt->myDoc;
5316 if (sax != NULL) {
5317 ctxt->sax = NULL;
5318 ctxt->userData = NULL;
5319 }
5320 htmlFreeParserCtxt(ctxt);
5321
5322 return(ret);
5323}
5324
5325/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005326 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005327 * @cur: a pointer to an array of xmlChar
5328 * @encoding: a free form C string describing the HTML document encoding, or NULL
5329 *
5330 * parse an HTML in-memory document and build a tree.
5331 *
5332 * Returns the resulting document tree
5333 */
5334
5335htmlDocPtr
5336htmlParseDoc(xmlChar *cur, const char *encoding) {
5337 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5338}
5339
5340
5341/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005342 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005343 * @filename: the filename
5344 * @encoding: a free form C string describing the HTML document encoding, or NULL
5345 *
5346 * Create a parser context for a file content.
5347 * Automatic support for ZLIB/Compress compressed document is provided
5348 * by default if found at compile-time.
5349 *
5350 * Returns the new parser context or NULL
5351 */
5352htmlParserCtxtPtr
5353htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5354{
5355 htmlParserCtxtPtr ctxt;
5356 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005357 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005358 /* htmlCharEncoding enc; */
5359 xmlChar *content, *content_line = (xmlChar *) "charset=";
5360
Owen Taylor3473f882001-02-23 17:55:21 +00005361 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5362 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005363 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005364 return(NULL);
5365 }
5366 memset(ctxt, 0, sizeof(htmlParserCtxt));
5367 htmlInitParserCtxt(ctxt);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005368 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5369 if (canonicFilename == NULL) {
5370 if (xmlDefaultSAXHandler.error != NULL) {
5371 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5372 }
Daniel Veillard104caa32003-05-13 22:54:05 +00005373 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005374 return(NULL);
5375 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005376
5377 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5378 xmlFree(canonicFilename);
5379 if (inputStream == NULL) {
5380 xmlFreeParserCtxt(ctxt);
5381 return(NULL);
5382 }
Owen Taylor3473f882001-02-23 17:55:21 +00005383
5384 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005385
Owen Taylor3473f882001-02-23 17:55:21 +00005386 /* set encoding */
5387 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005388 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005389 if (content) {
5390 strcpy ((char *)content, (char *)content_line);
5391 strcat ((char *)content, (char *)encoding);
5392 htmlCheckEncoding (ctxt, content);
5393 xmlFree (content);
5394 }
5395 }
5396
5397 return(ctxt);
5398}
5399
5400/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005401 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005402 * @filename: the filename
5403 * @encoding: a free form C string describing the HTML document encoding, or NULL
5404 * @sax: the SAX handler block
5405 * @userData: if using SAX, this pointer will be provided on callbacks.
5406 *
5407 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5408 * compressed document is provided by default if found at compile-time.
5409 * It use the given SAX function block to handle the parsing callback.
5410 * If sax is NULL, fallback to the default DOM tree building routines.
5411 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005412 * Returns the resulting document tree unless SAX is NULL or the document is
5413 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005414 */
5415
5416htmlDocPtr
5417htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5418 void *userData) {
5419 htmlDocPtr ret;
5420 htmlParserCtxtPtr ctxt;
5421 htmlSAXHandlerPtr oldsax = NULL;
5422
Daniel Veillardd0463562001-10-13 09:15:48 +00005423 xmlInitParser();
5424
Owen Taylor3473f882001-02-23 17:55:21 +00005425 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5426 if (ctxt == NULL) return(NULL);
5427 if (sax != NULL) {
5428 oldsax = ctxt->sax;
5429 ctxt->sax = sax;
5430 ctxt->userData = userData;
5431 }
5432
5433 htmlParseDocument(ctxt);
5434
5435 ret = ctxt->myDoc;
5436 if (sax != NULL) {
5437 ctxt->sax = oldsax;
5438 ctxt->userData = NULL;
5439 }
5440 htmlFreeParserCtxt(ctxt);
5441
5442 return(ret);
5443}
5444
5445/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005446 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005447 * @filename: the filename
5448 * @encoding: a free form C string describing the HTML document encoding, or NULL
5449 *
5450 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5451 * compressed document is provided by default if found at compile-time.
5452 *
5453 * Returns the resulting document tree
5454 */
5455
5456htmlDocPtr
5457htmlParseFile(const char *filename, const char *encoding) {
5458 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5459}
5460
5461/**
5462 * htmlHandleOmittedElem:
5463 * @val: int 0 or 1
5464 *
5465 * Set and return the previous value for handling HTML omitted tags.
5466 *
5467 * Returns the last value for 0 for no handling, 1 for auto insertion.
5468 */
5469
5470int
5471htmlHandleOmittedElem(int val) {
5472 int old = htmlOmittedDefaultValue;
5473
5474 htmlOmittedDefaultValue = val;
5475 return(old);
5476}
5477
Daniel Veillard930dfb62003-02-05 10:17:38 +00005478/**
5479 * htmlElementAllowedHere:
5480 * @parent: HTML parent element
5481 * @elt: HTML element
5482 *
5483 * Checks whether an HTML element may be a direct child of a parent element.
5484 * Note - doesn't check for deprecated elements
5485 *
5486 * Returns 1 if allowed; 0 otherwise.
5487 */
5488int
5489htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5490 const char** p ;
5491
5492 if ( ! elt || ! parent || ! parent->subelts )
5493 return 0 ;
5494
5495 for ( p = parent->subelts; *p; ++p )
5496 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5497 return 1 ;
5498
5499 return 0 ;
5500}
5501/**
5502 * htmlElementStatusHere:
5503 * @parent: HTML parent element
5504 * @elt: HTML element
5505 *
5506 * Checks whether an HTML element may be a direct child of a parent element.
5507 * and if so whether it is valid or deprecated.
5508 *
5509 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5510 */
5511htmlStatus
5512htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5513 if ( ! parent || ! elt )
5514 return HTML_INVALID ;
5515 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5516 return HTML_INVALID ;
5517
5518 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5519}
5520/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005521 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005522 * @elt: HTML element
5523 * @attr: HTML attribute
5524 * @legacy: whether to allow deprecated attributes
5525 *
5526 * Checks whether an attribute is valid for an element
5527 * Has full knowledge of Required and Deprecated attributes
5528 *
5529 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5530 */
5531htmlStatus
5532htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5533 const char** p ;
5534
5535 if ( !elt || ! attr )
5536 return HTML_INVALID ;
5537
5538 if ( elt->attrs_req )
5539 for ( p = elt->attrs_req; *p; ++p)
5540 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5541 return HTML_REQUIRED ;
5542
5543 if ( elt->attrs_opt )
5544 for ( p = elt->attrs_opt; *p; ++p)
5545 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5546 return HTML_VALID ;
5547
5548 if ( legacy && elt->attrs_depr )
5549 for ( p = elt->attrs_depr; *p; ++p)
5550 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5551 return HTML_DEPRECATED ;
5552
5553 return HTML_INVALID ;
5554}
5555/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005556 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005557 * @node: an htmlNodePtr in a tree
5558 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005559 * for Element nodes)
5560 *
5561 * Checks whether the tree node is valid. Experimental (the author
5562 * only uses the HTML enhancements in a SAX parser)
5563 *
5564 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5565 * legacy allowed) or htmlElementStatusHere (otherwise).
5566 * for Attribute nodes, a return from htmlAttrAllowed
5567 * for other nodes, HTML_NA (no checks performed)
5568 */
5569htmlStatus
5570htmlNodeStatus(const htmlNodePtr node, int legacy) {
5571 if ( ! node )
5572 return HTML_INVALID ;
5573
5574 switch ( node->type ) {
5575 case XML_ELEMENT_NODE:
5576 return legacy
5577 ? ( htmlElementAllowedHere (
5578 htmlTagLookup(node->parent->name) , node->name
5579 ) ? HTML_VALID : HTML_INVALID )
5580 : htmlElementStatusHere(
5581 htmlTagLookup(node->parent->name) ,
5582 htmlTagLookup(node->name) )
5583 ;
5584 case XML_ATTRIBUTE_NODE:
5585 return htmlAttrAllowed(
5586 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5587 default: return HTML_NA ;
5588 }
5589}
Owen Taylor3473f882001-02-23 17:55:21 +00005590#endif /* LIBXML_HTML_ENABLED */