blob: d07cd46089f84c15bcd88a9beb00c0f1ad4c4e30 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size limits for the HTML parser.
 * NOTE(review): the code consuming these limits is outside this section;
 * presumably a maximum name length and two scratch buffer sizes - confirm
 * against their users.
 */
#define HTML_MAX_NAMELEN		1000
#define HTML_PARSER_BIG_BUFFER_SIZE	1000
#define HTML_PARSER_BUFFER_SIZE		100

/* Debugging switches, both disabled: */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/* File-private flag, initialised to 1. */
static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Owen Taylor3473f882001-02-23 17:55:21 +000062 * Parser stacks related functions and macros *
63 * *
64 ************************************************************************/
65
Daniel Veillard1c732d22002-11-30 11:22:59 +000066/**
67 * htmlnamePush:
68 * @ctxt: an HTML parser context
69 * @value: the element name
70 *
71 * Pushes a new element name on top of the name stack
72 *
73 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000074 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000075static int
76htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
77{
78 if (ctxt->nameNr >= ctxt->nameMax) {
79 ctxt->nameMax *= 2;
80 ctxt->nameTab =
81 (xmlChar * *)xmlRealloc(ctxt->nameTab,
82 ctxt->nameMax *
83 sizeof(ctxt->nameTab[0]));
84 if (ctxt->nameTab == NULL) {
85 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
86 return (0);
87 }
88 }
89 ctxt->nameTab[ctxt->nameNr] = value;
90 ctxt->name = value;
91 return (ctxt->nameNr++);
92}
93/**
94 * htmlnamePop:
95 * @ctxt: an HTML parser context
96 *
97 * Pops the top element name from the name stack
98 *
99 * Returns the name just removed
100 */
101static xmlChar *
102htmlnamePop(htmlParserCtxtPtr ctxt)
103{
104 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000105
Daniel Veillard1c732d22002-11-30 11:22:59 +0000106 if (ctxt->nameNr <= 0)
107 return (0);
108 ctxt->nameNr--;
109 if (ctxt->nameNr < 0)
110 return (0);
111 if (ctxt->nameNr > 0)
112 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
113 else
114 ctxt->name = NULL;
115 ret = ctxt->nameTab[ctxt->nameNr];
116 ctxt->nameTab[ctxt->nameNr] = 0;
117 return (ret);
118}
Owen Taylor3473f882001-02-23 17:55:21 +0000119
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a parser context named `ctxt' to be in
 * scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   RAW     same as CUR, except that it yields -1 while ctxt->token is set.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPPER   returns the current xmlChar converted to uppercase.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser. It advances the
 *           character count, the input cursor and the column together and
 *           is a single expression.
 *   SHRINK  calls xmlParserInputShrink() on the current input.
 *   GROW    calls xmlParserInputGrow() on the current input, asking for
 *           INPUT_CHUNK more.
 *   SKIP_BLANKS calls htmlSkipBlankChars() on the context.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Parenthesized as a whole so the comma expression stays one operand. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
176
177
/*
 * NEXTL(l): step over the current character, which is l xmlChars long.
 * Line/column tracking is updated from the character being left, any
 * pending ctxt->token is cleared, the cursor moves by l and the character
 * count by one.
 */
#define NEXTL(l)							\
    do {								\
        if (*(ctxt->input->cur) != '\n') {				\
            ctxt->input->col++;						\
        } else {							\
            ctxt->input->line++;					\
            ctxt->input->col = 1;					\
        }								\
        ctxt->token = 0;						\
        ctxt->input->cur += l;						\
        ctxt->nbChars++;						\
    } while (0)
184
185/************
186 \
187 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
188 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
189 ************/
190
/* Current character of the parser input; its length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Current character of the string s; its length is stored in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l,b,i,v): append character v, which is l xmlChars long, to
 * buffer b at index i and advance i. A one-xmlChar character is stored
 * directly, anything longer goes through xmlCopyChar().
 * Wrapped in do/while(0) so that it is a single statement and can be used
 * safely inside an if/else.
 */
#define COPY_BUF(l,b,i,v)						\
    do {								\
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v);			\
        else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
    } while (0)
197
198/**
199 * htmlCurrentChar:
200 * @ctxt: the HTML parser context
201 * @len: pointer to the length of the char read
202 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * bytes in the input buffer. Implement the end of line normalization:
205 * 2.11 End-of-Line Handling
206 * If the encoding is unspecified, in the case we find an ISO-Latin-1
207 * char, then the encoding converter is plugged in automatically.
208 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000209 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000210 */
211
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000212static int
Owen Taylor3473f882001-02-23 17:55:21 +0000213htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
214 if (ctxt->instate == XML_PARSER_EOF)
215 return(0);
216
217 if (ctxt->token != 0) {
218 *len = 0;
219 return(ctxt->token);
220 }
221 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
222 /*
223 * We are supposed to handle UTF8, check it's valid
224 * From rfc2044: encoding of the Unicode values on UTF-8:
225 *
226 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
227 * 0000 0000-0000 007F 0xxxxxxx
228 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
229 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
230 *
231 * Check for the 0x110000 limit too
232 */
233 const unsigned char *cur = ctxt->input->cur;
234 unsigned char c;
235 unsigned int val;
236
237 c = *cur;
238 if (c & 0x80) {
239 if (cur[1] == 0)
240 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
241 if ((cur[1] & 0xc0) != 0x80)
242 goto encoding_error;
243 if ((c & 0xe0) == 0xe0) {
244
245 if (cur[2] == 0)
246 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
247 if ((cur[2] & 0xc0) != 0x80)
248 goto encoding_error;
249 if ((c & 0xf0) == 0xf0) {
250 if (cur[3] == 0)
251 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
252 if (((c & 0xf8) != 0xf0) ||
253 ((cur[3] & 0xc0) != 0x80))
254 goto encoding_error;
255 /* 4-byte code */
256 *len = 4;
257 val = (cur[0] & 0x7) << 18;
258 val |= (cur[1] & 0x3f) << 12;
259 val |= (cur[2] & 0x3f) << 6;
260 val |= cur[3] & 0x3f;
261 } else {
262 /* 3-byte code */
263 *len = 3;
264 val = (cur[0] & 0xf) << 12;
265 val |= (cur[1] & 0x3f) << 6;
266 val |= cur[2] & 0x3f;
267 }
268 } else {
269 /* 2-byte code */
270 *len = 2;
271 val = (cur[0] & 0x1f) << 6;
272 val |= cur[1] & 0x3f;
273 }
274 if (!IS_CHAR(val)) {
275 ctxt->errNo = XML_ERR_INVALID_ENCODING;
276 if ((ctxt->sax != NULL) &&
277 (ctxt->sax->error != NULL))
278 ctxt->sax->error(ctxt->userData,
279 "Char 0x%X out of allowed range\n", val);
280 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000281 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000282 }
283 return(val);
284 } else {
285 /* 1-byte code */
286 *len = 1;
287 return((int) *ctxt->input->cur);
288 }
289 }
290 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000291 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000292 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000293 * XML constructs only use < 128 chars
294 */
295 *len = 1;
296 if ((int) *ctxt->input->cur < 0x80)
297 return((int) *ctxt->input->cur);
298
299 /*
300 * Humm this is bad, do an automatic flow conversion
301 */
302 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
303 ctxt->charset = XML_CHAR_ENCODING_UTF8;
304 return(xmlCurrentChar(ctxt, len));
305
306encoding_error:
307 /*
308 * If we detect an UTF8 error that probably mean that the
309 * input encoding didn't get properly advertized in the
310 * declaration header. Report the error and switch the encoding
311 * to ISO-Latin-1 (if you don't like this policy, just declare the
312 * encoding !)
313 */
314 ctxt->errNo = XML_ERR_INVALID_ENCODING;
315 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
316 ctxt->sax->error(ctxt->userData,
317 "Input is not proper UTF-8, indicate encoding !\n");
318 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
319 ctxt->input->cur[0], ctxt->input->cur[1],
320 ctxt->input->cur[2], ctxt->input->cur[3]);
321 }
322
323 ctxt->charset = XML_CHAR_ENCODING_8859_1;
324 *len = 1;
325 return((int) *ctxt->input->cur);
326}
327
328/**
Owen Taylor3473f882001-02-23 17:55:21 +0000329 * htmlSkipBlankChars:
330 * @ctxt: the HTML parser context
331 *
332 * skip all blanks character found at that point in the input streams.
333 *
334 * Returns the number of space chars skipped
335 */
336
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000337static int
Owen Taylor3473f882001-02-23 17:55:21 +0000338htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
339 int res = 0;
340
341 while (IS_BLANK(*(ctxt->input->cur))) {
342 if ((*ctxt->input->cur == 0) &&
343 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
344 xmlPopInput(ctxt);
345 } else {
346 if (*(ctxt->input->cur) == '\n') {
347 ctxt->input->line++; ctxt->input->col = 1;
348 } else ctxt->input->col++;
349 ctxt->input->cur++;
350 ctxt->nbChars++;
351 if (*ctxt->input->cur == 0)
352 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
353 }
354 res++;
355 }
356 return(res);
357}
358
359
360
361/************************************************************************
362 * *
363 * The list of HTML elements and their properties *
364 * *
365 ************************************************************************/
366
367/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
370 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000371 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000372 * Depr: this element is deprecated
373 * DTD: 1 means that this element is valid only in the Loose DTD
374 * 2 means that this element is valid only in the Frameset DTD
375 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000376 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000377 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000378 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000379
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each macro below expands to a comma separated list of element names.
 * When one list is built from others the pieces MUST be separated by
 * commas: adjacent string literals are concatenated by the compiler, which
 * would silently fuse the last name of one list with the first name of the
 * next. PCDATA and MODIFIER expand to nothing and so take no comma.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


/* NULL terminated lists of the flow and of the inline elements */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
402
403
/* ... and for HTML Attributes */

/* Building blocks: each expands to a comma separated list of names. */
#define COREATTRS	"id", "class", "style", "title"
#define I18N		"lang", "dir"
#define EVENTS		"onclick", "ondblclick", "onmousedown", "onmouseup", \
			"onmouseover", "onmouseout", "onkeypress", \
			"onkeydown", "onkeyup"
#define ATTRS		COREATTRS,I18N,EVENTS
#define CELLHALIGN	"align", "char", "charoff"
#define CELLVALIGN	"valign"

/* NULL terminated attribute lists assembled from the pieces above. */
static const char *html_attrs[]      = { ATTRS, NULL };
static const char *core_i18n_attrs[] = { COREATTRS, I18N, NULL };
static const char *core_attrs[]      = { COREATTRS, NULL };
static const char *i18n_attrs[]      = { I18N, NULL };
417
418
419/* Other declarations that should go inline ... */
420static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
421 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
422 "tabindex", "onfocus", "onblur", NULL } ;
423static const char* target_attr[] = { "target", NULL } ;
424static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
425static const char* alt_attr[] = { "alt", NULL } ;
426static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
427static const char* href_attrs[] = { "href", NULL } ;
428static const char* clear_attrs[] = { "clear", NULL } ;
429static const char* inline_p[] = { INLINE, "p", NULL } ;
430static const char* flow_param[] = { FLOW, "param", NULL } ;
431static const char* applet_attrs[] = { COREATTRS , "codebase",
432 "archive", "alt", "name", "height", "width", "align",
433 "hspace", "vspace", NULL } ;
434static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
435 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
436static const char* basefont_attrs[] =
437 { "id", "size", "color", "face", NULL } ;
438static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
439static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
440static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
441static const char* body_depr[] = { "background", "bgcolor", "text",
442 "link", "vlink", "alink", NULL } ;
443static const char* button_attrs[] = { ATTRS, "name", "value", "type",
444 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
445
446
447static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
448static const char* col_elt[] = { "col", NULL } ;
449static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
450static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
451static const char* dl_contents[] = { "dt", "dd", NULL } ;
452static const char* compact_attr[] = { "compact", NULL } ;
453static const char* label_attr[] = { "label", NULL } ;
454static const char* fieldset_contents[] = { FLOW, "legend" } ;
455static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
456static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
457static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
458static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
459static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
460static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
461static const char* head_attrs[] = { I18N, "profile", NULL } ;
462static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
463static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
464static const char* version_attr[] = { "version", NULL } ;
465static const char* html_content[] = { "head", "body", "frameset", NULL } ;
466static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
467static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
468static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
469static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
470static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
471static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
472static const char* align_attr[] = { "align", NULL } ;
473static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
474static const char* map_contents[] = { BLOCK, "area", NULL } ;
475static const char* name_attr[] = { "name", NULL } ;
476static const char* action_attr[] = { "action", NULL } ;
477static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
478static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
479static const char* content_attr[] = { "content", NULL } ;
480static const char* type_attr[] = { "type", NULL } ;
481static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
482static const char* object_contents[] = { FLOW, "param", NULL } ;
483static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
484static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
485static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
486static const char* option_elt[] = { "option", NULL } ;
487static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
488static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
489static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
490static const char* width_attr[] = { "width", NULL } ;
491static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
492static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
493static const char* language_attr[] = { "language", NULL } ;
494static const char* select_content[] = { "optgroup", "option", NULL } ;
495static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
496static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
497static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
498static const char* table_depr[] = { "align", "bgcolor", NULL } ;
499static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
500static const char* tr_elt[] = { "tr", NULL } ;
501static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
502static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
503static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
504static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
505static const char* tr_contents[] = { "th", "td", NULL } ;
506static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
507static const char* li_elt[] = { "li", NULL } ;
508static const char* ul_depr[] = { "type", "compact", NULL} ;
509static const char* dir_attr[] = { "dir", NULL} ;
510
511#define DECL (const char**)
512
Daniel Veillard22090732001-07-16 00:06:07 +0000513static const htmlElemDesc
514html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000515{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
516 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
517},
518{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
519 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
520},
521{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
522 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
523},
524{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
525 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
526},
527{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
528 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
529},
530{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
531 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
532},
533{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
534 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
535},
536{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
537 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
538},
539{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
540 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
541},
542{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
543 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
544},
545{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
546 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
547},
548{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
549 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
550},
551{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
552 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
553},
554{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
555 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
556},
557{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
558 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
559},
560{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
561 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
562},
563{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
564 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
565},
566{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
567 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
568},
569{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
570 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
571},
572{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
573 EMPTY , NULL , DECL col_attrs , NULL, NULL
574},
575{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
576 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
577},
578{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
579 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
580},
581{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
582 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
583},
584{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
585 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
586},
587{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
588 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
589},
590{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
591 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
592},
593{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
594 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
595},
596{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
597 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
598},
599{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
600 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
601},
602{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
603 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
604},
605{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
606 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
607},
608{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
609 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
610},
611{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
612 EMPTY, NULL, NULL, DECL frame_attrs, NULL
613},
614{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
615 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
616},
617{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
618 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
619},
620{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
621 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
622},
623{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
624 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
625},
626{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
627 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
628},
629{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
630 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
631},
632{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
633 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
634},
635{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
636 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
637},
638{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
639 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
640},
641{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
642 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
643},
644{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
645 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
646},
647{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
648 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
649},
650{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
651 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
652},
653{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
654 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
655},
656{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
657 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
658},
659{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
660 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
661},
662{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
663 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
664},
665{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
666 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
667},
668{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
669 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
670},
671{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
672 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
673},
674{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
675 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
676},
677{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
678 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
679},
680{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
681 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
682},
683{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
684 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
685},
686{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
687 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
688},
689{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
690 DECL html_flow, "div", DECL html_attrs, NULL, NULL
691},
692{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
693 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
694},
695{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
696 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
697},
698{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
699 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
700},
701{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
702 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
703},
704{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
705 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
706},
707{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
708 EMPTY, NULL, DECL param_attrs, NULL, name_attr
709},
710{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
711 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
712},
713{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
714 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
715},
716{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
717 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
718},
719{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
720 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
721},
722{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
723 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
724},
725{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
726 DECL select_content, NULL, DECL select_attrs, NULL, NULL
727},
728{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
729 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
730},
731{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733},
734{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
735 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
736},
737{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
738 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
739},
740{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
741 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
742},
743{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
744 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
745},
746{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
747 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
748},
749{ "table", 0, 0, 0, 0, 0, 0, 0, "",
750 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
751},
752{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
753 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
754},
755{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
756 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
757},
758{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
759 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
760},
761{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
762 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
763},
764{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
765 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
766},
767{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
768 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
769},
770{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
771 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
772},
773{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
774 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
775},
776{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
777 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
778},
779{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
780 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
781},
782{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
783 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
784},
785{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
786 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
787}
Owen Taylor3473f882001-02-23 17:55:21 +0000788};
789
/*
 * Start tags that imply the end of the element currently open.
 *
 * The array is a sequence of NULL-terminated groups.  The first string
 * of a group is the start tag being opened; the strings after it are
 * the open elements which that start tag closes.  One extra NULL after
 * the last group terminates the whole table.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head",
        NULL,
    "head",
        "p",
        NULL,
    "title",
        "p",
        NULL,
    "body",
        "head", "style", "link", "title", "p",
        NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li",
        NULL,
    "hr",
        "p", "head",
        NULL,
    "h1",
        "p", "head",
        NULL,
    "h2",
        "p", "head",
        NULL,
    "h3",
        "p", "head",
        NULL,
    "h4",
        "p", "head",
        NULL,
    "h5",
        "p", "head",
        NULL,
    "h6",
        "p", "head",
        NULL,
    "dir",
        "p", "head",
        NULL,
    "address",
        "p", "head", "ul",
        NULL,
    "pre",
        "p", "head", "ul",
        NULL,
    "listing",
        "p", "head",
        NULL,
    "xmp",
        "p", "head",
        NULL,
    "blockquote",
        "p", "head",
        NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head",
        NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd",
        NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt",
        NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp",
        NULL,
    "ol",
        "p", "head", "ul",
        NULL,
    "menu",
        "p", "head", "ul",
        NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        NULL,
    "div",
        "p", "head",
        NULL,
    "noscript",
        "p", "head",
        NULL,
    "center",
        "font", "b", "i", "p", "head",
        NULL,
    "a",
        "a",
        NULL,
    "caption",
        "p",
        NULL,
    "colgroup",
        "caption", "colgroup", "col", "p",
        NULL,
    "col",
        "caption", "col", "p",
        NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a",
        NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p",
        NULL,
    "thead",
        "caption", "col", "colgroup",
        NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p",
        NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p",
        NULL,
    "optgroup",
        "option",
        NULL,
    "option",
        "option",
        NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a",
        NULL,
    NULL        /* end of table */
};
849
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly; when character data shows up inside one
 * of them a p element is implied (see htmlCheckParagraph()).
 *
 * TODO: extend this list by reading what the HTML SGML DTD says about
 *       implied paragraphs.
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
863
/*
 * The list of HTML attributes whose content is of type %Script;
 * NOTE: when adding entries, check htmlIsScriptAttribute() since
 *       it assumes that every name starts with 'on'.
 *
 * The table has no terminator: users size it with sizeof().
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",          /* form reset event (was misspelled "onrest") */
    "onchange",
    "onselect"
};
889
/*
 * End-tag priorities, used by the HTML parser to cope with broken
 * pages.  Giving elements different priorities lets the parser decide
 * what to do with an unexpected end tag: an end tag may only close
 * elements whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;           /* element name, NULL on the last entry */
    int priority;               /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }            /* Default priority, also ends the table */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000917
/*
 * Index over htmlStartClose: each used slot points at the first string
 * of one group, unused slots are NULL.  Filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has built the index above. */
static int htmlStartCloseIndexinitialized = 0;
920
921/************************************************************************
922 * *
923 * functions to handle HTML specific data *
924 * *
925 ************************************************************************/
926
927/**
928 * htmlInitAutoClose:
929 *
930 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
931 * This is not reentrant. Call xmlInitParser() once before processing in
932 * case of use in multithreaded programs.
933 */
934void
935htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000936 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000937
938 if (htmlStartCloseIndexinitialized) return;
939
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000940 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
941 indx = 0;
942 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
943 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000944 while (htmlStartClose[i] != NULL) i++;
945 i++;
946 }
947 htmlStartCloseIndexinitialized = 1;
948}
949
950/**
951 * htmlTagLookup:
952 * @tag: The tag name in lowercase
953 *
954 * Lookup the HTML tag in the ElementTable
955 *
956 * Returns the related htmlElemDescPtr or NULL if not found.
957 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000958const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000959htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000960 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000961
962 for (i = 0; i < (sizeof(html40ElementTable) /
963 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000964 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000965 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000966 }
967 return(NULL);
968}
969
970/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000971 * htmlGetEndPriority:
972 * @name: The name of the element to look up the priority for.
973 *
974 * Return value: The "endtag" priority.
975 **/
976static int
977htmlGetEndPriority (const xmlChar *name) {
978 int i = 0;
979
980 while ((htmlEndPriority[i].name != NULL) &&
981 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
982 i++;
983
984 return(htmlEndPriority[i].priority);
985}
986
987/**
Owen Taylor3473f882001-02-23 17:55:21 +0000988 * htmlCheckAutoClose:
989 * @newtag: The new tag name
990 * @oldtag: The old tag name
991 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000992 * Checks whether the new tag is one of the registered valid tags for
993 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000994 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
995 *
996 * Returns 0 if no, 1 if yes.
997 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000998static int
Owen Taylor3473f882001-02-23 17:55:21 +0000999htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001000 int i, indx;
1001 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001002
1003 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
1004
1005 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001006 for (indx = 0; indx < 100;indx++) {
1007 closed = htmlStartCloseIndex[indx];
1008 if (closed == NULL) return(0);
1009 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001010 }
1011
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001012 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001013 i++;
1014 while (htmlStartClose[i] != NULL) {
1015 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1016 return(1);
1017 }
1018 i++;
1019 }
1020 return(0);
1021}
1022
1023/**
1024 * htmlAutoCloseOnClose:
1025 * @ctxt: an HTML parser context
1026 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001027 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001028 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001029 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001030 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001031static void
Owen Taylor3473f882001-02-23 17:55:21 +00001032htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +00001033 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00001034 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001035 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001036
1037#ifdef DEBUG
1038 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
1039 for (i = 0;i < ctxt->nameNr;i++)
1040 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1041#endif
1042
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001043 priority = htmlGetEndPriority (newtag);
1044
Owen Taylor3473f882001-02-23 17:55:21 +00001045 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001046
Owen Taylor3473f882001-02-23 17:55:21 +00001047 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001048 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001049 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001050 * or equal priority, so if we find an element with higher
1051 * priority before we find an element with
1052 * matching name, we just ignore this endtag
1053 */
1054 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +00001055 }
1056 if (i < 0) return;
1057
1058 while (!xmlStrEqual(newtag, ctxt->name)) {
1059 info = htmlTagLookup(ctxt->name);
1060 if ((info == NULL) || (info->endTag == 1)) {
1061#ifdef DEBUG
1062 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
1063#endif
Daniel Veillard56098d42001-04-24 12:51:09 +00001064 } else if (info->endTag == 3) {
1065#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001066 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +00001067
Daniel Veillard56098d42001-04-24 12:51:09 +00001068#endif
1069 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1070 ctxt->sax->error(ctxt->userData,
1071 "Opening and ending tag mismatch: %s and %s\n",
1072 newtag, ctxt->name);
1073 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001074 }
1075 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1076 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1077 oldname = htmlnamePop(ctxt);
1078 if (oldname != NULL) {
1079#ifdef DEBUG
1080 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
1081#endif
1082 xmlFree(oldname);
1083 }
1084 }
1085}
1086
1087/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001088 * htmlAutoCloseOnEnd:
1089 * @ctxt: an HTML parser context
1090 *
1091 * Close all remaining tags at the end of the stream
1092 */
1093static void
1094htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
1095 xmlChar *oldname;
1096 int i;
1097
1098 if (ctxt->nameNr == 0)
1099 return;
1100#ifdef DEBUG
1101 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
1102#endif
1103
1104 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1105#ifdef DEBUG
1106 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1107#endif
1108 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1109 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1110 oldname = htmlnamePop(ctxt);
1111 if (oldname != NULL) {
1112#ifdef DEBUG
1113 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
1114#endif
1115 xmlFree(oldname);
1116 }
1117 }
1118}
1119
1120/**
Owen Taylor3473f882001-02-23 17:55:21 +00001121 * htmlAutoClose:
1122 * @ctxt: an HTML parser context
1123 * @newtag: The new tag name or NULL
1124 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001125 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001126 * The list is kept in htmlStartClose array. This function is
1127 * called when a new tag has been detected and generates the
1128 * appropriates closes if possible/needed.
1129 * If newtag is NULL this mean we are at the end of the resource
1130 * and we should check
1131 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001132static void
Owen Taylor3473f882001-02-23 17:55:21 +00001133htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1134 xmlChar *oldname;
1135 while ((newtag != NULL) && (ctxt->name != NULL) &&
1136 (htmlCheckAutoClose(newtag, ctxt->name))) {
1137#ifdef DEBUG
1138 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1139#endif
1140 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1141 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1142 oldname = htmlnamePop(ctxt);
1143 if (oldname != NULL) {
1144#ifdef DEBUG
1145 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1146#endif
1147 xmlFree(oldname);
1148 }
1149 }
1150 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001151 htmlAutoCloseOnEnd(ctxt);
1152 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154 while ((newtag == NULL) && (ctxt->name != NULL) &&
1155 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
1156 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
1157 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
1158#ifdef DEBUG
1159 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
1160#endif
1161 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1162 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1163 oldname = htmlnamePop(ctxt);
1164 if (oldname != NULL) {
1165#ifdef DEBUG
1166 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1167#endif
1168 xmlFree(oldname);
1169 }
1170 }
1171
1172}
1173
1174/**
1175 * htmlAutoCloseTag:
1176 * @doc: the HTML document
1177 * @name: The tag name
1178 * @elem: the HTML element
1179 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001180 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001181 * The list is kept in htmlStartClose array. This function checks
1182 * if the element or one of it's children would autoclose the
1183 * given tag.
1184 *
1185 * Returns 1 if autoclose, 0 otherwise
1186 */
1187int
1188htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1189 htmlNodePtr child;
1190
1191 if (elem == NULL) return(1);
1192 if (xmlStrEqual(name, elem->name)) return(0);
1193 if (htmlCheckAutoClose(elem->name, name)) return(1);
1194 child = elem->children;
1195 while (child != NULL) {
1196 if (htmlAutoCloseTag(doc, name, child)) return(1);
1197 child = child->next;
1198 }
1199 return(0);
1200}
1201
1202/**
1203 * htmlIsAutoClosed:
1204 * @doc: the HTML document
1205 * @elem: the HTML element
1206 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001207 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001208 * The list is kept in htmlStartClose array. This function checks
1209 * if a tag is autoclosed by one of it's child
1210 *
1211 * Returns 1 if autoclosed, 0 otherwise
1212 */
1213int
1214htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1215 htmlNodePtr child;
1216
1217 if (elem == NULL) return(1);
1218 child = elem->children;
1219 while (child != NULL) {
1220 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1221 child = child->next;
1222 }
1223 return(0);
1224}
1225
1226/**
1227 * htmlCheckImplied:
1228 * @ctxt: an HTML parser context
1229 * @newtag: The new tag name
1230 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001231 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001232 * called when a new tag has been detected and generates the
1233 * appropriates implicit tags if missing
1234 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001235static void
Owen Taylor3473f882001-02-23 17:55:21 +00001236htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1237 if (!htmlOmittedDefaultValue)
1238 return;
1239 if (xmlStrEqual(newtag, BAD_CAST"html"))
1240 return;
1241 if (ctxt->nameNr <= 0) {
1242#ifdef DEBUG
1243 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
1244#endif
1245 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
1246 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1247 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1248 }
1249 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1250 return;
1251 if ((ctxt->nameNr <= 1) &&
1252 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1253 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1254 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1255 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1256 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1257 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1258 /*
1259 * dropped OBJECT ... i you put it first BODY will be
1260 * assumed !
1261 */
1262#ifdef DEBUG
1263 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
1264#endif
1265 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
1266 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1267 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1268 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1269 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1270 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1271 int i;
1272 for (i = 0;i < ctxt->nameNr;i++) {
1273 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1274 return;
1275 }
1276 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1277 return;
1278 }
1279 }
1280
1281#ifdef DEBUG
1282 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
1283#endif
1284 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
1285 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1286 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1287 }
1288}
1289
1290/**
1291 * htmlCheckParagraph
1292 * @ctxt: an HTML parser context
1293 *
1294 * Check whether a p element need to be implied before inserting
1295 * characters in the current element.
1296 *
1297 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1298 * in case of error.
1299 */
1300
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001301static int
Owen Taylor3473f882001-02-23 17:55:21 +00001302htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1303 const xmlChar *tag;
1304 int i;
1305
1306 if (ctxt == NULL)
1307 return(-1);
1308 tag = ctxt->name;
1309 if (tag == NULL) {
1310 htmlAutoClose(ctxt, BAD_CAST"p");
1311 htmlCheckImplied(ctxt, BAD_CAST"p");
1312 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1313 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1314 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1315 return(1);
1316 }
1317 if (!htmlOmittedDefaultValue)
1318 return(0);
1319 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1320 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1321#ifdef DEBUG
1322 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1323#endif
1324 htmlAutoClose(ctxt, BAD_CAST"p");
1325 htmlCheckImplied(ctxt, BAD_CAST"p");
1326 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1327 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1328 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1329 return(1);
1330 }
1331 }
1332 return(0);
1333}
1334
1335/**
1336 * htmlIsScriptAttribute:
1337 * @name: an attribute name
1338 *
1339 * Check if an attribute is of content type Script
1340 *
1341 * Returns 1 is the attribute is a script 0 otherwise
1342 */
1343int
1344htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001345 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001346
1347 if (name == NULL)
1348 return(0);
1349 /*
1350 * all script attributes start with 'on'
1351 */
1352 if ((name[0] != 'o') || (name[1] != 'n'))
1353 return(0);
1354 for (i = 0;
1355 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1356 i++) {
1357 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1358 return(1);
1359 }
1360 return(0);
1361}
1362
1363/************************************************************************
1364 * *
1365 * The list of HTML predefined entities *
1366 * *
1367 ************************************************************************/
1368
1369
Daniel Veillard22090732001-07-16 00:06:07 +00001370static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001371/*
1372 * the 4 absolute ones, plus apostrophe.
1373 */
1374{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1375{ 38, "amp", "ampersand, U+0026 ISOnum" },
1376{ 39, "apos", "single quote" },
1377{ 60, "lt", "less-than sign, U+003C ISOnum" },
1378{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1379
1380/*
1381 * A bunch still in the 128-255 range
1382 * Replacing them depend really on the charset used.
1383 */
1384{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1385{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1386{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1387{ 163, "pound","pound sign, U+00A3 ISOnum" },
1388{ 164, "curren","currency sign, U+00A4 ISOnum" },
1389{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1390{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1391{ 167, "sect", "section sign, U+00A7 ISOnum" },
1392{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1393{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1394{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1395{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1396{ 172, "not", "not sign, U+00AC ISOnum" },
1397{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1398{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1399{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1400{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1401{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1402{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1403{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1404{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1405{ 181, "micro","micro sign, U+00B5 ISOnum" },
1406{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1407{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1408{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1409{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1410{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1411{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1412{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1413{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1414{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1415{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1416{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1417{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1418{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1419{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1420{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1421{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1422{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1423{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1424{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1425{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1426{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1427{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1428{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1429{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1430{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1431{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1432{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1433{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1434{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1435{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1436{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1437{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1438{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1439{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1440{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1441{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1442{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1443{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1444{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1445{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1446{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1447{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1448{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1449{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1450{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1451{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1452{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1453{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1454{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1455{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1456{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1457{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1458{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1459{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1460{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1461{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1462{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1463{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1464{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1465{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1466{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1467{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1468{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1469{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1470{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1471{ 247, "divide","division sign, U+00F7 ISOnum" },
1472{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1473{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1474{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1475{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1476{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1477{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1478{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1479{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1480
1481{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1482{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1483{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1484{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1485{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1486
1487/*
1488 * Anything below should really be kept as entities references
1489 */
1490{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1491
1492{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1493{ 732, "tilde","small tilde, U+02DC ISOdia" },
1494
1495{ 913, "Alpha","greek capital letter alpha, U+0391" },
1496{ 914, "Beta", "greek capital letter beta, U+0392" },
1497{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1498{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1499{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1500{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1501{ 919, "Eta", "greek capital letter eta, U+0397" },
1502{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1503{ 921, "Iota", "greek capital letter iota, U+0399" },
1504{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001505{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001506{ 924, "Mu", "greek capital letter mu, U+039C" },
1507{ 925, "Nu", "greek capital letter nu, U+039D" },
1508{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1509{ 927, "Omicron","greek capital letter omicron, U+039F" },
1510{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1511{ 929, "Rho", "greek capital letter rho, U+03A1" },
1512{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1513{ 932, "Tau", "greek capital letter tau, U+03A4" },
1514{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1515{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1516{ 935, "Chi", "greek capital letter chi, U+03A7" },
1517{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1518{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1519
1520{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1521{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1522{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1523{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1524{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1525{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1526{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1527{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1528{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1529{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1530{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1531{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1532{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1533{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1534{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1535{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1536{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1537{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1538{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1539{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1540{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1541{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1542{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1543{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1544{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1545{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1546{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1547{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1548
1549{ 8194, "ensp", "en space, U+2002 ISOpub" },
1550{ 8195, "emsp", "em space, U+2003 ISOpub" },
1551{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1552{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1553{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1554{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1555{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1556{ 8211, "ndash","en dash, U+2013 ISOpub" },
1557{ 8212, "mdash","em dash, U+2014 ISOpub" },
1558{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1559{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1560{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1561{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1562{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1563{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1564{ 8224, "dagger","dagger, U+2020 ISOpub" },
1565{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1566
1567{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1568{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1569
1570{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1571
1572{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1573{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1574
1575{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1576{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1577
1578{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1579{ 8260, "frasl","fraction slash, U+2044 NEW" },
1580
1581{ 8364, "euro", "euro sign, U+20AC NEW" },
1582
1583{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1584{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1585{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1586{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1587{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1588{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1589{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1590{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1591{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1592{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1593{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1594{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1595{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1596{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1597{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1598{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1599
1600{ 8704, "forall","for all, U+2200 ISOtech" },
1601{ 8706, "part", "partial differential, U+2202 ISOtech" },
1602{ 8707, "exist","there exists, U+2203 ISOtech" },
1603{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1604{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1605{ 8712, "isin", "element of, U+2208 ISOtech" },
1606{ 8713, "notin","not an element of, U+2209 ISOtech" },
1607{ 8715, "ni", "contains as member, U+220B ISOtech" },
1608{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001609{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001610{ 8722, "minus","minus sign, U+2212 ISOtech" },
1611{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1612{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1613{ 8733, "prop", "proportional to, U+221D ISOtech" },
1614{ 8734, "infin","infinity, U+221E ISOtech" },
1615{ 8736, "ang", "angle, U+2220 ISOamso" },
1616{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1617{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1618{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1619{ 8746, "cup", "union = cup, U+222A ISOtech" },
1620{ 8747, "int", "integral, U+222B ISOtech" },
1621{ 8756, "there4","therefore, U+2234 ISOtech" },
1622{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1623{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1624{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1625{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1626{ 8801, "equiv","identical to, U+2261 ISOtech" },
1627{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1628{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1629{ 8834, "sub", "subset of, U+2282 ISOtech" },
1630{ 8835, "sup", "superset of, U+2283 ISOtech" },
1631{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1632{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1633{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1634{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1635{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1636{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1637{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1638{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1639{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1640{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1641{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1642{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1643{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1644{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1645
1646{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1647{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1648{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1649{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1650
1651};
1652
1653/************************************************************************
1654 * *
1655 * Commodity functions to handle entities *
1656 * *
1657 ************************************************************************/
1658
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates buffer to that new size.
 * The value returned by xmlRealloc() is staged in a temporary so that,
 * when the reallocation fails, the previous block is still reachable and
 * is released before the enclosing function returns NULL (assigning the
 * result straight back to buffer would leak it).
 *
 * The macro has to be expanded inside a function returning a pointer,
 * with both `buffer` (an xmlChar *) and `buffer_size` in scope.  Any
 * other pointer into the old block must be recomputed by the caller
 * after the expansion.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *buffer##_grown;                                            \
    buffer##_size *= 2;                                                 \
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,                     \
                                 buffer##_size * sizeof(xmlChar));      \
    if (buffer##_grown == NULL) {                                       \
        xmlGenericError(xmlGenericErrorContext, "realloc failed\n");    \
        xmlFree(buffer);                                                \
        buffer = NULL;                                                  \
        return(NULL);                                                   \
    }                                                                   \
    buffer = buffer##_grown;                                            \
}
1670
1671/**
1672 * htmlEntityLookup:
1673 * @name: the entity name
1674 *
1675 * Lookup the given entity in EntitiesTable
1676 *
1677 * TODO: the linear scan is really ugly, an hash table is really needed.
1678 *
1679 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1680 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001681const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001682htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001683 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001684
1685 for (i = 0;i < (sizeof(html40EntitiesTable)/
1686 sizeof(html40EntitiesTable[0]));i++) {
1687 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1688#ifdef DEBUG
1689 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1690#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001691 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001692 }
1693 }
1694 return(NULL);
1695}
1696
1697/**
1698 * htmlEntityValueLookup:
1699 * @value: the entity's unicode value
1700 *
1701 * Lookup the given entity in EntitiesTable
1702 *
1703 * TODO: the linear scan is really ugly, an hash table is really needed.
1704 *
1705 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1706 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001707const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001708htmlEntityValueLookup(unsigned int value) {
1709 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001710#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001711 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001712#endif
1713
1714 for (i = 0;i < (sizeof(html40EntitiesTable)/
1715 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001716 if (html40EntitiesTable[i].value >= value) {
1717 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001718 break;
1719#ifdef DEBUG
1720 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1721#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001722 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001723 }
1724#ifdef DEBUG
1725 if (lv > html40EntitiesTable[i].value) {
1726 xmlGenericError(xmlGenericErrorContext,
1727 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1728 lv, html40EntitiesTable[i].value);
1729 }
1730 lv = html40EntitiesTable[i].value;
1731#endif
1732 }
1733 return(NULL);
1734}
1735
1736/**
1737 * UTF8ToHtml:
1738 * @out: a pointer to an array of bytes to store the result
1739 * @outlen: the length of @out
1740 * @in: a pointer to an array of UTF-8 chars
1741 * @inlen: the length of @in
1742 *
1743 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1744 * plus HTML entities block of chars out.
1745 *
1746 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1747 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001748 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001749 * The value of @outlen after return is the number of octets consumed.
1750 */
1751int
1752UTF8ToHtml(unsigned char* out, int *outlen,
1753 const unsigned char* in, int *inlen) {
1754 const unsigned char* processed = in;
1755 const unsigned char* outend;
1756 const unsigned char* outstart = out;
1757 const unsigned char* instart = in;
1758 const unsigned char* inend;
1759 unsigned int c, d;
1760 int trailing;
1761
1762 if (in == NULL) {
1763 /*
1764 * initialization nothing to do
1765 */
1766 *outlen = 0;
1767 *inlen = 0;
1768 return(0);
1769 }
1770 inend = in + (*inlen);
1771 outend = out + (*outlen);
1772 while (in < inend) {
1773 d = *in++;
1774 if (d < 0x80) { c= d; trailing= 0; }
1775 else if (d < 0xC0) {
1776 /* trailing byte in leading position */
1777 *outlen = out - outstart;
1778 *inlen = processed - instart;
1779 return(-2);
1780 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1781 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1782 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1783 else {
1784 /* no chance for this in Ascii */
1785 *outlen = out - outstart;
1786 *inlen = processed - instart;
1787 return(-2);
1788 }
1789
1790 if (inend - in < trailing) {
1791 break;
1792 }
1793
1794 for ( ; trailing; trailing--) {
1795 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1796 break;
1797 c <<= 6;
1798 c |= d & 0x3F;
1799 }
1800
1801 /* assertion: c is a single UTF-4 value */
1802 if (c < 0x80) {
1803 if (out + 1 >= outend)
1804 break;
1805 *out++ = c;
1806 } else {
1807 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001808 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001809
1810 /*
1811 * Try to lookup a predefined HTML entity for it
1812 */
1813
1814 ent = htmlEntityValueLookup(c);
1815 if (ent == NULL) {
1816 /* no chance for this in Ascii */
1817 *outlen = out - outstart;
1818 *inlen = processed - instart;
1819 return(-2);
1820 }
1821 len = strlen(ent->name);
1822 if (out + 2 + len >= outend)
1823 break;
1824 *out++ = '&';
1825 memcpy(out, ent->name, len);
1826 out += len;
1827 *out++ = ';';
1828 }
1829 processed = in;
1830 }
1831 *outlen = out - outstart;
1832 *inlen = processed - instart;
1833 return(0);
1834}
1835
1836/**
1837 * htmlEncodeEntities:
1838 * @out: a pointer to an array of bytes to store the result
1839 * @outlen: the length of @out
1840 * @in: a pointer to an array of UTF-8 chars
1841 * @inlen: the length of @in
1842 * @quoteChar: the quote character to escape (' or ") or zero.
1843 *
1844 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1845 * plus HTML entities block of chars out.
1846 *
1847 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1848 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001849 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001850 * The value of @outlen after return is the number of octets consumed.
1851 */
1852int
1853htmlEncodeEntities(unsigned char* out, int *outlen,
1854 const unsigned char* in, int *inlen, int quoteChar) {
1855 const unsigned char* processed = in;
1856 const unsigned char* outend = out + (*outlen);
1857 const unsigned char* outstart = out;
1858 const unsigned char* instart = in;
1859 const unsigned char* inend = in + (*inlen);
1860 unsigned int c, d;
1861 int trailing;
1862
1863 while (in < inend) {
1864 d = *in++;
1865 if (d < 0x80) { c= d; trailing= 0; }
1866 else if (d < 0xC0) {
1867 /* trailing byte in leading position */
1868 *outlen = out - outstart;
1869 *inlen = processed - instart;
1870 return(-2);
1871 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1872 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1873 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1874 else {
1875 /* no chance for this in Ascii */
1876 *outlen = out - outstart;
1877 *inlen = processed - instart;
1878 return(-2);
1879 }
1880
1881 if (inend - in < trailing)
1882 break;
1883
1884 while (trailing--) {
1885 if (((d= *in++) & 0xC0) != 0x80) {
1886 *outlen = out - outstart;
1887 *inlen = processed - instart;
1888 return(-2);
1889 }
1890 c <<= 6;
1891 c |= d & 0x3F;
1892 }
1893
1894 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001895 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1896 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001897 if (out >= outend)
1898 break;
1899 *out++ = c;
1900 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001901 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001902 const char *cp;
1903 char nbuf[16];
1904 int len;
1905
1906 /*
1907 * Try to lookup a predefined HTML entity for it
1908 */
1909 ent = htmlEntityValueLookup(c);
1910 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001911 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001912 cp = nbuf;
1913 }
1914 else
1915 cp = ent->name;
1916 len = strlen(cp);
1917 if (out + 2 + len > outend)
1918 break;
1919 *out++ = '&';
1920 memcpy(out, cp, len);
1921 out += len;
1922 *out++ = ';';
1923 }
1924 processed = in;
1925 }
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(0);
1929}
1930
1931/**
1932 * htmlDecodeEntities:
1933 * @ctxt: the parser context
1934 * @len: the len to decode (in bytes !), -1 for no size limit
1935 * @end: an end marker xmlChar, 0 if none
1936 * @end2: an end marker xmlChar, 0 if none
1937 * @end3: an end marker xmlChar, 0 if none
1938 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001939 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001940 *
1941 * DEPRECATED !!!!
1942 *
1943 * Returns A newly allocated string with the substitution done. The caller
1944 * must deallocate it !
1945 */
1946xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001947htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1948 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949 static int deprecated = 0;
1950 if (!deprecated) {
1951 xmlGenericError(xmlGenericErrorContext,
1952 "htmlDecodeEntities() deprecated function reached\n");
1953 deprecated = 1;
1954 }
1955 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001956}
1957
1958/************************************************************************
1959 * *
1960 * Commodity functions to handle streams *
1961 * *
1962 ************************************************************************/
1963
1964/**
Owen Taylor3473f882001-02-23 17:55:21 +00001965 * htmlNewInputStream:
1966 * @ctxt: an HTML parser context
1967 *
1968 * Create a new input stream structure
1969 * Returns the new input stream or NULL
1970 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001971static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001972htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1973 htmlParserInputPtr input;
1974
1975 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1976 if (input == NULL) {
1977 ctxt->errNo = XML_ERR_NO_MEMORY;
1978 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1979 ctxt->sax->error(ctxt->userData,
1980 "malloc: couldn't allocate a new input stream\n");
1981 return(NULL);
1982 }
1983 memset(input, 0, sizeof(htmlParserInput));
1984 input->filename = NULL;
1985 input->directory = NULL;
1986 input->base = NULL;
1987 input->cur = NULL;
1988 input->buf = NULL;
1989 input->line = 1;
1990 input->col = 1;
1991 input->buf = NULL;
1992 input->free = NULL;
1993 input->version = NULL;
1994 input->consumed = 0;
1995 input->length = 0;
1996 return(input);
1997}
1998
1999
2000/************************************************************************
2001 * *
2002 * Commodity functions, cleanup needed ? *
2003 * *
2004 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The table is read-only: both the strings and the array of pointers
 * are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002019
2020/**
2021 * areBlanks:
2022 * @ctxt: an HTML parser context
2023 * @str: a xmlChar *
2024 * @len: the size of @str
2025 *
2026 * Is this a sequence of blank chars that one can ignore ?
2027 *
2028 * Returns 1 if ignorable 0 otherwise.
2029 */
2030
2031static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002032 unsigned int i;
2033 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002034 xmlNodePtr lastChild;
2035
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002036 for (j = 0;j < len;j++)
2037 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002038
2039 if (CUR == 0) return(1);
2040 if (CUR != '<') return(0);
2041 if (ctxt->name == NULL)
2042 return(1);
2043 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2044 return(1);
2045 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2046 return(1);
2047 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2048 return(1);
2049 if (ctxt->node == NULL) return(0);
2050 lastChild = xmlGetLastChild(ctxt->node);
2051 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002052 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2053 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002054 /* keep ws in constructs like ...<b> </b>...
2055 for all tags "b" allowing PCDATA */
2056 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2057 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2058 return(0);
2059 }
2060 }
Owen Taylor3473f882001-02-23 17:55:21 +00002061 } else if (xmlNodeIsText(lastChild)) {
2062 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002063 } else {
2064 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2065 for all tags "p" allowing PCDATA */
2066 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2067 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2068 return(0);
2069 }
2070 }
Owen Taylor3473f882001-02-23 17:55:21 +00002071 }
2072 return(1);
2073}
2074
2075/**
Owen Taylor3473f882001-02-23 17:55:21 +00002076 * htmlNewDocNoDtD:
2077 * @URI: URI for the dtd, or NULL
2078 * @ExternalID: the external ID of the DTD, or NULL
2079 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002080 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2081 * are NULL
2082 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002083 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002084 */
2085htmlDocPtr
2086htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2087 xmlDocPtr cur;
2088
2089 /*
2090 * Allocate a new document and fill the fields.
2091 */
2092 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2093 if (cur == NULL) {
2094 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002095 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002096 return(NULL);
2097 }
2098 memset(cur, 0, sizeof(xmlDoc));
2099
2100 cur->type = XML_HTML_DOCUMENT_NODE;
2101 cur->version = NULL;
2102 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002103 cur->doc = cur;
2104 cur->name = NULL;
2105 cur->children = NULL;
2106 cur->extSubset = NULL;
2107 cur->oldNs = NULL;
2108 cur->encoding = NULL;
2109 cur->standalone = 1;
2110 cur->compression = 0;
2111 cur->ids = NULL;
2112 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002113 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002114 if ((ExternalID != NULL) ||
2115 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00002116 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002117 return(cur);
2118}
2119
2120/**
2121 * htmlNewDoc:
2122 * @URI: URI for the dtd, or NULL
2123 * @ExternalID: the external ID of the DTD, or NULL
2124 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002125 * Creates a new HTML document
2126 *
Owen Taylor3473f882001-02-23 17:55:21 +00002127 * Returns a new document
2128 */
2129htmlDocPtr
2130htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2131 if ((URI == NULL) && (ExternalID == NULL))
2132 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002133 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2134 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002135
2136 return(htmlNewDocNoDtD(URI, ExternalID));
2137}
2138
2139
2140/************************************************************************
2141 * *
2142 * The parser itself *
2143 * Relates to http://www.w3.org/TR/html40 *
2144 * *
2145 ************************************************************************/
2146
2147/************************************************************************
2148 * *
2149 * The parser itself *
2150 * *
2151 ************************************************************************/
2152
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002153static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2154
Owen Taylor3473f882001-02-23 17:55:21 +00002155/**
2156 * htmlParseHTMLName:
2157 * @ctxt: an HTML parser context
2158 *
2159 * parse an HTML tag or attribute name, note that we convert it to lowercase
2160 * since HTML names are not case-sensitive.
2161 *
2162 * Returns the Tag Name parsed or NULL
2163 */
2164
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002165static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002166htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2167 xmlChar *ret = NULL;
2168 int i = 0;
2169 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2170
2171 if (!IS_LETTER(CUR) && (CUR != '_') &&
2172 (CUR != ':')) return(NULL);
2173
2174 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2175 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2176 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2177 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2178 else loc[i] = CUR;
2179 i++;
2180
2181 NEXT;
2182 }
2183
2184 ret = xmlStrndup(loc, i);
2185
2186 return(ret);
2187}
2188
2189/**
2190 * htmlParseName:
2191 * @ctxt: an HTML parser context
2192 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002193 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002194 *
2195 * Returns the Name parsed or NULL
2196 */
2197
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002198static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002199htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002200 const xmlChar *in;
2201 xmlChar *ret;
2202 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002203
2204 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002205
2206 /*
2207 * Accelerator for simple ASCII names
2208 */
2209 in = ctxt->input->cur;
2210 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2211 ((*in >= 0x41) && (*in <= 0x5A)) ||
2212 (*in == '_') || (*in == ':')) {
2213 in++;
2214 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2215 ((*in >= 0x41) && (*in <= 0x5A)) ||
2216 ((*in >= 0x30) && (*in <= 0x39)) ||
2217 (*in == '_') || (*in == '-') ||
2218 (*in == ':') || (*in == '.'))
2219 in++;
2220 if ((*in > 0) && (*in < 0x80)) {
2221 count = in - ctxt->input->cur;
2222 ret = xmlStrndup(ctxt->input->cur, count);
2223 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002224 ctxt->nbChars += count;
2225 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002226 return(ret);
2227 }
2228 }
2229 return(htmlParseNameComplex(ctxt));
2230}
2231
2232static xmlChar *
2233htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2234 xmlChar buf[XML_MAX_NAMELEN + 5];
2235 int len = 0, l;
2236 int c;
2237 int count = 0;
2238
2239 /*
2240 * Handler for more complex cases
2241 */
2242 GROW;
2243 c = CUR_CHAR(l);
2244 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2245 (!IS_LETTER(c) && (c != '_') &&
2246 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002247 return(NULL);
2248 }
2249
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002250 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2251 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2252 (c == '.') || (c == '-') ||
2253 (c == '_') || (c == ':') ||
2254 (IS_COMBINING(c)) ||
2255 (IS_EXTENDER(c)))) {
2256 if (count++ > 100) {
2257 count = 0;
2258 GROW;
2259 }
2260 COPY_BUF(l,buf,len,c);
2261 NEXTL(l);
2262 c = CUR_CHAR(l);
2263 if (len >= XML_MAX_NAMELEN) {
2264 /*
2265 * Okay someone managed to make a huge name, so he's ready to pay
2266 * for the processing speed.
2267 */
2268 xmlChar *buffer;
2269 int max = len * 2;
2270
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002271 buffer = (xmlChar *) xmlMallocAtomic(max * sizeof(xmlChar));
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002272 if (buffer == NULL) {
2273 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2274 ctxt->sax->error(ctxt->userData,
2275 "htmlParseNameComplex: out of memory\n");
2276 return(NULL);
2277 }
2278 memcpy(buffer, buf, len);
2279 while ((IS_LETTER(c)) || (IS_DIGIT(c)) || /* test bigname.xml */
2280 (c == '.') || (c == '-') ||
2281 (c == '_') || (c == ':') ||
2282 (IS_COMBINING(c)) ||
2283 (IS_EXTENDER(c))) {
2284 if (count++ > 100) {
2285 count = 0;
2286 GROW;
2287 }
2288 if (len + 10 > max) {
2289 max *= 2;
2290 buffer = (xmlChar *) xmlRealloc(buffer,
2291 max * sizeof(xmlChar));
2292 if (buffer == NULL) {
2293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2294 ctxt->sax->error(ctxt->userData,
2295 "htmlParseNameComplex: out of memory\n");
2296 return(NULL);
2297 }
2298 }
2299 COPY_BUF(l,buffer,len,c);
2300 NEXTL(l);
2301 c = CUR_CHAR(l);
2302 }
2303 buffer[len] = 0;
2304 return(buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00002305 }
2306 }
2307 return(xmlStrndup(buf, len));
2308}
2309
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002310
Owen Taylor3473f882001-02-23 17:55:21 +00002311/**
2312 * htmlParseHTMLAttribute:
2313 * @ctxt: an HTML parser context
2314 * @stop: a char stop value
2315 *
2316 * parse an HTML attribute value till the stop (quote), if
2317 * stop is 0 then it stops at the first space
2318 *
2319 * Returns the attribute parsed or NULL
2320 */
2321
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002322static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002323htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2324 xmlChar *buffer = NULL;
2325 int buffer_size = 0;
2326 xmlChar *out = NULL;
2327 xmlChar *name = NULL;
2328
2329 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002330 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002331
2332 /*
2333 * allocate a translation buffer.
2334 */
2335 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002336 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002337 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002338 xmlGenericError(xmlGenericErrorContext,
2339 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002340 return(NULL);
2341 }
2342 out = buffer;
2343
2344 /*
2345 * Ok loop until we reach one of the ending chars
2346 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002347 while ((CUR != 0) && (CUR != stop)) {
2348 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002349 if ((stop == 0) && (IS_BLANK(CUR))) break;
2350 if (CUR == '&') {
2351 if (NXT(1) == '#') {
2352 unsigned int c;
2353 int bits;
2354
2355 c = htmlParseCharRef(ctxt);
2356 if (c < 0x80)
2357 { *out++ = c; bits= -6; }
2358 else if (c < 0x800)
2359 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2360 else if (c < 0x10000)
2361 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2362 else
2363 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2364
2365 for ( ; bits >= 0; bits-= 6) {
2366 *out++ = ((c >> bits) & 0x3F) | 0x80;
2367 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002368
2369 if (out - buffer > buffer_size - 100) {
2370 int indx = out - buffer;
2371
2372 growBuffer(buffer);
2373 out = &buffer[indx];
2374 }
Owen Taylor3473f882001-02-23 17:55:21 +00002375 } else {
2376 ent = htmlParseEntityRef(ctxt, &name);
2377 if (name == NULL) {
2378 *out++ = '&';
2379 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002380 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002381
2382 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002383 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002384 }
2385 } else if (ent == NULL) {
2386 *out++ = '&';
2387 cur = name;
2388 while (*cur != 0) {
2389 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002390 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002391
2392 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002393 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002394 }
2395 *out++ = *cur++;
2396 }
2397 xmlFree(name);
2398 } else {
2399 unsigned int c;
2400 int bits;
2401
2402 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002403 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002404
2405 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002406 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002407 }
2408 c = (xmlChar)ent->value;
2409 if (c < 0x80)
2410 { *out++ = c; bits= -6; }
2411 else if (c < 0x800)
2412 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2413 else if (c < 0x10000)
2414 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2415 else
2416 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2417
2418 for ( ; bits >= 0; bits-= 6) {
2419 *out++ = ((c >> bits) & 0x3F) | 0x80;
2420 }
2421 xmlFree(name);
2422 }
2423 }
2424 } else {
2425 unsigned int c;
2426 int bits, l;
2427
2428 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002429 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002430
2431 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002432 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002433 }
2434 c = CUR_CHAR(l);
2435 if (c < 0x80)
2436 { *out++ = c; bits= -6; }
2437 else if (c < 0x800)
2438 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2439 else if (c < 0x10000)
2440 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2441 else
2442 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2443
2444 for ( ; bits >= 0; bits-= 6) {
2445 *out++ = ((c >> bits) & 0x3F) | 0x80;
2446 }
2447 NEXT;
2448 }
2449 }
2450 *out++ = 0;
2451 return(buffer);
2452}
2453
2454/**
Owen Taylor3473f882001-02-23 17:55:21 +00002455 * htmlParseEntityRef:
2456 * @ctxt: an HTML parser context
2457 * @str: location to store the entity name
2458 *
2459 * parse an HTML ENTITY references
2460 *
2461 * [68] EntityRef ::= '&' Name ';'
2462 *
2463 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2464 * if non-NULL *str will have to be freed by the caller.
2465 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002466const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002467htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2468 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002469 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002470 *str = NULL;
2471
2472 if (CUR == '&') {
2473 NEXT;
2474 name = htmlParseName(ctxt);
2475 if (name == NULL) {
2476 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2477 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2478 ctxt->wellFormed = 0;
2479 } else {
2480 GROW;
2481 if (CUR == ';') {
2482 *str = name;
2483
2484 /*
2485 * Lookup the entity in the table.
2486 */
2487 ent = htmlEntityLookup(name);
2488 if (ent != NULL) /* OK that's ugly !!! */
2489 NEXT;
2490 } else {
2491 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2492 ctxt->sax->error(ctxt->userData,
2493 "htmlParseEntityRef: expecting ';'\n");
2494 *str = name;
2495 }
2496 }
2497 }
2498 return(ent);
2499}
2500
2501/**
2502 * htmlParseAttValue:
2503 * @ctxt: an HTML parser context
2504 *
2505 * parse a value for an attribute
2506 * Note: the parser won't do substitution of entities here, this
2507 * will be handled later in xmlStringGetNodeList, unless it was
2508 * asked for ctxt->replaceEntities != 0
2509 *
2510 * Returns the AttValue parsed or NULL.
2511 */
2512
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002513static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002514htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2515 xmlChar *ret = NULL;
2516
2517 if (CUR == '"') {
2518 NEXT;
2519 ret = htmlParseHTMLAttribute(ctxt, '"');
2520 if (CUR != '"') {
2521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2522 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2523 ctxt->wellFormed = 0;
2524 } else
2525 NEXT;
2526 } else if (CUR == '\'') {
2527 NEXT;
2528 ret = htmlParseHTMLAttribute(ctxt, '\'');
2529 if (CUR != '\'') {
2530 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2531 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2532 ctxt->wellFormed = 0;
2533 } else
2534 NEXT;
2535 } else {
2536 /*
2537 * That's an HTMLism, the attribute value may not be quoted
2538 */
2539 ret = htmlParseHTMLAttribute(ctxt, 0);
2540 if (ret == NULL) {
2541 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2542 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2543 ctxt->wellFormed = 0;
2544 }
2545 }
2546 return(ret);
2547}
2548
2549/**
2550 * htmlParseSystemLiteral:
2551 * @ctxt: an HTML parser context
2552 *
2553 * parse an HTML Literal
2554 *
2555 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2556 *
2557 * Returns the SystemLiteral parsed or NULL
2558 */
2559
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002560static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002561htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2562 const xmlChar *q;
2563 xmlChar *ret = NULL;
2564
2565 if (CUR == '"') {
2566 NEXT;
2567 q = CUR_PTR;
2568 while ((IS_CHAR(CUR)) && (CUR != '"'))
2569 NEXT;
2570 if (!IS_CHAR(CUR)) {
2571 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2572 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2573 ctxt->wellFormed = 0;
2574 } else {
2575 ret = xmlStrndup(q, CUR_PTR - q);
2576 NEXT;
2577 }
2578 } else if (CUR == '\'') {
2579 NEXT;
2580 q = CUR_PTR;
2581 while ((IS_CHAR(CUR)) && (CUR != '\''))
2582 NEXT;
2583 if (!IS_CHAR(CUR)) {
2584 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2585 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2586 ctxt->wellFormed = 0;
2587 } else {
2588 ret = xmlStrndup(q, CUR_PTR - q);
2589 NEXT;
2590 }
2591 } else {
2592 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2593 ctxt->sax->error(ctxt->userData,
2594 "SystemLiteral \" or ' expected\n");
2595 ctxt->wellFormed = 0;
2596 }
2597
2598 return(ret);
2599}
2600
2601/**
2602 * htmlParsePubidLiteral:
2603 * @ctxt: an HTML parser context
2604 *
2605 * parse an HTML public literal
2606 *
2607 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2608 *
2609 * Returns the PubidLiteral parsed or NULL.
2610 */
2611
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002612static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002613htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2614 const xmlChar *q;
2615 xmlChar *ret = NULL;
2616 /*
2617 * Name ::= (Letter | '_') (NameChar)*
2618 */
2619 if (CUR == '"') {
2620 NEXT;
2621 q = CUR_PTR;
2622 while (IS_PUBIDCHAR(CUR)) NEXT;
2623 if (CUR != '"') {
2624 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2625 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2626 ctxt->wellFormed = 0;
2627 } else {
2628 ret = xmlStrndup(q, CUR_PTR - q);
2629 NEXT;
2630 }
2631 } else if (CUR == '\'') {
2632 NEXT;
2633 q = CUR_PTR;
Daniel Veillard6560a422003-03-27 21:25:38 +00002634 while ((IS_PUBIDCHAR(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002635 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002636 if (CUR != '\'') {
Owen Taylor3473f882001-02-23 17:55:21 +00002637 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2638 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2639 ctxt->wellFormed = 0;
2640 } else {
2641 ret = xmlStrndup(q, CUR_PTR - q);
2642 NEXT;
2643 }
2644 } else {
2645 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2646 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2647 ctxt->wellFormed = 0;
2648 }
2649
2650 return(ret);
2651}
2652
2653/**
2654 * htmlParseScript:
2655 * @ctxt: an HTML parser context
2656 *
2657 * parse the content of an HTML SCRIPT or STYLE element
2658 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2659 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2660 * http://www.w3.org/TR/html4/types.html#type-script
2661 * http://www.w3.org/TR/html4/types.html#h-6.15
2662 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2663 *
2664 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2665 * element and the value of intrinsic event attributes. User agents must
2666 * not evaluate script data as HTML markup but instead must pass it on as
2667 * data to a script engine.
2668 * NOTES:
2669 * - The content is passed like CDATA
2670 * - the attributes for style and scripting "onXXX" are also described
2671 * as CDATA but SGML allows entities references in attributes so their
2672 * processing is identical as other attributes
2673 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002674static void
Owen Taylor3473f882001-02-23 17:55:21 +00002675htmlParseScript(htmlParserCtxtPtr ctxt) {
2676 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2677 int nbchar = 0;
2678 xmlChar cur;
2679
2680 SHRINK;
2681 cur = CUR;
2682 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002683 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2684 (NXT(3) == '-')) {
2685 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2686 if (ctxt->sax->cdataBlock!= NULL) {
2687 /*
2688 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2689 */
2690 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002691 } else if (ctxt->sax->characters != NULL) {
2692 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002693 }
2694 }
2695 nbchar = 0;
2696 htmlParseComment(ctxt);
2697 cur = CUR;
2698 continue;
2699 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002700 /*
2701 * One should break here, the specification is clear:
2702 * Authors should therefore escape "</" within the content.
2703 * Escape mechanisms are specific to each scripting or
2704 * style sheet language.
2705 */
2706 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2707 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2708 break; /* while */
2709 }
2710 buf[nbchar++] = cur;
2711 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2712 if (ctxt->sax->cdataBlock!= NULL) {
2713 /*
2714 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2715 */
2716 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002717 } else if (ctxt->sax->characters != NULL) {
2718 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002719 }
2720 nbchar = 0;
2721 }
2722 NEXT;
2723 cur = CUR;
2724 }
2725 if (!(IS_CHAR(cur))) {
2726 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2727 ctxt->sax->error(ctxt->userData,
2728 "Invalid char in CDATA 0x%X\n", cur);
2729 ctxt->wellFormed = 0;
2730 NEXT;
2731 }
2732
2733 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2734 if (ctxt->sax->cdataBlock!= NULL) {
2735 /*
2736 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2737 */
2738 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002739 } else if (ctxt->sax->characters != NULL) {
2740 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002741 }
2742 }
2743}
2744
2745
2746/**
2747 * htmlParseCharData:
2748 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002749 *
2750 * parse a CharData section.
2751 * if we are within a CDATA section ']]>' marks an end of section.
2752 *
2753 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2754 */
2755
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002756static void
2757htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002758 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2759 int nbchar = 0;
2760 int cur, l;
2761
2762 SHRINK;
2763 cur = CUR_CHAR(l);
2764 while (((cur != '<') || (ctxt->token == '<')) &&
2765 ((cur != '&') || (ctxt->token == '&')) &&
2766 (IS_CHAR(cur))) {
2767 COPY_BUF(l,buf,nbchar,cur);
2768 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2769 /*
2770 * Ok the segment is to be consumed as chars.
2771 */
2772 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2773 if (areBlanks(ctxt, buf, nbchar)) {
2774 if (ctxt->sax->ignorableWhitespace != NULL)
2775 ctxt->sax->ignorableWhitespace(ctxt->userData,
2776 buf, nbchar);
2777 } else {
2778 htmlCheckParagraph(ctxt);
2779 if (ctxt->sax->characters != NULL)
2780 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2781 }
2782 }
2783 nbchar = 0;
2784 }
2785 NEXTL(l);
2786 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002787 if (cur == 0) {
2788 SHRINK;
2789 GROW;
2790 cur = CUR_CHAR(l);
2791 }
Owen Taylor3473f882001-02-23 17:55:21 +00002792 }
2793 if (nbchar != 0) {
2794 /*
2795 * Ok the segment is to be consumed as chars.
2796 */
2797 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2798 if (areBlanks(ctxt, buf, nbchar)) {
2799 if (ctxt->sax->ignorableWhitespace != NULL)
2800 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2801 } else {
2802 htmlCheckParagraph(ctxt);
2803 if (ctxt->sax->characters != NULL)
2804 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2805 }
2806 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002807 } else {
2808 /*
2809 * Loop detection
2810 */
2811 if (cur == 0)
2812 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002813 }
2814}
2815
2816/**
2817 * htmlParseExternalID:
2818 * @ctxt: an HTML parser context
2819 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002820 *
2821 * Parse an External ID or a Public ID
2822 *
Owen Taylor3473f882001-02-23 17:55:21 +00002823 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2824 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2825 *
2826 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2827 *
2828 * Returns the function returns SystemLiteral and in the second
2829 * case publicID receives PubidLiteral, is strict is off
2830 * it is possible to return NULL and have publicID set.
2831 */
2832
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002833static xmlChar *
2834htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002835 xmlChar *URI = NULL;
2836
2837 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2838 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2839 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2840 SKIP(6);
2841 if (!IS_BLANK(CUR)) {
2842 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2843 ctxt->sax->error(ctxt->userData,
2844 "Space required after 'SYSTEM'\n");
2845 ctxt->wellFormed = 0;
2846 }
2847 SKIP_BLANKS;
2848 URI = htmlParseSystemLiteral(ctxt);
2849 if (URI == NULL) {
2850 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2851 ctxt->sax->error(ctxt->userData,
2852 "htmlParseExternalID: SYSTEM, no URI\n");
2853 ctxt->wellFormed = 0;
2854 }
2855 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2856 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2857 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2858 SKIP(6);
2859 if (!IS_BLANK(CUR)) {
2860 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2861 ctxt->sax->error(ctxt->userData,
2862 "Space required after 'PUBLIC'\n");
2863 ctxt->wellFormed = 0;
2864 }
2865 SKIP_BLANKS;
2866 *publicID = htmlParsePubidLiteral(ctxt);
2867 if (*publicID == NULL) {
2868 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2869 ctxt->sax->error(ctxt->userData,
2870 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2871 ctxt->wellFormed = 0;
2872 }
2873 SKIP_BLANKS;
2874 if ((CUR == '"') || (CUR == '\'')) {
2875 URI = htmlParseSystemLiteral(ctxt);
2876 }
2877 }
2878 return(URI);
2879}
2880
2881/**
2882 * htmlParseComment:
2883 * @ctxt: an HTML parser context
2884 *
2885 * Parse an XML (SGML) comment <!-- .... -->
2886 *
2887 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2888 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002889static void
Owen Taylor3473f882001-02-23 17:55:21 +00002890htmlParseComment(htmlParserCtxtPtr ctxt) {
2891 xmlChar *buf = NULL;
2892 int len;
2893 int size = HTML_PARSER_BUFFER_SIZE;
2894 int q, ql;
2895 int r, rl;
2896 int cur, l;
2897 xmlParserInputState state;
2898
2899 /*
2900 * Check that there is a comment right here.
2901 */
2902 if ((RAW != '<') || (NXT(1) != '!') ||
2903 (NXT(2) != '-') || (NXT(3) != '-')) return;
2904
2905 state = ctxt->instate;
2906 ctxt->instate = XML_PARSER_COMMENT;
2907 SHRINK;
2908 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002909 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002910 if (buf == NULL) {
2911 xmlGenericError(xmlGenericErrorContext,
2912 "malloc of %d byte failed\n", size);
2913 ctxt->instate = state;
2914 return;
2915 }
2916 q = CUR_CHAR(ql);
2917 NEXTL(ql);
2918 r = CUR_CHAR(rl);
2919 NEXTL(rl);
2920 cur = CUR_CHAR(l);
2921 len = 0;
2922 while (IS_CHAR(cur) &&
2923 ((cur != '>') ||
2924 (r != '-') || (q != '-'))) {
2925 if (len + 5 >= size) {
2926 size *= 2;
2927 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2928 if (buf == NULL) {
2929 xmlGenericError(xmlGenericErrorContext,
2930 "realloc of %d byte failed\n", size);
2931 ctxt->instate = state;
2932 return;
2933 }
2934 }
2935 COPY_BUF(ql,buf,len,q);
2936 q = r;
2937 ql = rl;
2938 r = cur;
2939 rl = l;
2940 NEXTL(l);
2941 cur = CUR_CHAR(l);
2942 if (cur == 0) {
2943 SHRINK;
2944 GROW;
2945 cur = CUR_CHAR(l);
2946 }
2947 }
2948 buf[len] = 0;
2949 if (!IS_CHAR(cur)) {
2950 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2951 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2952 ctxt->sax->error(ctxt->userData,
2953 "Comment not terminated \n<!--%.50s\n", buf);
2954 ctxt->wellFormed = 0;
2955 xmlFree(buf);
2956 } else {
2957 NEXT;
2958 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2959 (!ctxt->disableSAX))
2960 ctxt->sax->comment(ctxt->userData, buf);
2961 xmlFree(buf);
2962 }
2963 ctxt->instate = state;
2964}
2965
2966/**
2967 * htmlParseCharRef:
2968 * @ctxt: an HTML parser context
2969 *
2970 * parse Reference declarations
2971 *
2972 * [66] CharRef ::= '&#' [0-9]+ ';' |
2973 * '&#x' [0-9a-fA-F]+ ';'
2974 *
2975 * Returns the value parsed (as an int)
2976 */
2977int
2978htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2979 int val = 0;
2980
2981 if ((CUR == '&') && (NXT(1) == '#') &&
2982 (NXT(2) == 'x')) {
2983 SKIP(3);
2984 while (CUR != ';') {
2985 if ((CUR >= '0') && (CUR <= '9'))
2986 val = val * 16 + (CUR - '0');
2987 else if ((CUR >= 'a') && (CUR <= 'f'))
2988 val = val * 16 + (CUR - 'a') + 10;
2989 else if ((CUR >= 'A') && (CUR <= 'F'))
2990 val = val * 16 + (CUR - 'A') + 10;
2991 else {
2992 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2993 ctxt->sax->error(ctxt->userData,
2994 "htmlParseCharRef: invalid hexadecimal value\n");
2995 ctxt->wellFormed = 0;
2996 return(0);
2997 }
2998 NEXT;
2999 }
3000 if (CUR == ';')
3001 NEXT;
3002 } else if ((CUR == '&') && (NXT(1) == '#')) {
3003 SKIP(2);
3004 while (CUR != ';') {
3005 if ((CUR >= '0') && (CUR <= '9'))
3006 val = val * 10 + (CUR - '0');
3007 else {
3008 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3009 ctxt->sax->error(ctxt->userData,
3010 "htmlParseCharRef: invalid decimal value\n");
3011 ctxt->wellFormed = 0;
3012 return(0);
3013 }
3014 NEXT;
3015 }
3016 if (CUR == ';')
3017 NEXT;
3018 } else {
3019 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3020 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
3021 ctxt->wellFormed = 0;
3022 }
3023 /*
3024 * Check the value IS_CHAR ...
3025 */
3026 if (IS_CHAR(val)) {
3027 return(val);
3028 } else {
3029 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3030 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
3031 val);
3032 ctxt->wellFormed = 0;
3033 }
3034 return(0);
3035}
3036
3037
3038/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003039 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003040 * @ctxt: an HTML parser context
3041 *
3042 * parse a DOCTYPE declaration
3043 *
3044 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3045 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3046 */
3047
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003048static void
Owen Taylor3473f882001-02-23 17:55:21 +00003049htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3050 xmlChar *name;
3051 xmlChar *ExternalID = NULL;
3052 xmlChar *URI = NULL;
3053
3054 /*
3055 * We know that '<!DOCTYPE' has been detected.
3056 */
3057 SKIP(9);
3058
3059 SKIP_BLANKS;
3060
3061 /*
3062 * Parse the DOCTYPE name.
3063 */
3064 name = htmlParseName(ctxt);
3065 if (name == NULL) {
3066 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3067 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
3068 ctxt->wellFormed = 0;
3069 }
3070 /*
3071 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3072 */
3073
3074 SKIP_BLANKS;
3075
3076 /*
3077 * Check for SystemID and ExternalID
3078 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003079 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003080 SKIP_BLANKS;
3081
3082 /*
3083 * We should be at the end of the DOCTYPE declaration.
3084 */
3085 if (CUR != '>') {
3086 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00003087 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003088 ctxt->wellFormed = 0;
3089 /* We shouldn't try to resynchronize ... */
3090 }
3091 NEXT;
3092
3093 /*
3094 * Create or update the document accordingly to the DOCTYPE
3095 */
3096 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3097 (!ctxt->disableSAX))
3098 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3099
3100 /*
3101 * Cleanup, since we don't use all those identifiers
3102 */
3103 if (URI != NULL) xmlFree(URI);
3104 if (ExternalID != NULL) xmlFree(ExternalID);
3105 if (name != NULL) xmlFree(name);
3106}
3107
3108/**
3109 * htmlParseAttribute:
3110 * @ctxt: an HTML parser context
3111 * @value: a xmlChar ** used to store the value of the attribute
3112 *
3113 * parse an attribute
3114 *
3115 * [41] Attribute ::= Name Eq AttValue
3116 *
3117 * [25] Eq ::= S? '=' S?
3118 *
3119 * With namespace:
3120 *
3121 * [NS 11] Attribute ::= QName Eq AttValue
3122 *
3123 * Also the case QName == xmlns:??? is handled independently as a namespace
3124 * definition.
3125 *
3126 * Returns the attribute name, and the value in *value.
3127 */
3128
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003129static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003130htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3131 xmlChar *name, *val = NULL;
3132
3133 *value = NULL;
3134 name = htmlParseHTMLName(ctxt);
3135 if (name == NULL) {
3136 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3137 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3138 ctxt->wellFormed = 0;
3139 return(NULL);
3140 }
3141
3142 /*
3143 * read the value
3144 */
3145 SKIP_BLANKS;
3146 if (CUR == '=') {
3147 NEXT;
3148 SKIP_BLANKS;
3149 val = htmlParseAttValue(ctxt);
3150 /******
3151 } else {
3152 * TODO : some attribute must have values, some may not
3153 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3154 ctxt->sax->warning(ctxt->userData,
3155 "No value for attribute %s\n", name); */
3156 }
3157
3158 *value = val;
3159 return(name);
3160}
3161
3162/**
3163 * htmlCheckEncoding:
3164 * @ctxt: an HTML parser context
3165 * @attvalue: the attribute value
3166 *
3167 * Checks an http-equiv attribute from a Meta tag to detect
3168 * the encoding
3169 * If a new encoding is detected the parser is switched to decode
3170 * it and pass UTF8
3171 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003172static void
Owen Taylor3473f882001-02-23 17:55:21 +00003173htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3174 const xmlChar *encoding;
3175
3176 if ((ctxt == NULL) || (attvalue == NULL))
3177 return;
3178
3179 /* do not change encoding */
3180 if (ctxt->input->encoding != NULL)
3181 return;
3182
3183 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3184 if (encoding != NULL) {
3185 encoding += 8;
3186 } else {
3187 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3188 if (encoding != NULL)
3189 encoding += 9;
3190 }
3191 if (encoding != NULL) {
3192 xmlCharEncoding enc;
3193 xmlCharEncodingHandlerPtr handler;
3194
3195 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3196
3197 if (ctxt->input->encoding != NULL)
3198 xmlFree((xmlChar *) ctxt->input->encoding);
3199 ctxt->input->encoding = xmlStrdup(encoding);
3200
3201 enc = xmlParseCharEncoding((const char *) encoding);
3202 /*
3203 * registered set of known encodings
3204 */
3205 if (enc != XML_CHAR_ENCODING_ERROR) {
3206 xmlSwitchEncoding(ctxt, enc);
3207 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3208 } else {
3209 /*
3210 * fallback for unknown encodings
3211 */
3212 handler = xmlFindCharEncodingHandler((const char *) encoding);
3213 if (handler != NULL) {
3214 xmlSwitchToEncoding(ctxt, handler);
3215 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3216 } else {
3217 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3218 }
3219 }
3220
3221 if ((ctxt->input->buf != NULL) &&
3222 (ctxt->input->buf->encoder != NULL) &&
3223 (ctxt->input->buf->raw != NULL) &&
3224 (ctxt->input->buf->buffer != NULL)) {
3225 int nbchars;
3226 int processed;
3227
3228 /*
3229 * convert as much as possible to the parser reading buffer.
3230 */
3231 processed = ctxt->input->cur - ctxt->input->base;
3232 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3233 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3234 ctxt->input->buf->buffer,
3235 ctxt->input->buf->raw);
3236 if (nbchars < 0) {
3237 ctxt->errNo = XML_ERR_INVALID_ENCODING;
3238 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3239 ctxt->sax->error(ctxt->userData,
3240 "htmlCheckEncoding: encoder error\n");
3241 }
3242 ctxt->input->base =
3243 ctxt->input->cur = ctxt->input->buf->buffer->content;
3244 }
3245 }
3246}
3247
3248/**
3249 * htmlCheckMeta:
3250 * @ctxt: an HTML parser context
3251 * @atts: the attributes values
3252 *
3253 * Checks an attributes from a Meta tag
3254 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003255static void
Owen Taylor3473f882001-02-23 17:55:21 +00003256htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3257 int i;
3258 const xmlChar *att, *value;
3259 int http = 0;
3260 const xmlChar *content = NULL;
3261
3262 if ((ctxt == NULL) || (atts == NULL))
3263 return;
3264
3265 i = 0;
3266 att = atts[i++];
3267 while (att != NULL) {
3268 value = atts[i++];
3269 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3270 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3271 http = 1;
3272 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3273 content = value;
3274 att = atts[i++];
3275 }
3276 if ((http) && (content != NULL))
3277 htmlCheckEncoding(ctxt, content);
3278
3279}
3280
3281/**
3282 * htmlParseStartTag:
3283 * @ctxt: an HTML parser context
3284 *
3285 * parse a start of tag either for rule element or
3286 * EmptyElement. In both case we don't parse the tag closing chars.
3287 *
3288 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3289 *
3290 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3291 *
3292 * With namespace:
3293 *
3294 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3295 *
3296 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3297 *
3298 */
3299
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003300static void
Owen Taylor3473f882001-02-23 17:55:21 +00003301htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3302 xmlChar *name;
3303 xmlChar *attname;
3304 xmlChar *attvalue;
3305 const xmlChar **atts = NULL;
3306 int nbatts = 0;
3307 int maxatts = 0;
3308 int meta = 0;
3309 int i;
3310
3311 if (CUR != '<') return;
3312 NEXT;
3313
3314 GROW;
3315 name = htmlParseHTMLName(ctxt);
3316 if (name == NULL) {
3317 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3318 ctxt->sax->error(ctxt->userData,
3319 "htmlParseStartTag: invalid element name\n");
3320 ctxt->wellFormed = 0;
3321 /* Dump the bogus tag like browsers do */
3322 while ((IS_CHAR(CUR)) && (CUR != '>'))
3323 NEXT;
3324 return;
3325 }
3326 if (xmlStrEqual(name, BAD_CAST"meta"))
3327 meta = 1;
3328
3329 /*
3330 * Check for auto-closure of HTML elements.
3331 */
3332 htmlAutoClose(ctxt, name);
3333
3334 /*
3335 * Check for implied HTML elements.
3336 */
3337 htmlCheckImplied(ctxt, name);
3338
3339 /*
3340 * Avoid html at any level > 0, head at any level != 1
3341 * or any attempt to recurse body
3342 */
3343 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3344 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3345 ctxt->sax->error(ctxt->userData,
3346 "htmlParseStartTag: misplaced <html> tag\n");
3347 ctxt->wellFormed = 0;
3348 xmlFree(name);
3349 return;
3350 }
3351 if ((ctxt->nameNr != 1) &&
3352 (xmlStrEqual(name, BAD_CAST"head"))) {
3353 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3354 ctxt->sax->error(ctxt->userData,
3355 "htmlParseStartTag: misplaced <head> tag\n");
3356 ctxt->wellFormed = 0;
3357 xmlFree(name);
3358 return;
3359 }
3360 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003361 int indx;
3362 for (indx = 0;indx < ctxt->nameNr;indx++) {
3363 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003364 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3365 ctxt->sax->error(ctxt->userData,
3366 "htmlParseStartTag: misplaced <body> tag\n");
3367 ctxt->wellFormed = 0;
3368 xmlFree(name);
3369 return;
3370 }
3371 }
3372 }
3373
3374 /*
3375 * Now parse the attributes, it ends up with the ending
3376 *
3377 * (S Attribute)* S?
3378 */
3379 SKIP_BLANKS;
3380 while ((IS_CHAR(CUR)) &&
3381 (CUR != '>') &&
3382 ((CUR != '/') || (NXT(1) != '>'))) {
3383 long cons = ctxt->nbChars;
3384
3385 GROW;
3386 attname = htmlParseAttribute(ctxt, &attvalue);
3387 if (attname != NULL) {
3388
3389 /*
3390 * Well formedness requires at most one declaration of an attribute
3391 */
3392 for (i = 0; i < nbatts;i += 2) {
3393 if (xmlStrEqual(atts[i], attname)) {
3394 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3395 ctxt->sax->error(ctxt->userData,
3396 "Attribute %s redefined\n",
3397 attname);
3398 ctxt->wellFormed = 0;
3399 xmlFree(attname);
3400 if (attvalue != NULL)
3401 xmlFree(attvalue);
3402 goto failed;
3403 }
3404 }
3405
3406 /*
3407 * Add the pair to atts
3408 */
3409 if (atts == NULL) {
3410 maxatts = 10;
3411 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3412 if (atts == NULL) {
3413 xmlGenericError(xmlGenericErrorContext,
3414 "malloc of %ld byte failed\n",
3415 maxatts * (long)sizeof(xmlChar *));
3416 if (name != NULL) xmlFree(name);
3417 return;
3418 }
3419 } else if (nbatts + 4 > maxatts) {
3420 maxatts *= 2;
3421 atts = (const xmlChar **) xmlRealloc((void *) atts,
3422 maxatts * sizeof(xmlChar *));
3423 if (atts == NULL) {
3424 xmlGenericError(xmlGenericErrorContext,
3425 "realloc of %ld byte failed\n",
3426 maxatts * (long)sizeof(xmlChar *));
3427 if (name != NULL) xmlFree(name);
3428 return;
3429 }
3430 }
3431 atts[nbatts++] = attname;
3432 atts[nbatts++] = attvalue;
3433 atts[nbatts] = NULL;
3434 atts[nbatts + 1] = NULL;
3435 }
3436 else {
3437 /* Dump the bogus attribute string up to the next blank or
3438 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003439 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3440 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003441 NEXT;
3442 }
3443
3444failed:
3445 SKIP_BLANKS;
3446 if (cons == ctxt->nbChars) {
3447 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3448 ctxt->sax->error(ctxt->userData,
3449 "htmlParseStartTag: problem parsing attributes\n");
3450 ctxt->wellFormed = 0;
3451 break;
3452 }
3453 }
3454
3455 /*
3456 * Handle specific association to the META tag
3457 */
3458 if (meta)
3459 htmlCheckMeta(ctxt, atts);
3460
3461 /*
3462 * SAX: Start of Element !
3463 */
3464 htmlnamePush(ctxt, xmlStrdup(name));
3465#ifdef DEBUG
3466 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3467#endif
3468 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3469 ctxt->sax->startElement(ctxt->userData, name, atts);
3470
3471 if (atts != NULL) {
3472 for (i = 0;i < nbatts;i++) {
3473 if (atts[i] != NULL)
3474 xmlFree((xmlChar *) atts[i]);
3475 }
3476 xmlFree((void *) atts);
3477 }
3478 if (name != NULL) xmlFree(name);
3479}
3480
3481/**
3482 * htmlParseEndTag:
3483 * @ctxt: an HTML parser context
3484 *
3485 * parse an end of tag
3486 *
3487 * [42] ETag ::= '</' Name S? '>'
3488 *
3489 * With namespace
3490 *
3491 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003492 *
3493 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003494 */
3495
Daniel Veillardf420ac52001-07-04 16:04:09 +00003496static int
Owen Taylor3473f882001-02-23 17:55:21 +00003497htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3498 xmlChar *name;
3499 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003500 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003501
3502 if ((CUR != '<') || (NXT(1) != '/')) {
3503 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3504 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3505 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003506 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003507 }
3508 SKIP(2);
3509
3510 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003511 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003512
3513 /*
3514 * We should definitely be at the ending "S? '>'" part
3515 */
3516 SKIP_BLANKS;
3517 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3518 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3519 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3520 ctxt->wellFormed = 0;
3521 } else
3522 NEXT;
3523
3524 /*
3525 * If the name read is not one of the element in the parsing stack
3526 * then return, it's just an error.
3527 */
3528 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3529 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3530 }
3531 if (i < 0) {
3532 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3533 ctxt->sax->error(ctxt->userData,
3534 "Unexpected end tag : %s\n", name);
3535 xmlFree(name);
3536 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003537 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003538 }
3539
3540
3541 /*
3542 * Check for auto-closure of HTML elements.
3543 */
3544
3545 htmlAutoCloseOnClose(ctxt, name);
3546
3547 /*
3548 * Well formedness constraints, opening and closing must match.
3549 * With the exception that the autoclose may have popped stuff out
3550 * of the stack.
3551 */
3552 if (!xmlStrEqual(name, ctxt->name)) {
3553#ifdef DEBUG
3554 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3555#endif
3556 if ((ctxt->name != NULL) &&
3557 (!xmlStrEqual(ctxt->name, name))) {
3558 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3559 ctxt->sax->error(ctxt->userData,
3560 "Opening and ending tag mismatch: %s and %s\n",
3561 name, ctxt->name);
3562 ctxt->wellFormed = 0;
3563 }
3564 }
3565
3566 /*
3567 * SAX: End of Tag
3568 */
3569 oldname = ctxt->name;
3570 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3571 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3572 ctxt->sax->endElement(ctxt->userData, name);
3573 oldname = htmlnamePop(ctxt);
3574 if (oldname != NULL) {
3575#ifdef DEBUG
3576 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3577#endif
3578 xmlFree(oldname);
3579#ifdef DEBUG
3580 } else {
3581 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3582#endif
3583 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003584 ret = 1;
3585 } else {
3586 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003587 }
3588
3589 if (name != NULL)
3590 xmlFree(name);
3591
Daniel Veillardf420ac52001-07-04 16:04:09 +00003592 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003593}
3594
3595
3596/**
3597 * htmlParseReference:
3598 * @ctxt: an HTML parser context
3599 *
3600 * parse and handle entity references in content,
3601 * this will end-up in a call to character() since this is either a
3602 * CharRef, or a predefined entity.
3603 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003604static void
Owen Taylor3473f882001-02-23 17:55:21 +00003605htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003606 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003607 xmlChar out[6];
3608 xmlChar *name;
3609 if (CUR != '&') return;
3610
3611 if (NXT(1) == '#') {
3612 unsigned int c;
3613 int bits, i = 0;
3614
3615 c = htmlParseCharRef(ctxt);
3616 if (c == 0)
3617 return;
3618
3619 if (c < 0x80) { out[i++]= c; bits= -6; }
3620 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3621 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3622 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3623
3624 for ( ; bits >= 0; bits-= 6) {
3625 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3626 }
3627 out[i] = 0;
3628
3629 htmlCheckParagraph(ctxt);
3630 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3631 ctxt->sax->characters(ctxt->userData, out, i);
3632 } else {
3633 ent = htmlParseEntityRef(ctxt, &name);
3634 if (name == NULL) {
3635 htmlCheckParagraph(ctxt);
3636 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3637 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3638 return;
3639 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003640 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003641 htmlCheckParagraph(ctxt);
3642 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3643 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3644 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3645 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3646 }
3647 } else {
3648 unsigned int c;
3649 int bits, i = 0;
3650
3651 c = ent->value;
3652 if (c < 0x80)
3653 { out[i++]= c; bits= -6; }
3654 else if (c < 0x800)
3655 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3656 else if (c < 0x10000)
3657 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3658 else
3659 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3660
3661 for ( ; bits >= 0; bits-= 6) {
3662 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3663 }
3664 out[i] = 0;
3665
3666 htmlCheckParagraph(ctxt);
3667 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3668 ctxt->sax->characters(ctxt->userData, out, i);
3669 }
3670 xmlFree(name);
3671 }
3672}
3673
3674/**
3675 * htmlParseContent:
3676 * @ctxt: an HTML parser context
3677 * @name: the node name
3678 *
3679 * Parse a content: comment, sub-element, reference or text.
3680 *
3681 */
3682
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003683static void
Owen Taylor3473f882001-02-23 17:55:21 +00003684htmlParseContent(htmlParserCtxtPtr ctxt) {
3685 xmlChar *currentNode;
3686 int depth;
3687
3688 currentNode = xmlStrdup(ctxt->name);
3689 depth = ctxt->nameNr;
3690 while (1) {
3691 long cons = ctxt->nbChars;
3692
3693 GROW;
3694 /*
3695 * Our tag or one of it's parent or children is ending.
3696 */
3697 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003698 if (htmlParseEndTag(ctxt) &&
3699 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3700 if (currentNode != NULL)
3701 xmlFree(currentNode);
3702 return;
3703 }
3704 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003705 }
3706
3707 /*
3708 * Has this node been popped out during parsing of
3709 * the next element
3710 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003711 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3712 (!xmlStrEqual(currentNode, ctxt->name)))
3713 {
Owen Taylor3473f882001-02-23 17:55:21 +00003714 if (currentNode != NULL) xmlFree(currentNode);
3715 return;
3716 }
3717
Daniel Veillardf9533d12001-03-03 10:04:57 +00003718 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3719 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003720 /*
3721 * Handle SCRIPT/STYLE separately
3722 */
3723 htmlParseScript(ctxt);
3724 } else {
3725 /*
3726 * Sometimes DOCTYPE arrives in the middle of the document
3727 */
3728 if ((CUR == '<') && (NXT(1) == '!') &&
3729 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3730 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3731 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3732 (UPP(8) == 'E')) {
3733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3734 ctxt->sax->error(ctxt->userData,
3735 "Misplaced DOCTYPE declaration\n");
3736 ctxt->wellFormed = 0;
3737 htmlParseDocTypeDecl(ctxt);
3738 }
3739
3740 /*
3741 * First case : a comment
3742 */
3743 if ((CUR == '<') && (NXT(1) == '!') &&
3744 (NXT(2) == '-') && (NXT(3) == '-')) {
3745 htmlParseComment(ctxt);
3746 }
3747
3748 /*
3749 * Second case : a sub-element.
3750 */
3751 else if (CUR == '<') {
3752 htmlParseElement(ctxt);
3753 }
3754
3755 /*
3756 * Third case : a reference. If if has not been resolved,
3757 * parsing returns it's Name, create the node
3758 */
3759 else if (CUR == '&') {
3760 htmlParseReference(ctxt);
3761 }
3762
3763 /*
3764 * Fourth : end of the resource
3765 */
3766 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003767 htmlAutoCloseOnEnd(ctxt);
3768 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003769 }
3770
3771 /*
3772 * Last case, text. Note that References are handled directly.
3773 */
3774 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003775 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003776 }
3777
3778 if (cons == ctxt->nbChars) {
3779 if (ctxt->node != NULL) {
3780 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3781 ctxt->sax->error(ctxt->userData,
3782 "detected an error in element content\n");
3783 ctxt->wellFormed = 0;
3784 }
3785 break;
3786 }
3787 }
3788 GROW;
3789 }
3790 if (currentNode != NULL) xmlFree(currentNode);
3791}
3792
3793/**
3794 * htmlParseElement:
3795 * @ctxt: an HTML parser context
3796 *
3797 * parse an HTML element, this is highly recursive
3798 *
3799 * [39] element ::= EmptyElemTag | STag content ETag
3800 *
3801 * [41] Attribute ::= Name Eq AttValue
3802 */
3803
3804void
3805htmlParseElement(htmlParserCtxtPtr ctxt) {
3806 xmlChar *name;
3807 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003808 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003809 htmlParserNodeInfo node_info;
3810 xmlChar *oldname;
3811 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003812 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003813
3814 /* Capture start position */
3815 if (ctxt->record_info) {
3816 node_info.begin_pos = ctxt->input->consumed +
3817 (CUR_PTR - ctxt->input->base);
3818 node_info.begin_line = ctxt->input->line;
3819 }
3820
3821 oldname = xmlStrdup(ctxt->name);
3822 htmlParseStartTag(ctxt);
3823 name = ctxt->name;
3824#ifdef DEBUG
3825 if (oldname == NULL)
3826 xmlGenericError(xmlGenericErrorContext,
3827 "Start of element %s\n", name);
3828 else if (name == NULL)
3829 xmlGenericError(xmlGenericErrorContext,
3830 "Start of element failed, was %s\n", oldname);
3831 else
3832 xmlGenericError(xmlGenericErrorContext,
3833 "Start of element %s, was %s\n", name, oldname);
3834#endif
3835 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3836 (name == NULL)) {
3837 if (CUR == '>')
3838 NEXT;
3839 if (oldname != NULL)
3840 xmlFree(oldname);
3841 return;
3842 }
3843 if (oldname != NULL)
3844 xmlFree(oldname);
3845
3846 /*
3847 * Lookup the info for that element.
3848 */
3849 info = htmlTagLookup(name);
3850 if (info == NULL) {
3851 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3852 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3853 name);
3854 ctxt->wellFormed = 0;
3855 } else if (info->depr) {
3856/***************************
3857 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3858 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3859 name);
3860 ***************************/
3861 }
3862
3863 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003864 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003865 */
3866 if ((CUR == '/') && (NXT(1) == '>')) {
3867 SKIP(2);
3868 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3869 ctxt->sax->endElement(ctxt->userData, name);
3870 oldname = htmlnamePop(ctxt);
3871#ifdef DEBUG
3872 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3873#endif
3874 if (oldname != NULL)
3875 xmlFree(oldname);
3876 return;
3877 }
3878
3879 if (CUR == '>') {
3880 NEXT;
3881 } else {
3882 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3883 ctxt->sax->error(ctxt->userData,
3884 "Couldn't find end of Start Tag %s\n",
3885 name);
3886 ctxt->wellFormed = 0;
3887
3888 /*
3889 * end of parsing of this node.
3890 */
3891 if (xmlStrEqual(name, ctxt->name)) {
3892 nodePop(ctxt);
3893 oldname = htmlnamePop(ctxt);
3894#ifdef DEBUG
3895 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3896#endif
3897 if (oldname != NULL)
3898 xmlFree(oldname);
3899 }
3900
3901 /*
3902 * Capture end position and add node
3903 */
3904 if ( currentNode != NULL && ctxt->record_info ) {
3905 node_info.end_pos = ctxt->input->consumed +
3906 (CUR_PTR - ctxt->input->base);
3907 node_info.end_line = ctxt->input->line;
3908 node_info.node = ctxt->node;
3909 xmlParserAddNodeInfo(ctxt, &node_info);
3910 }
3911 return;
3912 }
3913
3914 /*
3915 * Check for an Empty Element from DTD definition
3916 */
3917 if ((info != NULL) && (info->empty)) {
3918 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3919 ctxt->sax->endElement(ctxt->userData, name);
3920 oldname = htmlnamePop(ctxt);
3921#ifdef DEBUG
3922 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3923#endif
3924 if (oldname != NULL)
3925 xmlFree(oldname);
3926 return;
3927 }
3928
3929 /*
3930 * Parse the content of the element:
3931 */
3932 currentNode = xmlStrdup(ctxt->name);
3933 depth = ctxt->nameNr;
3934 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003935 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003936 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003937 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003938 if (ctxt->nameNr < depth) break;
3939 }
3940
Owen Taylor3473f882001-02-23 17:55:21 +00003941 /*
3942 * Capture end position and add node
3943 */
3944 if ( currentNode != NULL && ctxt->record_info ) {
3945 node_info.end_pos = ctxt->input->consumed +
3946 (CUR_PTR - ctxt->input->base);
3947 node_info.end_line = ctxt->input->line;
3948 node_info.node = ctxt->node;
3949 xmlParserAddNodeInfo(ctxt, &node_info);
3950 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003951 if (!IS_CHAR(CUR)) {
3952 htmlAutoCloseOnEnd(ctxt);
3953 }
3954
Owen Taylor3473f882001-02-23 17:55:21 +00003955 if (currentNode != NULL)
3956 xmlFree(currentNode);
3957}
3958
3959/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003960 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003961 * @ctxt: an HTML parser context
3962 *
3963 * parse an HTML document (and build a tree if using the standard SAX
3964 * interface).
3965 *
3966 * Returns 0, -1 in case of error. the parser context is augmented
3967 * as a result of the parsing.
3968 */
3969
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003970int
Owen Taylor3473f882001-02-23 17:55:21 +00003971htmlParseDocument(htmlParserCtxtPtr ctxt) {
3972 xmlDtdPtr dtd;
3973
Daniel Veillardd0463562001-10-13 09:15:48 +00003974 xmlInitParser();
3975
Owen Taylor3473f882001-02-23 17:55:21 +00003976 htmlDefaultSAXHandlerInit();
3977 ctxt->html = 1;
3978
3979 GROW;
3980 /*
3981 * SAX: beginning of the document processing.
3982 */
3983 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3984 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3985
3986 /*
3987 * Wipe out everything which is before the first '<'
3988 */
3989 SKIP_BLANKS;
3990 if (CUR == 0) {
3991 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3992 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3993 ctxt->wellFormed = 0;
3994 }
3995
3996 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3997 ctxt->sax->startDocument(ctxt->userData);
3998
3999
4000 /*
4001 * Parse possible comments before any content
4002 */
4003 while ((CUR == '<') && (NXT(1) == '!') &&
4004 (NXT(2) == '-') && (NXT(3) == '-')) {
4005 htmlParseComment(ctxt);
4006 SKIP_BLANKS;
4007 }
4008
4009
4010 /*
4011 * Then possibly doc type declaration(s) and more Misc
4012 * (doctypedecl Misc*)?
4013 */
4014 if ((CUR == '<') && (NXT(1) == '!') &&
4015 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4016 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4017 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4018 (UPP(8) == 'E')) {
4019 htmlParseDocTypeDecl(ctxt);
4020 }
4021 SKIP_BLANKS;
4022
4023 /*
4024 * Parse possible comments before any content
4025 */
4026 while ((CUR == '<') && (NXT(1) == '!') &&
4027 (NXT(2) == '-') && (NXT(3) == '-')) {
4028 htmlParseComment(ctxt);
4029 SKIP_BLANKS;
4030 }
4031
4032 /*
4033 * Time to start parsing the tree itself
4034 */
4035 htmlParseContent(ctxt);
4036
4037 /*
4038 * autoclose
4039 */
4040 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004041 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004042
4043
4044 /*
4045 * SAX: end of the document processing.
4046 */
4047 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4048 ctxt->sax->endDocument(ctxt->userData);
4049
4050 if (ctxt->myDoc != NULL) {
4051 dtd = xmlGetIntSubset(ctxt->myDoc);
4052 if (dtd == NULL)
4053 ctxt->myDoc->intSubset =
4054 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4055 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4056 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4057 }
4058 if (! ctxt->wellFormed) return(-1);
4059 return(0);
4060}
4061
4062
4063/************************************************************************
4064 * *
4065 * Parser contexts handling *
4066 * *
4067 ************************************************************************/
4068
4069/**
4070 * xmlInitParserCtxt:
4071 * @ctxt: an HTML parser context
4072 *
4073 * Initialize a parser context
4074 */
4075
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004076static void
Owen Taylor3473f882001-02-23 17:55:21 +00004077htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4078{
4079 htmlSAXHandler *sax;
4080
4081 if (ctxt == NULL) return;
4082 memset(ctxt, 0, sizeof(htmlParserCtxt));
4083
4084 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4085 if (sax == NULL) {
4086 xmlGenericError(xmlGenericErrorContext,
4087 "htmlInitParserCtxt: out of memory\n");
4088 }
4089 else
4090 memset(sax, 0, sizeof(htmlSAXHandler));
4091
4092 /* Allocate the Input stack */
4093 ctxt->inputTab = (htmlParserInputPtr *)
4094 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4095 if (ctxt->inputTab == NULL) {
4096 xmlGenericError(xmlGenericErrorContext,
4097 "htmlInitParserCtxt: out of memory\n");
4098 ctxt->inputNr = 0;
4099 ctxt->inputMax = 0;
4100 ctxt->input = NULL;
4101 return;
4102 }
4103 ctxt->inputNr = 0;
4104 ctxt->inputMax = 5;
4105 ctxt->input = NULL;
4106 ctxt->version = NULL;
4107 ctxt->encoding = NULL;
4108 ctxt->standalone = -1;
4109 ctxt->instate = XML_PARSER_START;
4110
4111 /* Allocate the Node stack */
4112 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4113 if (ctxt->nodeTab == NULL) {
4114 xmlGenericError(xmlGenericErrorContext,
4115 "htmlInitParserCtxt: out of memory\n");
4116 ctxt->nodeNr = 0;
4117 ctxt->nodeMax = 0;
4118 ctxt->node = NULL;
4119 ctxt->inputNr = 0;
4120 ctxt->inputMax = 0;
4121 ctxt->input = NULL;
4122 return;
4123 }
4124 ctxt->nodeNr = 0;
4125 ctxt->nodeMax = 10;
4126 ctxt->node = NULL;
4127
4128 /* Allocate the Name stack */
4129 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4130 if (ctxt->nameTab == NULL) {
4131 xmlGenericError(xmlGenericErrorContext,
4132 "htmlInitParserCtxt: out of memory\n");
4133 ctxt->nameNr = 0;
4134 ctxt->nameMax = 10;
4135 ctxt->name = NULL;
4136 ctxt->nodeNr = 0;
4137 ctxt->nodeMax = 0;
4138 ctxt->node = NULL;
4139 ctxt->inputNr = 0;
4140 ctxt->inputMax = 0;
4141 ctxt->input = NULL;
4142 return;
4143 }
4144 ctxt->nameNr = 0;
4145 ctxt->nameMax = 10;
4146 ctxt->name = NULL;
4147
4148 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
4149 else {
4150 ctxt->sax = sax;
4151 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
4152 }
4153 ctxt->userData = ctxt;
4154 ctxt->myDoc = NULL;
4155 ctxt->wellFormed = 1;
4156 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004157 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004158 ctxt->html = 1;
4159 ctxt->record_info = 0;
4160 ctxt->validate = 0;
4161 ctxt->nbChars = 0;
4162 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004163 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004164 xmlInitNodeInfoSeq(&ctxt->node_seq);
4165}
4166
4167/**
4168 * htmlFreeParserCtxt:
4169 * @ctxt: an HTML parser context
4170 *
4171 * Free all the memory used by a parser context. However the parsed
4172 * document in ctxt->myDoc is not freed.
4173 */
4174
4175void
4176htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4177{
4178 xmlFreeParserCtxt(ctxt);
4179}
4180
4181/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004182 * htmlNewParserCtxt:
4183 *
4184 * Allocate and initialize a new parser context.
4185 *
4186 * Returns the xmlParserCtxtPtr or NULL
4187 */
4188
4189static htmlParserCtxtPtr
4190htmlNewParserCtxt(void)
4191{
4192 xmlParserCtxtPtr ctxt;
4193
4194 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4195 if (ctxt == NULL) {
4196 xmlGenericError(xmlGenericErrorContext,
4197 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004198 return(NULL);
4199 }
4200 memset(ctxt, 0, sizeof(xmlParserCtxt));
4201 htmlInitParserCtxt(ctxt);
4202 return(ctxt);
4203}
4204
4205/**
4206 * htmlCreateMemoryParserCtxt:
4207 * @buffer: a pointer to a char array
4208 * @size: the size of the array
4209 *
4210 * Create a parser context for an HTML in-memory document.
4211 *
4212 * Returns the new parser context or NULL
4213 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004214htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004215htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4216 xmlParserCtxtPtr ctxt;
4217 xmlParserInputPtr input;
4218 xmlParserInputBufferPtr buf;
4219
4220 if (buffer == NULL)
4221 return(NULL);
4222 if (size <= 0)
4223 return(NULL);
4224
4225 ctxt = htmlNewParserCtxt();
4226 if (ctxt == NULL)
4227 return(NULL);
4228
4229 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4230 if (buf == NULL) return(NULL);
4231
4232 input = xmlNewInputStream(ctxt);
4233 if (input == NULL) {
4234 xmlFreeParserCtxt(ctxt);
4235 return(NULL);
4236 }
4237
4238 input->filename = NULL;
4239 input->buf = buf;
4240 input->base = input->buf->buffer->content;
4241 input->cur = input->buf->buffer->content;
4242 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4243
4244 inputPush(ctxt, input);
4245 return(ctxt);
4246}
4247
4248/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004249 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004250 * @cur: a pointer to an array of xmlChar
4251 * @encoding: a free form C string describing the HTML document encoding, or NULL
4252 *
4253 * Create a parser context for an HTML document.
4254 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004255 * TODO: check the need to add encoding handling there
4256 *
Owen Taylor3473f882001-02-23 17:55:21 +00004257 * Returns the new parser context or NULL
4258 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004259static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004260htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004261 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004262 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004263
Daniel Veillard1d995272002-07-22 16:43:32 +00004264 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004265 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004266 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004267 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4268
4269 if (encoding != NULL) {
4270 xmlCharEncoding enc;
4271 xmlCharEncodingHandlerPtr handler;
4272
4273 if (ctxt->input->encoding != NULL)
4274 xmlFree((xmlChar *) ctxt->input->encoding);
4275 ctxt->input->encoding = (const xmlChar *) encoding;
4276
4277 enc = xmlParseCharEncoding(encoding);
4278 /*
4279 * registered set of known encodings
4280 */
4281 if (enc != XML_CHAR_ENCODING_ERROR) {
4282 xmlSwitchEncoding(ctxt, enc);
4283 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4284 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4285 ctxt->sax->error(ctxt->userData,
4286 "Unsupported encoding %s\n", encoding);
4287 ctxt->input->encoding = NULL;
4288 }
4289 } else {
4290 /*
4291 * fallback for unknown encodings
4292 */
4293 handler = xmlFindCharEncodingHandler((const char *) encoding);
4294 if (handler != NULL) {
4295 xmlSwitchToEncoding(ctxt, handler);
4296 } else {
4297 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
4298 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4299 ctxt->sax->error(ctxt->userData,
4300 "Unsupported encoding %s\n", encoding);
4301 }
4302 }
4303 }
4304 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004305}
4306
4307/************************************************************************
4308 * *
4309 * Progressive parsing interfaces *
4310 * *
4311 ************************************************************************/
4312
4313/**
4314 * htmlParseLookupSequence:
4315 * @ctxt: an HTML parser context
4316 * @first: the first char to lookup
4317 * @next: the next char to lookup or zero
4318 * @third: the next char to lookup or zero
4319 *
4320 * Try to find if a sequence (first, next, third) or just (first next) or
4321 * (first) is available in the input stream.
4322 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4323 * to avoid rescanning sequences of bytes, it DOES change the state of the
4324 * parser, do not use liberally.
4325 * This is basically similar to xmlParseLookupSequence()
4326 *
4327 * Returns the index to the current parsing point if the full sequence
4328 * is available, -1 otherwise.
4329 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004330static int
Owen Taylor3473f882001-02-23 17:55:21 +00004331htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4332 xmlChar next, xmlChar third) {
4333 int base, len;
4334 htmlParserInputPtr in;
4335 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004336 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004337
4338 in = ctxt->input;
4339 if (in == NULL) return(-1);
4340 base = in->cur - in->base;
4341 if (base < 0) return(-1);
4342 if (ctxt->checkIndex > base)
4343 base = ctxt->checkIndex;
4344 if (in->buf == NULL) {
4345 buf = in->base;
4346 len = in->length;
4347 } else {
4348 buf = in->buf->buffer->content;
4349 len = in->buf->buffer->use;
4350 }
4351 /* take into account the sequence length */
4352 if (third) len -= 2;
4353 else if (next) len --;
4354 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004355 if (!incomment && (base + 4 < len)) {
4356 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4357 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4358 incomment = 1;
4359 }
4360 /* do not increment base, some people use <!--> */
4361 }
4362 if (incomment) {
4363 if (base + 3 < len)
4364 return(-1);
4365 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4366 (buf[base + 2] == '>')) {
4367 incomment = 0;
4368 base += 2;
4369 }
4370 continue;
4371 }
Owen Taylor3473f882001-02-23 17:55:21 +00004372 if (buf[base] == first) {
4373 if (third != 0) {
4374 if ((buf[base + 1] != next) ||
4375 (buf[base + 2] != third)) continue;
4376 } else if (next != 0) {
4377 if (buf[base + 1] != next) continue;
4378 }
4379 ctxt->checkIndex = 0;
4380#ifdef DEBUG_PUSH
4381 if (next == 0)
4382 xmlGenericError(xmlGenericErrorContext,
4383 "HPP: lookup '%c' found at %d\n",
4384 first, base);
4385 else if (third == 0)
4386 xmlGenericError(xmlGenericErrorContext,
4387 "HPP: lookup '%c%c' found at %d\n",
4388 first, next, base);
4389 else
4390 xmlGenericError(xmlGenericErrorContext,
4391 "HPP: lookup '%c%c%c' found at %d\n",
4392 first, next, third, base);
4393#endif
4394 return(base - (in->cur - in->base));
4395 }
4396 }
4397 ctxt->checkIndex = base;
4398#ifdef DEBUG_PUSH
4399 if (next == 0)
4400 xmlGenericError(xmlGenericErrorContext,
4401 "HPP: lookup '%c' failed\n", first);
4402 else if (third == 0)
4403 xmlGenericError(xmlGenericErrorContext,
4404 "HPP: lookup '%c%c' failed\n", first, next);
4405 else
4406 xmlGenericError(xmlGenericErrorContext,
4407 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4408#endif
4409 return(-1);
4410}
4411
4412/**
4413 * htmlParseTryOrFinish:
4414 * @ctxt: an HTML parser context
4415 * @terminate: last chunk indicator
4416 *
4417 * Try to progress on parsing
4418 *
4419 * Returns zero if no parsing was possible
4420 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004421static int
Owen Taylor3473f882001-02-23 17:55:21 +00004422htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4423 int ret = 0;
4424 htmlParserInputPtr in;
4425 int avail = 0;
4426 xmlChar cur, next;
4427
4428#ifdef DEBUG_PUSH
4429 switch (ctxt->instate) {
4430 case XML_PARSER_EOF:
4431 xmlGenericError(xmlGenericErrorContext,
4432 "HPP: try EOF\n"); break;
4433 case XML_PARSER_START:
4434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: try START\n"); break;
4436 case XML_PARSER_MISC:
4437 xmlGenericError(xmlGenericErrorContext,
4438 "HPP: try MISC\n");break;
4439 case XML_PARSER_COMMENT:
4440 xmlGenericError(xmlGenericErrorContext,
4441 "HPP: try COMMENT\n");break;
4442 case XML_PARSER_PROLOG:
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: try PROLOG\n");break;
4445 case XML_PARSER_START_TAG:
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: try START_TAG\n");break;
4448 case XML_PARSER_CONTENT:
4449 xmlGenericError(xmlGenericErrorContext,
4450 "HPP: try CONTENT\n");break;
4451 case XML_PARSER_CDATA_SECTION:
4452 xmlGenericError(xmlGenericErrorContext,
4453 "HPP: try CDATA_SECTION\n");break;
4454 case XML_PARSER_END_TAG:
4455 xmlGenericError(xmlGenericErrorContext,
4456 "HPP: try END_TAG\n");break;
4457 case XML_PARSER_ENTITY_DECL:
4458 xmlGenericError(xmlGenericErrorContext,
4459 "HPP: try ENTITY_DECL\n");break;
4460 case XML_PARSER_ENTITY_VALUE:
4461 xmlGenericError(xmlGenericErrorContext,
4462 "HPP: try ENTITY_VALUE\n");break;
4463 case XML_PARSER_ATTRIBUTE_VALUE:
4464 xmlGenericError(xmlGenericErrorContext,
4465 "HPP: try ATTRIBUTE_VALUE\n");break;
4466 case XML_PARSER_DTD:
4467 xmlGenericError(xmlGenericErrorContext,
4468 "HPP: try DTD\n");break;
4469 case XML_PARSER_EPILOG:
4470 xmlGenericError(xmlGenericErrorContext,
4471 "HPP: try EPILOG\n");break;
4472 case XML_PARSER_PI:
4473 xmlGenericError(xmlGenericErrorContext,
4474 "HPP: try PI\n");break;
4475 case XML_PARSER_SYSTEM_LITERAL:
4476 xmlGenericError(xmlGenericErrorContext,
4477 "HPP: try SYSTEM_LITERAL\n");break;
4478 }
4479#endif
4480
4481 while (1) {
4482
4483 in = ctxt->input;
4484 if (in == NULL) break;
4485 if (in->buf == NULL)
4486 avail = in->length - (in->cur - in->base);
4487 else
4488 avail = in->buf->buffer->use - (in->cur - in->base);
4489 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004490 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004491 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4492 /*
4493 * SAX: end of the document processing.
4494 */
4495 ctxt->instate = XML_PARSER_EOF;
4496 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4497 ctxt->sax->endDocument(ctxt->userData);
4498 }
4499 }
4500 if (avail < 1)
4501 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004502 cur = in->cur[0];
4503 if (cur == 0) {
4504 SKIP(1);
4505 continue;
4506 }
4507
Owen Taylor3473f882001-02-23 17:55:21 +00004508 switch (ctxt->instate) {
4509 case XML_PARSER_EOF:
4510 /*
4511 * Document parsing is done !
4512 */
4513 goto done;
4514 case XML_PARSER_START:
4515 /*
4516 * Very first chars read from the document flow.
4517 */
4518 cur = in->cur[0];
4519 if (IS_BLANK(cur)) {
4520 SKIP_BLANKS;
4521 if (in->buf == NULL)
4522 avail = in->length - (in->cur - in->base);
4523 else
4524 avail = in->buf->buffer->use - (in->cur - in->base);
4525 }
4526 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4527 ctxt->sax->setDocumentLocator(ctxt->userData,
4528 &xmlDefaultSAXLocator);
4529 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4530 (!ctxt->disableSAX))
4531 ctxt->sax->startDocument(ctxt->userData);
4532
4533 cur = in->cur[0];
4534 next = in->cur[1];
4535 if ((cur == '<') && (next == '!') &&
4536 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4537 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4538 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4539 (UPP(8) == 'E')) {
4540 if ((!terminate) &&
4541 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4542 goto done;
4543#ifdef DEBUG_PUSH
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: Parsing internal subset\n");
4546#endif
4547 htmlParseDocTypeDecl(ctxt);
4548 ctxt->instate = XML_PARSER_PROLOG;
4549#ifdef DEBUG_PUSH
4550 xmlGenericError(xmlGenericErrorContext,
4551 "HPP: entering PROLOG\n");
4552#endif
4553 } else {
4554 ctxt->instate = XML_PARSER_MISC;
4555 }
4556#ifdef DEBUG_PUSH
4557 xmlGenericError(xmlGenericErrorContext,
4558 "HPP: entering MISC\n");
4559#endif
4560 break;
4561 case XML_PARSER_MISC:
4562 SKIP_BLANKS;
4563 if (in->buf == NULL)
4564 avail = in->length - (in->cur - in->base);
4565 else
4566 avail = in->buf->buffer->use - (in->cur - in->base);
4567 if (avail < 2)
4568 goto done;
4569 cur = in->cur[0];
4570 next = in->cur[1];
4571 if ((cur == '<') && (next == '!') &&
4572 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4573 if ((!terminate) &&
4574 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4575 goto done;
4576#ifdef DEBUG_PUSH
4577 xmlGenericError(xmlGenericErrorContext,
4578 "HPP: Parsing Comment\n");
4579#endif
4580 htmlParseComment(ctxt);
4581 ctxt->instate = XML_PARSER_MISC;
4582 } else if ((cur == '<') && (next == '!') &&
4583 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4584 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4585 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4586 (UPP(8) == 'E')) {
4587 if ((!terminate) &&
4588 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4589 goto done;
4590#ifdef DEBUG_PUSH
4591 xmlGenericError(xmlGenericErrorContext,
4592 "HPP: Parsing internal subset\n");
4593#endif
4594 htmlParseDocTypeDecl(ctxt);
4595 ctxt->instate = XML_PARSER_PROLOG;
4596#ifdef DEBUG_PUSH
4597 xmlGenericError(xmlGenericErrorContext,
4598 "HPP: entering PROLOG\n");
4599#endif
4600 } else if ((cur == '<') && (next == '!') &&
4601 (avail < 9)) {
4602 goto done;
4603 } else {
4604 ctxt->instate = XML_PARSER_START_TAG;
4605#ifdef DEBUG_PUSH
4606 xmlGenericError(xmlGenericErrorContext,
4607 "HPP: entering START_TAG\n");
4608#endif
4609 }
4610 break;
4611 case XML_PARSER_PROLOG:
4612 SKIP_BLANKS;
4613 if (in->buf == NULL)
4614 avail = in->length - (in->cur - in->base);
4615 else
4616 avail = in->buf->buffer->use - (in->cur - in->base);
4617 if (avail < 2)
4618 goto done;
4619 cur = in->cur[0];
4620 next = in->cur[1];
4621 if ((cur == '<') && (next == '!') &&
4622 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4623 if ((!terminate) &&
4624 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4625 goto done;
4626#ifdef DEBUG_PUSH
4627 xmlGenericError(xmlGenericErrorContext,
4628 "HPP: Parsing Comment\n");
4629#endif
4630 htmlParseComment(ctxt);
4631 ctxt->instate = XML_PARSER_PROLOG;
4632 } else if ((cur == '<') && (next == '!') &&
4633 (avail < 4)) {
4634 goto done;
4635 } else {
4636 ctxt->instate = XML_PARSER_START_TAG;
4637#ifdef DEBUG_PUSH
4638 xmlGenericError(xmlGenericErrorContext,
4639 "HPP: entering START_TAG\n");
4640#endif
4641 }
4642 break;
4643 case XML_PARSER_EPILOG:
4644 if (in->buf == NULL)
4645 avail = in->length - (in->cur - in->base);
4646 else
4647 avail = in->buf->buffer->use - (in->cur - in->base);
4648 if (avail < 1)
4649 goto done;
4650 cur = in->cur[0];
4651 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004652 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004653 goto done;
4654 }
4655 if (avail < 2)
4656 goto done;
4657 next = in->cur[1];
4658 if ((cur == '<') && (next == '!') &&
4659 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4660 if ((!terminate) &&
4661 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4662 goto done;
4663#ifdef DEBUG_PUSH
4664 xmlGenericError(xmlGenericErrorContext,
4665 "HPP: Parsing Comment\n");
4666#endif
4667 htmlParseComment(ctxt);
4668 ctxt->instate = XML_PARSER_EPILOG;
4669 } else if ((cur == '<') && (next == '!') &&
4670 (avail < 4)) {
4671 goto done;
4672 } else {
4673 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004674 ctxt->wellFormed = 0;
4675 ctxt->instate = XML_PARSER_EOF;
4676#ifdef DEBUG_PUSH
4677 xmlGenericError(xmlGenericErrorContext,
4678 "HPP: entering EOF\n");
4679#endif
4680 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4681 ctxt->sax->endDocument(ctxt->userData);
4682 goto done;
4683 }
4684 break;
4685 case XML_PARSER_START_TAG: {
4686 xmlChar *name, *oldname;
4687 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004688 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004689
4690 if (avail < 2)
4691 goto done;
4692 cur = in->cur[0];
4693 if (cur != '<') {
4694 ctxt->instate = XML_PARSER_CONTENT;
4695#ifdef DEBUG_PUSH
4696 xmlGenericError(xmlGenericErrorContext,
4697 "HPP: entering CONTENT\n");
4698#endif
4699 break;
4700 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004701 if (in->cur[1] == '/') {
4702 ctxt->instate = XML_PARSER_END_TAG;
4703 ctxt->checkIndex = 0;
4704#ifdef DEBUG_PUSH
4705 xmlGenericError(xmlGenericErrorContext,
4706 "HPP: entering END_TAG\n");
4707#endif
4708 break;
4709 }
Owen Taylor3473f882001-02-23 17:55:21 +00004710 if ((!terminate) &&
4711 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4712 goto done;
4713
4714 oldname = xmlStrdup(ctxt->name);
4715 htmlParseStartTag(ctxt);
4716 name = ctxt->name;
4717#ifdef DEBUG
4718 if (oldname == NULL)
4719 xmlGenericError(xmlGenericErrorContext,
4720 "Start of element %s\n", name);
4721 else if (name == NULL)
4722 xmlGenericError(xmlGenericErrorContext,
4723 "Start of element failed, was %s\n",
4724 oldname);
4725 else
4726 xmlGenericError(xmlGenericErrorContext,
4727 "Start of element %s, was %s\n",
4728 name, oldname);
4729#endif
4730 if (((depth == ctxt->nameNr) &&
4731 (xmlStrEqual(oldname, ctxt->name))) ||
4732 (name == NULL)) {
4733 if (CUR == '>')
4734 NEXT;
4735 if (oldname != NULL)
4736 xmlFree(oldname);
4737 break;
4738 }
4739 if (oldname != NULL)
4740 xmlFree(oldname);
4741
4742 /*
4743 * Lookup the info for that element.
4744 */
4745 info = htmlTagLookup(name);
4746 if (info == NULL) {
4747 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4748 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4749 name);
4750 ctxt->wellFormed = 0;
4751 } else if (info->depr) {
4752 /***************************
4753 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4754 ctxt->sax->warning(ctxt->userData,
4755 "Tag %s is deprecated\n",
4756 name);
4757 ***************************/
4758 }
4759
4760 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004761 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004762 */
4763 if ((CUR == '/') && (NXT(1) == '>')) {
4764 SKIP(2);
4765 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4766 ctxt->sax->endElement(ctxt->userData, name);
4767 oldname = htmlnamePop(ctxt);
4768#ifdef DEBUG
4769 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4770 oldname);
4771#endif
4772 if (oldname != NULL)
4773 xmlFree(oldname);
4774 ctxt->instate = XML_PARSER_CONTENT;
4775#ifdef DEBUG_PUSH
4776 xmlGenericError(xmlGenericErrorContext,
4777 "HPP: entering CONTENT\n");
4778#endif
4779 break;
4780 }
4781
4782 if (CUR == '>') {
4783 NEXT;
4784 } else {
4785 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4786 ctxt->sax->error(ctxt->userData,
4787 "Couldn't find end of Start Tag %s\n",
4788 name);
4789 ctxt->wellFormed = 0;
4790
4791 /*
4792 * end of parsing of this node.
4793 */
4794 if (xmlStrEqual(name, ctxt->name)) {
4795 nodePop(ctxt);
4796 oldname = htmlnamePop(ctxt);
4797#ifdef DEBUG
4798 xmlGenericError(xmlGenericErrorContext,
4799 "End of start tag problem: popping out %s\n", oldname);
4800#endif
4801 if (oldname != NULL)
4802 xmlFree(oldname);
4803 }
4804
4805 ctxt->instate = XML_PARSER_CONTENT;
4806#ifdef DEBUG_PUSH
4807 xmlGenericError(xmlGenericErrorContext,
4808 "HPP: entering CONTENT\n");
4809#endif
4810 break;
4811 }
4812
4813 /*
4814 * Check for an Empty Element from DTD definition
4815 */
4816 if ((info != NULL) && (info->empty)) {
4817 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4818 ctxt->sax->endElement(ctxt->userData, name);
4819 oldname = htmlnamePop(ctxt);
4820#ifdef DEBUG
4821 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4822#endif
4823 if (oldname != NULL)
4824 xmlFree(oldname);
4825 }
4826 ctxt->instate = XML_PARSER_CONTENT;
4827#ifdef DEBUG_PUSH
4828 xmlGenericError(xmlGenericErrorContext,
4829 "HPP: entering CONTENT\n");
4830#endif
4831 break;
4832 }
4833 case XML_PARSER_CONTENT: {
4834 long cons;
4835 /*
4836 * Handle preparsed entities and charRef
4837 */
4838 if (ctxt->token != 0) {
4839 xmlChar chr[2] = { 0 , 0 } ;
4840
4841 chr[0] = (xmlChar) ctxt->token;
4842 htmlCheckParagraph(ctxt);
4843 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4844 ctxt->sax->characters(ctxt->userData, chr, 1);
4845 ctxt->token = 0;
4846 ctxt->checkIndex = 0;
4847 }
4848 if ((avail == 1) && (terminate)) {
4849 cur = in->cur[0];
4850 if ((cur != '<') && (cur != '&')) {
4851 if (ctxt->sax != NULL) {
4852 if (IS_BLANK(cur)) {
4853 if (ctxt->sax->ignorableWhitespace != NULL)
4854 ctxt->sax->ignorableWhitespace(
4855 ctxt->userData, &cur, 1);
4856 } else {
4857 htmlCheckParagraph(ctxt);
4858 if (ctxt->sax->characters != NULL)
4859 ctxt->sax->characters(
4860 ctxt->userData, &cur, 1);
4861 }
4862 }
4863 ctxt->token = 0;
4864 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004865 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004866 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004867 }
Owen Taylor3473f882001-02-23 17:55:21 +00004868 }
4869 if (avail < 2)
4870 goto done;
4871 cur = in->cur[0];
4872 next = in->cur[1];
4873 cons = ctxt->nbChars;
4874 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4875 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4876 /*
4877 * Handle SCRIPT/STYLE separately
4878 */
4879 if ((!terminate) &&
4880 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4881 goto done;
4882 htmlParseScript(ctxt);
4883 if ((cur == '<') && (next == '/')) {
4884 ctxt->instate = XML_PARSER_END_TAG;
4885 ctxt->checkIndex = 0;
4886#ifdef DEBUG_PUSH
4887 xmlGenericError(xmlGenericErrorContext,
4888 "HPP: entering END_TAG\n");
4889#endif
4890 break;
4891 }
4892 } else {
4893 /*
4894 * Sometimes DOCTYPE arrives in the middle of the document
4895 */
4896 if ((cur == '<') && (next == '!') &&
4897 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4898 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4899 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4900 (UPP(8) == 'E')) {
4901 if ((!terminate) &&
4902 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4903 goto done;
4904 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4905 ctxt->sax->error(ctxt->userData,
4906 "Misplaced DOCTYPE declaration\n");
4907 ctxt->wellFormed = 0;
4908 htmlParseDocTypeDecl(ctxt);
4909 } else if ((cur == '<') && (next == '!') &&
4910 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4911 if ((!terminate) &&
4912 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4913 goto done;
4914#ifdef DEBUG_PUSH
4915 xmlGenericError(xmlGenericErrorContext,
4916 "HPP: Parsing Comment\n");
4917#endif
4918 htmlParseComment(ctxt);
4919 ctxt->instate = XML_PARSER_CONTENT;
4920 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4921 goto done;
4922 } else if ((cur == '<') && (next == '/')) {
4923 ctxt->instate = XML_PARSER_END_TAG;
4924 ctxt->checkIndex = 0;
4925#ifdef DEBUG_PUSH
4926 xmlGenericError(xmlGenericErrorContext,
4927 "HPP: entering END_TAG\n");
4928#endif
4929 break;
4930 } else if (cur == '<') {
4931 ctxt->instate = XML_PARSER_START_TAG;
4932 ctxt->checkIndex = 0;
4933#ifdef DEBUG_PUSH
4934 xmlGenericError(xmlGenericErrorContext,
4935 "HPP: entering START_TAG\n");
4936#endif
4937 break;
4938 } else if (cur == '&') {
4939 if ((!terminate) &&
4940 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4941 goto done;
4942#ifdef DEBUG_PUSH
4943 xmlGenericError(xmlGenericErrorContext,
4944 "HPP: Parsing Reference\n");
4945#endif
4946 /* TODO: check generation of subtrees if noent !!! */
4947 htmlParseReference(ctxt);
4948 } else {
4949 /* TODO Avoid the extra copy, handle directly !!!!!! */
4950 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004951 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004952 * - minimize calls to the SAX 'character' callback
4953 * when they are mergeable
4954 */
4955 if ((ctxt->inputNr == 1) &&
4956 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4957 if ((!terminate) &&
4958 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4959 goto done;
4960 }
4961 ctxt->checkIndex = 0;
4962#ifdef DEBUG_PUSH
4963 xmlGenericError(xmlGenericErrorContext,
4964 "HPP: Parsing char data\n");
4965#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004966 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004967 }
4968 }
4969 if (cons == ctxt->nbChars) {
4970 if (ctxt->node != NULL) {
4971 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4972 ctxt->sax->error(ctxt->userData,
4973 "detected an error in element content\n");
4974 ctxt->wellFormed = 0;
4975 }
4976 NEXT;
4977 break;
4978 }
4979
4980 break;
4981 }
4982 case XML_PARSER_END_TAG:
4983 if (avail < 2)
4984 goto done;
4985 if ((!terminate) &&
4986 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4987 goto done;
4988 htmlParseEndTag(ctxt);
4989 if (ctxt->nameNr == 0) {
4990 ctxt->instate = XML_PARSER_EPILOG;
4991 } else {
4992 ctxt->instate = XML_PARSER_CONTENT;
4993 }
4994 ctxt->checkIndex = 0;
4995#ifdef DEBUG_PUSH
4996 xmlGenericError(xmlGenericErrorContext,
4997 "HPP: entering CONTENT\n");
4998#endif
4999 break;
5000 case XML_PARSER_CDATA_SECTION:
5001 xmlGenericError(xmlGenericErrorContext,
5002 "HPP: internal error, state == CDATA\n");
5003 ctxt->instate = XML_PARSER_CONTENT;
5004 ctxt->checkIndex = 0;
5005#ifdef DEBUG_PUSH
5006 xmlGenericError(xmlGenericErrorContext,
5007 "HPP: entering CONTENT\n");
5008#endif
5009 break;
5010 case XML_PARSER_DTD:
5011 xmlGenericError(xmlGenericErrorContext,
5012 "HPP: internal error, state == DTD\n");
5013 ctxt->instate = XML_PARSER_CONTENT;
5014 ctxt->checkIndex = 0;
5015#ifdef DEBUG_PUSH
5016 xmlGenericError(xmlGenericErrorContext,
5017 "HPP: entering CONTENT\n");
5018#endif
5019 break;
5020 case XML_PARSER_COMMENT:
5021 xmlGenericError(xmlGenericErrorContext,
5022 "HPP: internal error, state == COMMENT\n");
5023 ctxt->instate = XML_PARSER_CONTENT;
5024 ctxt->checkIndex = 0;
5025#ifdef DEBUG_PUSH
5026 xmlGenericError(xmlGenericErrorContext,
5027 "HPP: entering CONTENT\n");
5028#endif
5029 break;
5030 case XML_PARSER_PI:
5031 xmlGenericError(xmlGenericErrorContext,
5032 "HPP: internal error, state == PI\n");
5033 ctxt->instate = XML_PARSER_CONTENT;
5034 ctxt->checkIndex = 0;
5035#ifdef DEBUG_PUSH
5036 xmlGenericError(xmlGenericErrorContext,
5037 "HPP: entering CONTENT\n");
5038#endif
5039 break;
5040 case XML_PARSER_ENTITY_DECL:
5041 xmlGenericError(xmlGenericErrorContext,
5042 "HPP: internal error, state == ENTITY_DECL\n");
5043 ctxt->instate = XML_PARSER_CONTENT;
5044 ctxt->checkIndex = 0;
5045#ifdef DEBUG_PUSH
5046 xmlGenericError(xmlGenericErrorContext,
5047 "HPP: entering CONTENT\n");
5048#endif
5049 break;
5050 case XML_PARSER_ENTITY_VALUE:
5051 xmlGenericError(xmlGenericErrorContext,
5052 "HPP: internal error, state == ENTITY_VALUE\n");
5053 ctxt->instate = XML_PARSER_CONTENT;
5054 ctxt->checkIndex = 0;
5055#ifdef DEBUG_PUSH
5056 xmlGenericError(xmlGenericErrorContext,
5057 "HPP: entering DTD\n");
5058#endif
5059 break;
5060 case XML_PARSER_ATTRIBUTE_VALUE:
5061 xmlGenericError(xmlGenericErrorContext,
5062 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5063 ctxt->instate = XML_PARSER_START_TAG;
5064 ctxt->checkIndex = 0;
5065#ifdef DEBUG_PUSH
5066 xmlGenericError(xmlGenericErrorContext,
5067 "HPP: entering START_TAG\n");
5068#endif
5069 break;
5070 case XML_PARSER_SYSTEM_LITERAL:
5071 xmlGenericError(xmlGenericErrorContext,
5072 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5073 ctxt->instate = XML_PARSER_CONTENT;
5074 ctxt->checkIndex = 0;
5075#ifdef DEBUG_PUSH
5076 xmlGenericError(xmlGenericErrorContext,
5077 "HPP: entering CONTENT\n");
5078#endif
5079 break;
5080 case XML_PARSER_IGNORE:
5081 xmlGenericError(xmlGenericErrorContext,
5082 "HPP: internal error, state == XML_PARSER_IGNORE\n");
5083 ctxt->instate = XML_PARSER_CONTENT;
5084 ctxt->checkIndex = 0;
5085#ifdef DEBUG_PUSH
5086 xmlGenericError(xmlGenericErrorContext,
5087 "HPP: entering CONTENT\n");
5088#endif
5089 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005090 case XML_PARSER_PUBLIC_LITERAL:
5091 xmlGenericError(xmlGenericErrorContext,
5092 "HPP: internal error, state == XML_PARSER_LITERAL\n");
5093 ctxt->instate = XML_PARSER_CONTENT;
5094 ctxt->checkIndex = 0;
5095#ifdef DEBUG_PUSH
5096 xmlGenericError(xmlGenericErrorContext,
5097 "HPP: entering CONTENT\n");
5098#endif
5099 break;
5100
Owen Taylor3473f882001-02-23 17:55:21 +00005101 }
5102 }
5103done:
5104 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005105 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005106 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5107 /*
5108 * SAX: end of the document processing.
5109 */
5110 ctxt->instate = XML_PARSER_EOF;
5111 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5112 ctxt->sax->endDocument(ctxt->userData);
5113 }
5114 }
5115 if ((ctxt->myDoc != NULL) &&
5116 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5117 (ctxt->instate == XML_PARSER_EPILOG))) {
5118 xmlDtdPtr dtd;
5119 dtd = xmlGetIntSubset(ctxt->myDoc);
5120 if (dtd == NULL)
5121 ctxt->myDoc->intSubset =
5122 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
5123 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5124 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5125 }
5126#ifdef DEBUG_PUSH
5127 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5128#endif
5129 return(ret);
5130}
5131
5132/**
Owen Taylor3473f882001-02-23 17:55:21 +00005133 * htmlParseChunk:
5134 * @ctxt: an XML parser context
5135 * @chunk: an char array
5136 * @size: the size in byte of the chunk
5137 * @terminate: last chunk indicator
5138 *
5139 * Parse a Chunk of memory
5140 *
5141 * Returns zero if no error, the xmlParserErrors otherwise.
5142 */
5143int
5144htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5145 int terminate) {
5146 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5147 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5148 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5149 int cur = ctxt->input->cur - ctxt->input->base;
5150
5151 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5152 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5153 ctxt->input->cur = ctxt->input->base + cur;
5154#ifdef DEBUG_PUSH
5155 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5156#endif
5157
5158 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5159 htmlParseTryOrFinish(ctxt, terminate);
5160 } else if (ctxt->instate != XML_PARSER_EOF) {
5161 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
5162 htmlParseTryOrFinish(ctxt, terminate);
5163 }
5164 if (terminate) {
5165 if ((ctxt->instate != XML_PARSER_EOF) &&
5166 (ctxt->instate != XML_PARSER_EPILOG) &&
5167 (ctxt->instate != XML_PARSER_MISC)) {
5168 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005169 ctxt->wellFormed = 0;
5170 }
5171 if (ctxt->instate != XML_PARSER_EOF) {
5172 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5173 ctxt->sax->endDocument(ctxt->userData);
5174 }
5175 ctxt->instate = XML_PARSER_EOF;
5176 }
5177 return((xmlParserErrors) ctxt->errNo);
5178}
5179
5180/************************************************************************
5181 * *
5182 * User entry points *
5183 * *
5184 ************************************************************************/
5185
5186/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005187 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005188 * @sax: a SAX handler
5189 * @user_data: The user data returned on SAX callbacks
5190 * @chunk: a pointer to an array of chars
5191 * @size: number of chars in the array
5192 * @filename: an optional file name or URI
5193 * @enc: an optional encoding
5194 *
5195 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005196 * The value of @filename is used for fetching external entities
5197 * and error/warning reports.
5198 *
5199 * Returns the new parser context or NULL
5200 */
5201htmlParserCtxtPtr
5202htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5203 const char *chunk, int size, const char *filename,
5204 xmlCharEncoding enc) {
5205 htmlParserCtxtPtr ctxt;
5206 htmlParserInputPtr inputStream;
5207 xmlParserInputBufferPtr buf;
5208
Daniel Veillardd0463562001-10-13 09:15:48 +00005209 xmlInitParser();
5210
Owen Taylor3473f882001-02-23 17:55:21 +00005211 buf = xmlAllocParserInputBuffer(enc);
5212 if (buf == NULL) return(NULL);
5213
5214 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5215 if (ctxt == NULL) {
5216 xmlFree(buf);
5217 return(NULL);
5218 }
5219 memset(ctxt, 0, sizeof(htmlParserCtxt));
5220 htmlInitParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005221 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5222 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005223 if (sax != NULL) {
5224 if (ctxt->sax != &htmlDefaultSAXHandler)
5225 xmlFree(ctxt->sax);
5226 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5227 if (ctxt->sax == NULL) {
5228 xmlFree(buf);
5229 xmlFree(ctxt);
5230 return(NULL);
5231 }
5232 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5233 if (user_data != NULL)
5234 ctxt->userData = user_data;
5235 }
5236 if (filename == NULL) {
5237 ctxt->directory = NULL;
5238 } else {
5239 ctxt->directory = xmlParserGetDirectory(filename);
5240 }
5241
5242 inputStream = htmlNewInputStream(ctxt);
5243 if (inputStream == NULL) {
5244 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005245 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005246 return(NULL);
5247 }
5248
5249 if (filename == NULL)
5250 inputStream->filename = NULL;
5251 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005252 inputStream->filename = (char *)
5253 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005254 inputStream->buf = buf;
5255 inputStream->base = inputStream->buf->buffer->content;
5256 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005257 inputStream->end =
5258 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005259
5260 inputPush(ctxt, inputStream);
5261
5262 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5263 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005264 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5265 int cur = ctxt->input->cur - ctxt->input->base;
5266
Owen Taylor3473f882001-02-23 17:55:21 +00005267 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005268
5269 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5270 ctxt->input->cur = ctxt->input->base + cur;
5271 ctxt->input->end =
5272 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005273#ifdef DEBUG_PUSH
5274 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5275#endif
5276 }
5277
5278 return(ctxt);
5279}
5280
5281/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005282 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005283 * @cur: a pointer to an array of xmlChar
5284 * @encoding: a free form C string describing the HTML document encoding, or NULL
5285 * @sax: the SAX handler block
5286 * @userData: if using SAX, this pointer will be provided on callbacks.
5287 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005288 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5289 * to handle parse events. If sax is NULL, fallback to the default DOM
5290 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005291 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005292 * Returns the resulting document tree unless SAX is NULL or the document is
5293 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005294 */
5295
5296htmlDocPtr
5297htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5298 htmlDocPtr ret;
5299 htmlParserCtxtPtr ctxt;
5300
Daniel Veillardd0463562001-10-13 09:15:48 +00005301 xmlInitParser();
5302
Owen Taylor3473f882001-02-23 17:55:21 +00005303 if (cur == NULL) return(NULL);
5304
5305
5306 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5307 if (ctxt == NULL) return(NULL);
5308 if (sax != NULL) {
5309 ctxt->sax = sax;
5310 ctxt->userData = userData;
5311 }
5312
5313 htmlParseDocument(ctxt);
5314 ret = ctxt->myDoc;
5315 if (sax != NULL) {
5316 ctxt->sax = NULL;
5317 ctxt->userData = NULL;
5318 }
5319 htmlFreeParserCtxt(ctxt);
5320
5321 return(ret);
5322}
5323
5324/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005325 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005326 * @cur: a pointer to an array of xmlChar
5327 * @encoding: a free form C string describing the HTML document encoding, or NULL
5328 *
5329 * parse an HTML in-memory document and build a tree.
5330 *
5331 * Returns the resulting document tree
5332 */
5333
5334htmlDocPtr
5335htmlParseDoc(xmlChar *cur, const char *encoding) {
5336 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5337}
5338
5339
5340/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005341 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005342 * @filename: the filename
5343 * @encoding: a free form C string describing the HTML document encoding, or NULL
5344 *
5345 * Create a parser context for a file content.
5346 * Automatic support for ZLIB/Compress compressed document is provided
5347 * by default if found at compile-time.
5348 *
5349 * Returns the new parser context or NULL
5350 */
5351htmlParserCtxtPtr
5352htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5353{
5354 htmlParserCtxtPtr ctxt;
5355 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005356 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005357 /* htmlCharEncoding enc; */
5358 xmlChar *content, *content_line = (xmlChar *) "charset=";
5359
Owen Taylor3473f882001-02-23 17:55:21 +00005360 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5361 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005362 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005363 return(NULL);
5364 }
5365 memset(ctxt, 0, sizeof(htmlParserCtxt));
5366 htmlInitParserCtxt(ctxt);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005367 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5368 if (canonicFilename == NULL) {
5369 if (xmlDefaultSAXHandler.error != NULL) {
5370 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5371 }
Daniel Veillard104caa32003-05-13 22:54:05 +00005372 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005373 return(NULL);
5374 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005375
5376 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5377 xmlFree(canonicFilename);
5378 if (inputStream == NULL) {
5379 xmlFreeParserCtxt(ctxt);
5380 return(NULL);
5381 }
Owen Taylor3473f882001-02-23 17:55:21 +00005382
5383 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005384
Owen Taylor3473f882001-02-23 17:55:21 +00005385 /* set encoding */
5386 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005387 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005388 if (content) {
5389 strcpy ((char *)content, (char *)content_line);
5390 strcat ((char *)content, (char *)encoding);
5391 htmlCheckEncoding (ctxt, content);
5392 xmlFree (content);
5393 }
5394 }
5395
5396 return(ctxt);
5397}
5398
5399/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005400 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005401 * @filename: the filename
5402 * @encoding: a free form C string describing the HTML document encoding, or NULL
5403 * @sax: the SAX handler block
5404 * @userData: if using SAX, this pointer will be provided on callbacks.
5405 *
5406 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5407 * compressed document is provided by default if found at compile-time.
5408 * It use the given SAX function block to handle the parsing callback.
5409 * If sax is NULL, fallback to the default DOM tree building routines.
5410 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005411 * Returns the resulting document tree unless SAX is NULL or the document is
5412 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005413 */
5414
5415htmlDocPtr
5416htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5417 void *userData) {
5418 htmlDocPtr ret;
5419 htmlParserCtxtPtr ctxt;
5420 htmlSAXHandlerPtr oldsax = NULL;
5421
Daniel Veillardd0463562001-10-13 09:15:48 +00005422 xmlInitParser();
5423
Owen Taylor3473f882001-02-23 17:55:21 +00005424 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5425 if (ctxt == NULL) return(NULL);
5426 if (sax != NULL) {
5427 oldsax = ctxt->sax;
5428 ctxt->sax = sax;
5429 ctxt->userData = userData;
5430 }
5431
5432 htmlParseDocument(ctxt);
5433
5434 ret = ctxt->myDoc;
5435 if (sax != NULL) {
5436 ctxt->sax = oldsax;
5437 ctxt->userData = NULL;
5438 }
5439 htmlFreeParserCtxt(ctxt);
5440
5441 return(ret);
5442}
5443
5444/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005445 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005446 * @filename: the filename
5447 * @encoding: a free form C string describing the HTML document encoding, or NULL
5448 *
5449 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5450 * compressed document is provided by default if found at compile-time.
5451 *
5452 * Returns the resulting document tree
5453 */
5454
5455htmlDocPtr
5456htmlParseFile(const char *filename, const char *encoding) {
5457 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5458}
5459
5460/**
5461 * htmlHandleOmittedElem:
5462 * @val: int 0 or 1
5463 *
5464 * Set and return the previous value for handling HTML omitted tags.
5465 *
5466 * Returns the last value for 0 for no handling, 1 for auto insertion.
5467 */
5468
5469int
5470htmlHandleOmittedElem(int val) {
5471 int old = htmlOmittedDefaultValue;
5472
5473 htmlOmittedDefaultValue = val;
5474 return(old);
5475}
5476
Daniel Veillard930dfb62003-02-05 10:17:38 +00005477/**
5478 * htmlElementAllowedHere:
5479 * @parent: HTML parent element
5480 * @elt: HTML element
5481 *
5482 * Checks whether an HTML element may be a direct child of a parent element.
5483 * Note - doesn't check for deprecated elements
5484 *
5485 * Returns 1 if allowed; 0 otherwise.
5486 */
5487int
5488htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5489 const char** p ;
5490
5491 if ( ! elt || ! parent || ! parent->subelts )
5492 return 0 ;
5493
5494 for ( p = parent->subelts; *p; ++p )
5495 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5496 return 1 ;
5497
5498 return 0 ;
5499}
5500/**
5501 * htmlElementStatusHere:
5502 * @parent: HTML parent element
5503 * @elt: HTML element
5504 *
5505 * Checks whether an HTML element may be a direct child of a parent element.
5506 * and if so whether it is valid or deprecated.
5507 *
5508 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5509 */
5510htmlStatus
5511htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5512 if ( ! parent || ! elt )
5513 return HTML_INVALID ;
5514 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5515 return HTML_INVALID ;
5516
5517 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5518}
5519/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005520 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005521 * @elt: HTML element
5522 * @attr: HTML attribute
5523 * @legacy: whether to allow deprecated attributes
5524 *
5525 * Checks whether an attribute is valid for an element
5526 * Has full knowledge of Required and Deprecated attributes
5527 *
5528 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5529 */
5530htmlStatus
5531htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5532 const char** p ;
5533
5534 if ( !elt || ! attr )
5535 return HTML_INVALID ;
5536
5537 if ( elt->attrs_req )
5538 for ( p = elt->attrs_req; *p; ++p)
5539 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5540 return HTML_REQUIRED ;
5541
5542 if ( elt->attrs_opt )
5543 for ( p = elt->attrs_opt; *p; ++p)
5544 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5545 return HTML_VALID ;
5546
5547 if ( legacy && elt->attrs_depr )
5548 for ( p = elt->attrs_depr; *p; ++p)
5549 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5550 return HTML_DEPRECATED ;
5551
5552 return HTML_INVALID ;
5553}
5554/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005555 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005556 * @node: an htmlNodePtr in a tree
5557 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005558 * for Element nodes)
5559 *
5560 * Checks whether the tree node is valid. Experimental (the author
5561 * only uses the HTML enhancements in a SAX parser)
5562 *
5563 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5564 * legacy allowed) or htmlElementStatusHere (otherwise).
5565 * for Attribute nodes, a return from htmlAttrAllowed
5566 * for other nodes, HTML_NA (no checks performed)
5567 */
5568htmlStatus
5569htmlNodeStatus(const htmlNodePtr node, int legacy) {
5570 if ( ! node )
5571 return HTML_INVALID ;
5572
5573 switch ( node->type ) {
5574 case XML_ELEMENT_NODE:
5575 return legacy
5576 ? ( htmlElementAllowedHere (
5577 htmlTagLookup(node->parent->name) , node->name
5578 ) ? HTML_VALID : HTML_INVALID )
5579 : htmlElementStatusHere(
5580 htmlTagLookup(node->parent->name) ,
5581 htmlTagLookup(node->name) )
5582 ;
5583 case XML_ATTRIBUTE_NODE:
5584 return htmlAttrAllowed(
5585 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5586 default: return HTML_NA ;
5587 }
5588}
Owen Taylor3473f882001-02-23 17:55:21 +00005589#endif /* LIBXML_HTML_ENABLED */