blob: 86c575f051dcdb90cd6385530cc63a2a9297ef97 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Owen Taylor3473f882001-02-23 17:55:21 +000062 * Parser stacks related functions and macros *
63 * *
64 ************************************************************************/
65
Daniel Veillard1c732d22002-11-30 11:22:59 +000066/**
67 * htmlnamePush:
68 * @ctxt: an HTML parser context
69 * @value: the element name
70 *
71 * Pushes a new element name on top of the name stack
72 *
73 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000074 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000075static int
76htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
77{
78 if (ctxt->nameNr >= ctxt->nameMax) {
79 ctxt->nameMax *= 2;
80 ctxt->nameTab =
81 (xmlChar * *)xmlRealloc(ctxt->nameTab,
82 ctxt->nameMax *
83 sizeof(ctxt->nameTab[0]));
84 if (ctxt->nameTab == NULL) {
85 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
86 return (0);
87 }
88 }
89 ctxt->nameTab[ctxt->nameNr] = value;
90 ctxt->name = value;
91 return (ctxt->nameNr++);
92}
93/**
94 * htmlnamePop:
95 * @ctxt: an HTML parser context
96 *
97 * Pops the top element name from the name stack
98 *
99 * Returns the name just removed
100 */
101static xmlChar *
102htmlnamePop(htmlParserCtxtPtr ctxt)
103{
104 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000105
Daniel Veillard1c732d22002-11-30 11:22:59 +0000106 if (ctxt->nameNr <= 0)
107 return (0);
108 ctxt->nameNr--;
109 if (ctxt->nameNr < 0)
110 return (0);
111 if (ctxt->nameNr > 0)
112 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
113 else
114 ctxt->name = NULL;
115 ret = ctxt->nameTab[ctxt->nameNr];
116 ctxt->nameTab[ctxt->nameNr] = 0;
117 return (ret);
118}
Owen Taylor3473f882001-02-23 17:55:21 +0000119
/*
 * Macros for accessing the input. They are private to the parser and
 * must not be exported. Every one of them expects a variable named
 * "ctxt" (the parser context) to be in scope at the point of use.
 *
 * Byte-level macros: they look at raw xmlChar values without decoding,
 * so they should only be compared against ASCII values.
 *
 *   CUR_PTR   the current input pointer.
 *   CUR       the xmlChar at the current position, as an int.
 *   CURRENT   same expansion as CUR.
 *   RAW       -1 while ctxt->token is set, otherwise the current xmlChar.
 *   NXT(n)    the xmlChar n positions after the current one.
 *   UPPER     the current xmlChar passed through toupper().
 *   UPP(n)    the n'th next xmlChar passed through toupper().
 *   SKIP(n)   advance the input pointer, the column and nbChars by n;
 *             the line counter is not touched, so use it only on
 *             strings without newlines.
 *
 * Buffer and stream helpers:
 *
 *   SHRINK       calls xmlParserInputShrink() on the current input.
 *   GROW         calls xmlParserInputGrow() with INPUT_CHUNK.
 *   SKIP_BLANKS  calls htmlSkipBlankChars().
 *   NEXT         calls xmlNextChar().
 *
 * Character-level helpers:
 *
 *   CUR_CHAR(l)       htmlCurrentChar(); the length is stored in l.
 *   CUR_SCHAR(s, l)   xmlStringCurrentChar() on s; length stored in l.
 *   NEXTL(l)          step over the current character of l xmlChars:
 *                     updates line/col, clears ctxt->token and counts
 *                     one character in nbChars.
 *   COPY_BUF(l,b,i,v) append character v of length l to b at index i,
 *                     advancing i.
 */

#define CUR_PTR ctxt->input->cur

#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define NEXT xmlNextChar(ctxt)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
197
198/**
199 * htmlCurrentChar:
200 * @ctxt: the HTML parser context
201 * @len: pointer to the length of the char read
202 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * bytes in the input buffer. Implement the end of line normalization:
205 * 2.11 End-of-Line Handling
206 * If the encoding is unspecified, in the case we find an ISO-Latin-1
207 * char, then the encoding converter is plugged in automatically.
208 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000209 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000210 */
211
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000212static int
Owen Taylor3473f882001-02-23 17:55:21 +0000213htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
214 if (ctxt->instate == XML_PARSER_EOF)
215 return(0);
216
217 if (ctxt->token != 0) {
218 *len = 0;
219 return(ctxt->token);
220 }
221 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
222 /*
223 * We are supposed to handle UTF8, check it's valid
224 * From rfc2044: encoding of the Unicode values on UTF-8:
225 *
226 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
227 * 0000 0000-0000 007F 0xxxxxxx
228 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
229 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
230 *
231 * Check for the 0x110000 limit too
232 */
233 const unsigned char *cur = ctxt->input->cur;
234 unsigned char c;
235 unsigned int val;
236
237 c = *cur;
238 if (c & 0x80) {
239 if (cur[1] == 0)
240 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
241 if ((cur[1] & 0xc0) != 0x80)
242 goto encoding_error;
243 if ((c & 0xe0) == 0xe0) {
244
245 if (cur[2] == 0)
246 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
247 if ((cur[2] & 0xc0) != 0x80)
248 goto encoding_error;
249 if ((c & 0xf0) == 0xf0) {
250 if (cur[3] == 0)
251 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
252 if (((c & 0xf8) != 0xf0) ||
253 ((cur[3] & 0xc0) != 0x80))
254 goto encoding_error;
255 /* 4-byte code */
256 *len = 4;
257 val = (cur[0] & 0x7) << 18;
258 val |= (cur[1] & 0x3f) << 12;
259 val |= (cur[2] & 0x3f) << 6;
260 val |= cur[3] & 0x3f;
261 } else {
262 /* 3-byte code */
263 *len = 3;
264 val = (cur[0] & 0xf) << 12;
265 val |= (cur[1] & 0x3f) << 6;
266 val |= cur[2] & 0x3f;
267 }
268 } else {
269 /* 2-byte code */
270 *len = 2;
271 val = (cur[0] & 0x1f) << 6;
272 val |= cur[1] & 0x3f;
273 }
274 if (!IS_CHAR(val)) {
275 ctxt->errNo = XML_ERR_INVALID_ENCODING;
276 if ((ctxt->sax != NULL) &&
277 (ctxt->sax->error != NULL))
278 ctxt->sax->error(ctxt->userData,
279 "Char 0x%X out of allowed range\n", val);
280 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000281 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000282 }
283 return(val);
284 } else {
285 /* 1-byte code */
286 *len = 1;
287 return((int) *ctxt->input->cur);
288 }
289 }
290 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000291 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000292 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000293 * XML constructs only use < 128 chars
294 */
295 *len = 1;
296 if ((int) *ctxt->input->cur < 0x80)
297 return((int) *ctxt->input->cur);
298
299 /*
300 * Humm this is bad, do an automatic flow conversion
301 */
302 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
303 ctxt->charset = XML_CHAR_ENCODING_UTF8;
304 return(xmlCurrentChar(ctxt, len));
305
306encoding_error:
307 /*
308 * If we detect an UTF8 error that probably mean that the
309 * input encoding didn't get properly advertized in the
310 * declaration header. Report the error and switch the encoding
311 * to ISO-Latin-1 (if you don't like this policy, just declare the
312 * encoding !)
313 */
314 ctxt->errNo = XML_ERR_INVALID_ENCODING;
315 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
316 ctxt->sax->error(ctxt->userData,
317 "Input is not proper UTF-8, indicate encoding !\n");
318 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
319 ctxt->input->cur[0], ctxt->input->cur[1],
320 ctxt->input->cur[2], ctxt->input->cur[3]);
321 }
322
323 ctxt->charset = XML_CHAR_ENCODING_8859_1;
324 *len = 1;
325 return((int) *ctxt->input->cur);
326}
327
328/**
Owen Taylor3473f882001-02-23 17:55:21 +0000329 * htmlSkipBlankChars:
330 * @ctxt: the HTML parser context
331 *
332 * skip all blanks character found at that point in the input streams.
333 *
334 * Returns the number of space chars skipped
335 */
336
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000337static int
Owen Taylor3473f882001-02-23 17:55:21 +0000338htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
339 int res = 0;
340
341 while (IS_BLANK(*(ctxt->input->cur))) {
342 if ((*ctxt->input->cur == 0) &&
343 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
344 xmlPopInput(ctxt);
345 } else {
346 if (*(ctxt->input->cur) == '\n') {
347 ctxt->input->line++; ctxt->input->col = 1;
348 } else ctxt->input->col++;
349 ctxt->input->cur++;
350 ctxt->nbChars++;
351 if (*ctxt->input->cur == 0)
352 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
353 }
354 res++;
355 }
356 return(res);
357}
358
359
360
361/************************************************************************
362 * *
363 * The list of HTML elements and their properties *
364 * *
365 ************************************************************************/
366
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000379
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma-separated list of element names
 * meant to be dropped into an array initializer. Composite groups must
 * join their sub-groups with explicit commas: two adjacent string
 * literals are concatenated by the compiler, which would silently merge
 * the last name of one group with the first name of the next.
 * PCDATA and MODIFIER expand to nothing and act as markers only.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


/* NULL-terminated lists of every flow (block + inline) / inline element */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
402
403
/*
 * ... and for HTML Attributes
 *
 * Each macro expands to a comma-separated list of attribute names;
 * ATTRS is the union of the core, i18n and event groups.
 */

#define COREATTRS   "id", "class", "style", "title"
#define I18N        "lang", "dir"
#define EVENTS      "onclick", "ondblclick", "onmousedown", "onmouseup", \
                    "onmouseover", "onmouseout", "onkeypress", \
                    "onkeydown", "onkeyup"
#define ATTRS       COREATTRS,I18N,EVENTS
#define CELLHALIGN  "align", "char", "charoff"
#define CELLVALIGN  "valign"

/* NULL-terminated attribute lists built from the groups above */
static const char *html_attrs[]      = { ATTRS, NULL };
static const char *core_i18n_attrs[] = { COREATTRS, I18N, NULL };
static const char *core_attrs[]      = { COREATTRS, NULL };
static const char *i18n_attrs[]      = { I18N, NULL };
417
418
419/* Other declarations that should go inline ... */
420static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
421 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
422 "tabindex", "onfocus", "onblur", NULL } ;
423static const char* target_attr[] = { "target", NULL } ;
424static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
425static const char* alt_attr[] = { "alt", NULL } ;
426static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
427static const char* href_attrs[] = { "href", NULL } ;
428static const char* clear_attrs[] = { "clear", NULL } ;
429static const char* inline_p[] = { INLINE, "p", NULL } ;
430static const char* flow_param[] = { FLOW, "param", NULL } ;
431static const char* applet_attrs[] = { COREATTRS , "codebase",
432 "archive", "alt", "name", "height", "width", "align",
433 "hspace", "vspace", NULL } ;
434static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
435 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
436static const char* basefont_attrs[] =
437 { "id", "size", "color", "face", NULL } ;
438static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
439static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
440static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
441static const char* body_depr[] = { "background", "bgcolor", "text",
442 "link", "vlink", "alink", NULL } ;
443static const char* button_attrs[] = { ATTRS, "name", "value", "type",
444 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
445
446
447static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
448static const char* col_elt[] = { "col", NULL } ;
449static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
450static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
451static const char* dl_contents[] = { "dt", "dd", NULL } ;
452static const char* compact_attr[] = { "compact", NULL } ;
453static const char* label_attr[] = { "label", NULL } ;
454static const char* fieldset_contents[] = { FLOW, "legend" } ;
455static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
456static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
457static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
458static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
459static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
460static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
461static const char* head_attrs[] = { I18N, "profile", NULL } ;
462static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
463static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
464static const char* version_attr[] = { "version", NULL } ;
465static const char* html_content[] = { "head", "body", "frameset", NULL } ;
466static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
467static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
468static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
469static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
470static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
471static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
472static const char* align_attr[] = { "align", NULL } ;
473static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
474static const char* map_contents[] = { BLOCK, "area", NULL } ;
475static const char* name_attr[] = { "name", NULL } ;
476static const char* action_attr[] = { "action", NULL } ;
477static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
478static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
479static const char* content_attr[] = { "content", NULL } ;
480static const char* type_attr[] = { "type", NULL } ;
481static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
482static const char* object_contents[] = { FLOW, "param", NULL } ;
483static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
484static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
485static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
486static const char* option_elt[] = { "option", NULL } ;
487static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
488static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
489static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
490static const char* width_attr[] = { "width", NULL } ;
491static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
492static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
493static const char* language_attr[] = { "language", NULL } ;
494static const char* select_content[] = { "optgroup", "option", NULL } ;
495static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
496static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
497static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
498static const char* table_depr[] = { "align", "bgcolor", NULL } ;
499static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
500static const char* tr_elt[] = { "tr", NULL } ;
501static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
502static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
503static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
504static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
505static const char* tr_contents[] = { "th", "td", NULL } ;
506static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
507static const char* li_elt[] = { "li", NULL } ;
508static const char* ul_depr[] = { "type", "compact", NULL} ;
509static const char* dir_attr[] = { "dir", NULL} ;
510
511#define DECL (const char**)
512
Daniel Veillard22090732001-07-16 00:06:07 +0000513static const htmlElemDesc
514html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000515{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
516 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
517},
518{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
519 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
520},
521{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
522 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
523},
524{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
525 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
526},
527{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
528 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
529},
530{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
531 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
532},
533{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
534 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
535},
536{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
537 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
538},
539{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
540 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
541},
542{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
543 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
544},
545{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
546 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
547},
548{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
549 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
550},
551{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
552 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
553},
554{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
555 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
556},
557{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
558 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
559},
560{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
561 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
562},
563{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
564 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
565},
566{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
567 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
568},
569{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
570 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
571},
572{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
573 EMPTY , NULL , DECL col_attrs , NULL, NULL
574},
575{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
576 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
577},
578{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
579 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
580},
581{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
582 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
583},
584{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
585 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
586},
587{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
588 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
589},
590{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
591 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
592},
593{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
594 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
595},
596{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
597 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
598},
599{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
600 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
601},
602{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
603 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
604},
605{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
606 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
607},
608{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
609 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
610},
611{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
612 EMPTY, NULL, NULL, DECL frame_attrs, NULL
613},
614{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
615 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
616},
617{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
618 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
619},
620{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
621 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
622},
623{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
624 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
625},
626{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
627 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
628},
629{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
630 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
631},
632{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
633 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
634},
635{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
636 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
637},
638{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
639 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
640},
641{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
642 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
643},
644{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
645 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
646},
647{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
648 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
649},
650{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
651 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
652},
653{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
654 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
655},
656{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
657 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
658},
659{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
660 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
661},
662{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
663 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
664},
665{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
666 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
667},
668{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
669 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
670},
671{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
672 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
673},
674{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
675 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
676},
677{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
678 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
679},
680{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
681 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
682},
683{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
684 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
685},
686{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
687 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
688},
689{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
690 DECL html_flow, "div", DECL html_attrs, NULL, NULL
691},
692{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
693 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
694},
695{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
696 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
697},
698{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
699 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
700},
701{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
702 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
703},
704{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
705 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
706},
707{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
708 EMPTY, NULL, DECL param_attrs, NULL, name_attr
709},
710{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
711 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
712},
713{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
714 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
715},
716{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
717 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
718},
719{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
720 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
721},
722{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
723 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
724},
725{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
726 DECL select_content, NULL, DECL select_attrs, NULL, NULL
727},
728{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
729 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
730},
731{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733},
734{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
735 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
736},
737{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
738 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
739},
740{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
741 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
742},
743{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
744 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
745},
746{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
747 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
748},
749{ "table", 0, 0, 0, 0, 0, 0, 0, "",
750 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
751},
752{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
753 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
754},
755{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
756 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
757},
758{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
759 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
760},
761{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
762 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
763},
764{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
765 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
766},
767{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
768 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
769},
770{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
771 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
772},
773{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
774 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
775},
776{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
777 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
778},
779{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
780 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
781},
782{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
783 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
784},
785{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
786 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
787}
Owen Taylor3473f882001-02-23 17:55:21 +0000788};
789
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups.  The first
 * string of each group is a start tag, the strings following it (up to
 * the group's NULL) are the open elements which that start tag closes.
 * A NULL in the position of a group's first string ends the table.
 */
static const char *htmlStartClose[] = {
    /* start tag    elements closed by it */
    "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                    "dl", "ul", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", "head", NULL,
    "head",         "p", NULL,
    "title",        "p", NULL,
    "body",         "head", "style", "link", "title", "p", NULL,
    "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                    "pre", "listing", "xmp", "head", "li", NULL,
    "hr",           "p", "head", NULL,
    "h1",           "p", "head", NULL,
    "h2",           "p", "head", NULL,
    "h3",           "p", "head", NULL,
    "h4",           "p", "head", NULL,
    "h5",           "p", "head", NULL,
    "h6",           "p", "head", NULL,
    "dir",          "p", "head", NULL,
    "address",      "p", "head", "ul", NULL,
    "pre",          "p", "head", "ul", NULL,
    "listing",      "p", "head", NULL,
    "xmp",          "p", "head", NULL,
    "blockquote",   "p", "head", NULL,
    "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                    "xmp", "head", NULL,
    "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dd", NULL,
    "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dt", NULL,
    "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", NULL,
    "ol",           "p", "head", "ul", NULL,
    "menu",         "p", "head", "ul", NULL,
    "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",          "p", "head", NULL,
    "noscript",     "p", "head", NULL,
    "center",       "font", "b", "i", "p", "head", NULL,
    "a",            "a", NULL,
    "caption",      "p", NULL,
    "colgroup",     "caption", "colgroup", "col", "p", NULL,
    "col",          "caption", "col", "p", NULL,
    "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                    "listing", "xmp", "a", NULL,
    "th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",        "caption", "col", "colgroup", NULL,
    "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tbody", "p", NULL,
    "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tfoot", "tbody", "p", NULL,
    "optgroup",     "option", NULL,
    "option",       "option", NULL,
    "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                    "pre", "listing", "xmp", "a", NULL,
    NULL
};
849
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly and in which a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
863
/*
 * The list of HTML attributes which are of content %Script;
 * These are the intrinsic event attributes of HTML 4.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
889
/*
 * Priorities used by the HTML parser to cope with broken HTML pages.
 * Each element gets a priority; when an unexpected end tag shows up the
 * parser compares priorities to decide what that end tag may close.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The list is terminated by the entry whose name is NULL; the priority of
 * that entry is the one used for every element not named in the table.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",    150 },
    { "td",     160 },
    { "th",     160 },
    { "tr",     170 },
    { "thead",  180 },
    { "tbody",  180 },
    { "tfoot",  180 },
    { "table",  190 },
    { "head",   200 },
    { "body",   200 },
    { "html",   220 },
    { NULL,     100 }   /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000917
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000918static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000919static int htmlStartCloseIndexinitialized = 0;
920
921/************************************************************************
922 * *
923 * functions to handle HTML specific data *
924 * *
925 ************************************************************************/
926
927/**
928 * htmlInitAutoClose:
929 *
930 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
931 * This is not reentrant. Call xmlInitParser() once before processing in
932 * case of use in multithreaded programs.
933 */
934void
935htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000936 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000937
938 if (htmlStartCloseIndexinitialized) return;
939
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000940 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
941 indx = 0;
942 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
943 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000944 while (htmlStartClose[i] != NULL) i++;
945 i++;
946 }
947 htmlStartCloseIndexinitialized = 1;
948}
949
950/**
951 * htmlTagLookup:
952 * @tag: The tag name in lowercase
953 *
954 * Lookup the HTML tag in the ElementTable
955 *
956 * Returns the related htmlElemDescPtr or NULL if not found.
957 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000958const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000959htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000960 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000961
962 for (i = 0; i < (sizeof(html40ElementTable) /
963 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000964 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000965 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000966 }
967 return(NULL);
968}
969
970/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000971 * htmlGetEndPriority:
972 * @name: The name of the element to look up the priority for.
973 *
974 * Return value: The "endtag" priority.
975 **/
976static int
977htmlGetEndPriority (const xmlChar *name) {
978 int i = 0;
979
980 while ((htmlEndPriority[i].name != NULL) &&
981 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
982 i++;
983
984 return(htmlEndPriority[i].priority);
985}
986
987/**
Owen Taylor3473f882001-02-23 17:55:21 +0000988 * htmlCheckAutoClose:
989 * @newtag: The new tag name
990 * @oldtag: The old tag name
991 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000992 * Checks whether the new tag is one of the registered valid tags for
993 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000994 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
995 *
996 * Returns 0 if no, 1 if yes.
997 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000998static int
Owen Taylor3473f882001-02-23 17:55:21 +0000999htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001000 int i, indx;
1001 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001002
1003 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
1004
1005 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001006 for (indx = 0; indx < 100;indx++) {
1007 closed = htmlStartCloseIndex[indx];
1008 if (closed == NULL) return(0);
1009 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001010 }
1011
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001012 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001013 i++;
1014 while (htmlStartClose[i] != NULL) {
1015 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1016 return(1);
1017 }
1018 i++;
1019 }
1020 return(0);
1021}
1022
1023/**
1024 * htmlAutoCloseOnClose:
1025 * @ctxt: an HTML parser context
1026 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001027 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001028 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001029 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001030 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001031static void
Owen Taylor3473f882001-02-23 17:55:21 +00001032htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +00001033 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00001034 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001035 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001036
1037#ifdef DEBUG
1038 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
1039 for (i = 0;i < ctxt->nameNr;i++)
1040 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1041#endif
1042
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001043 priority = htmlGetEndPriority (newtag);
1044
Owen Taylor3473f882001-02-23 17:55:21 +00001045 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001046
Owen Taylor3473f882001-02-23 17:55:21 +00001047 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001048 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001049 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001050 * or equal priority, so if we find an element with higher
1051 * priority before we find an element with
1052 * matching name, we just ignore this endtag
1053 */
1054 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +00001055 }
1056 if (i < 0) return;
1057
1058 while (!xmlStrEqual(newtag, ctxt->name)) {
1059 info = htmlTagLookup(ctxt->name);
1060 if ((info == NULL) || (info->endTag == 1)) {
1061#ifdef DEBUG
1062 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
1063#endif
Daniel Veillard56098d42001-04-24 12:51:09 +00001064 } else if (info->endTag == 3) {
1065#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001066 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +00001067
Daniel Veillard56098d42001-04-24 12:51:09 +00001068#endif
1069 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1070 ctxt->sax->error(ctxt->userData,
1071 "Opening and ending tag mismatch: %s and %s\n",
1072 newtag, ctxt->name);
1073 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001074 }
1075 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1076 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1077 oldname = htmlnamePop(ctxt);
1078 if (oldname != NULL) {
1079#ifdef DEBUG
1080 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
1081#endif
1082 xmlFree(oldname);
1083 }
1084 }
1085}
1086
1087/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001088 * htmlAutoCloseOnEnd:
1089 * @ctxt: an HTML parser context
1090 *
1091 * Close all remaining tags at the end of the stream
1092 */
1093static void
1094htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
1095 xmlChar *oldname;
1096 int i;
1097
1098 if (ctxt->nameNr == 0)
1099 return;
1100#ifdef DEBUG
1101 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
1102#endif
1103
1104 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1105#ifdef DEBUG
1106 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1107#endif
1108 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1109 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1110 oldname = htmlnamePop(ctxt);
1111 if (oldname != NULL) {
1112#ifdef DEBUG
1113 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
1114#endif
1115 xmlFree(oldname);
1116 }
1117 }
1118}
1119
1120/**
Owen Taylor3473f882001-02-23 17:55:21 +00001121 * htmlAutoClose:
1122 * @ctxt: an HTML parser context
1123 * @newtag: The new tag name or NULL
1124 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001125 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001126 * The list is kept in htmlStartClose array. This function is
1127 * called when a new tag has been detected and generates the
1128 * appropriates closes if possible/needed.
1129 * If newtag is NULL this mean we are at the end of the resource
1130 * and we should check
1131 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001132static void
Owen Taylor3473f882001-02-23 17:55:21 +00001133htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1134 xmlChar *oldname;
1135 while ((newtag != NULL) && (ctxt->name != NULL) &&
1136 (htmlCheckAutoClose(newtag, ctxt->name))) {
1137#ifdef DEBUG
1138 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1139#endif
1140 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1141 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1142 oldname = htmlnamePop(ctxt);
1143 if (oldname != NULL) {
1144#ifdef DEBUG
1145 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1146#endif
1147 xmlFree(oldname);
1148 }
1149 }
1150 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001151 htmlAutoCloseOnEnd(ctxt);
1152 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154 while ((newtag == NULL) && (ctxt->name != NULL) &&
1155 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
1156 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
1157 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
1158#ifdef DEBUG
1159 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
1160#endif
1161 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1162 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1163 oldname = htmlnamePop(ctxt);
1164 if (oldname != NULL) {
1165#ifdef DEBUG
1166 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1167#endif
1168 xmlFree(oldname);
1169 }
1170 }
1171
1172}
1173
1174/**
1175 * htmlAutoCloseTag:
1176 * @doc: the HTML document
1177 * @name: The tag name
1178 * @elem: the HTML element
1179 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001180 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001181 * The list is kept in htmlStartClose array. This function checks
1182 * if the element or one of it's children would autoclose the
1183 * given tag.
1184 *
1185 * Returns 1 if autoclose, 0 otherwise
1186 */
1187int
1188htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1189 htmlNodePtr child;
1190
1191 if (elem == NULL) return(1);
1192 if (xmlStrEqual(name, elem->name)) return(0);
1193 if (htmlCheckAutoClose(elem->name, name)) return(1);
1194 child = elem->children;
1195 while (child != NULL) {
1196 if (htmlAutoCloseTag(doc, name, child)) return(1);
1197 child = child->next;
1198 }
1199 return(0);
1200}
1201
1202/**
1203 * htmlIsAutoClosed:
1204 * @doc: the HTML document
1205 * @elem: the HTML element
1206 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001207 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001208 * The list is kept in htmlStartClose array. This function checks
1209 * if a tag is autoclosed by one of it's child
1210 *
1211 * Returns 1 if autoclosed, 0 otherwise
1212 */
1213int
1214htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1215 htmlNodePtr child;
1216
1217 if (elem == NULL) return(1);
1218 child = elem->children;
1219 while (child != NULL) {
1220 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1221 child = child->next;
1222 }
1223 return(0);
1224}
1225
1226/**
1227 * htmlCheckImplied:
1228 * @ctxt: an HTML parser context
1229 * @newtag: The new tag name
1230 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001231 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001232 * called when a new tag has been detected and generates the
1233 * appropriates implicit tags if missing
1234 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001235static void
Owen Taylor3473f882001-02-23 17:55:21 +00001236htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1237 if (!htmlOmittedDefaultValue)
1238 return;
1239 if (xmlStrEqual(newtag, BAD_CAST"html"))
1240 return;
1241 if (ctxt->nameNr <= 0) {
1242#ifdef DEBUG
1243 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
1244#endif
1245 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
1246 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1247 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1248 }
1249 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1250 return;
1251 if ((ctxt->nameNr <= 1) &&
1252 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1253 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1254 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1255 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1256 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1257 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1258 /*
1259 * dropped OBJECT ... i you put it first BODY will be
1260 * assumed !
1261 */
1262#ifdef DEBUG
1263 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
1264#endif
1265 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
1266 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1267 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1268 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1269 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1270 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1271 int i;
1272 for (i = 0;i < ctxt->nameNr;i++) {
1273 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1274 return;
1275 }
1276 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1277 return;
1278 }
1279 }
1280
1281#ifdef DEBUG
1282 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
1283#endif
1284 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
1285 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1286 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1287 }
1288}
1289
1290/**
1291 * htmlCheckParagraph
1292 * @ctxt: an HTML parser context
1293 *
1294 * Check whether a p element need to be implied before inserting
1295 * characters in the current element.
1296 *
1297 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1298 * in case of error.
1299 */
1300
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001301static int
Owen Taylor3473f882001-02-23 17:55:21 +00001302htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1303 const xmlChar *tag;
1304 int i;
1305
1306 if (ctxt == NULL)
1307 return(-1);
1308 tag = ctxt->name;
1309 if (tag == NULL) {
1310 htmlAutoClose(ctxt, BAD_CAST"p");
1311 htmlCheckImplied(ctxt, BAD_CAST"p");
1312 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1313 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1314 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1315 return(1);
1316 }
1317 if (!htmlOmittedDefaultValue)
1318 return(0);
1319 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1320 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1321#ifdef DEBUG
1322 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1323#endif
1324 htmlAutoClose(ctxt, BAD_CAST"p");
1325 htmlCheckImplied(ctxt, BAD_CAST"p");
1326 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1327 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1328 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1329 return(1);
1330 }
1331 }
1332 return(0);
1333}
1334
1335/**
1336 * htmlIsScriptAttribute:
1337 * @name: an attribute name
1338 *
1339 * Check if an attribute is of content type Script
1340 *
1341 * Returns 1 is the attribute is a script 0 otherwise
1342 */
1343int
1344htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001345 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001346
1347 if (name == NULL)
1348 return(0);
1349 /*
1350 * all script attributes start with 'on'
1351 */
1352 if ((name[0] != 'o') || (name[1] != 'n'))
1353 return(0);
1354 for (i = 0;
1355 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1356 i++) {
1357 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1358 return(1);
1359 }
1360 return(0);
1361}
1362
1363/************************************************************************
1364 * *
1365 * The list of HTML predefined entities *
1366 * *
1367 ************************************************************************/
1368
1369
Daniel Veillard22090732001-07-16 00:06:07 +00001370static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001371/*
1372 * the 4 absolute ones, plus apostrophe.
1373 */
1374{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1375{ 38, "amp", "ampersand, U+0026 ISOnum" },
1376{ 39, "apos", "single quote" },
1377{ 60, "lt", "less-than sign, U+003C ISOnum" },
1378{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1379
1380/*
1381 * A bunch still in the 128-255 range
1382 * Replacing them depend really on the charset used.
1383 */
1384{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1385{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1386{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1387{ 163, "pound","pound sign, U+00A3 ISOnum" },
1388{ 164, "curren","currency sign, U+00A4 ISOnum" },
1389{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1390{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1391{ 167, "sect", "section sign, U+00A7 ISOnum" },
1392{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1393{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1394{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1395{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1396{ 172, "not", "not sign, U+00AC ISOnum" },
1397{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1398{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1399{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1400{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1401{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1402{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1403{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1404{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1405{ 181, "micro","micro sign, U+00B5 ISOnum" },
1406{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1407{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1408{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1409{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1410{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1411{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1412{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1413{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1414{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1415{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1416{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1417{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1418{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1419{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1420{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1421{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1422{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1423{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1424{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1425{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1426{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1427{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1428{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1429{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1430{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1431{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1432{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1433{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1434{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1435{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1436{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1437{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1438{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1439{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1440{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1441{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1442{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1443{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1444{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1445{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1446{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1447{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1448{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1449{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1450{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1451{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1452{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1453{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1454{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1455{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1456{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1457{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1458{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1459{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1460{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1461{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1462{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1463{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1464{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1465{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1466{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1467{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1468{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1469{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1470{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1471{ 247, "divide","division sign, U+00F7 ISOnum" },
1472{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1473{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1474{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1475{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1476{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1477{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1478{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1479{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1480
1481{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1482{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1483{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1484{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1485{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1486
1487/*
1488 * Anything below should really be kept as entities references
1489 */
1490{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1491
1492{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1493{ 732, "tilde","small tilde, U+02DC ISOdia" },
1494
1495{ 913, "Alpha","greek capital letter alpha, U+0391" },
1496{ 914, "Beta", "greek capital letter beta, U+0392" },
1497{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1498{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1499{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1500{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1501{ 919, "Eta", "greek capital letter eta, U+0397" },
1502{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1503{ 921, "Iota", "greek capital letter iota, U+0399" },
1504{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001505{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001506{ 924, "Mu", "greek capital letter mu, U+039C" },
1507{ 925, "Nu", "greek capital letter nu, U+039D" },
1508{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1509{ 927, "Omicron","greek capital letter omicron, U+039F" },
1510{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1511{ 929, "Rho", "greek capital letter rho, U+03A1" },
1512{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1513{ 932, "Tau", "greek capital letter tau, U+03A4" },
1514{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1515{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1516{ 935, "Chi", "greek capital letter chi, U+03A7" },
1517{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1518{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1519
1520{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1521{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1522{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1523{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1524{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1525{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1526{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1527{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1528{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1529{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1530{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1531{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1532{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1533{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1534{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1535{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1536{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1537{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1538{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1539{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1540{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1541{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1542{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1543{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1544{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1545{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1546{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1547{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1548
1549{ 8194, "ensp", "en space, U+2002 ISOpub" },
1550{ 8195, "emsp", "em space, U+2003 ISOpub" },
1551{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1552{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1553{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1554{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1555{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1556{ 8211, "ndash","en dash, U+2013 ISOpub" },
1557{ 8212, "mdash","em dash, U+2014 ISOpub" },
1558{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1559{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1560{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1561{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1562{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1563{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1564{ 8224, "dagger","dagger, U+2020 ISOpub" },
1565{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1566
1567{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1568{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1569
1570{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1571
1572{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1573{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1574
1575{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1576{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1577
1578{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1579{ 8260, "frasl","fraction slash, U+2044 NEW" },
1580
1581{ 8364, "euro", "euro sign, U+20AC NEW" },
1582
1583{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1584{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1585{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1586{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1587{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1588{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1589{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1590{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1591{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1592{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1593{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1594{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1595{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1596{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1597{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1598{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1599
1600{ 8704, "forall","for all, U+2200 ISOtech" },
1601{ 8706, "part", "partial differential, U+2202 ISOtech" },
1602{ 8707, "exist","there exists, U+2203 ISOtech" },
1603{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1604{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1605{ 8712, "isin", "element of, U+2208 ISOtech" },
1606{ 8713, "notin","not an element of, U+2209 ISOtech" },
1607{ 8715, "ni", "contains as member, U+220B ISOtech" },
1608{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001609{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001610{ 8722, "minus","minus sign, U+2212 ISOtech" },
1611{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1612{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1613{ 8733, "prop", "proportional to, U+221D ISOtech" },
1614{ 8734, "infin","infinity, U+221E ISOtech" },
1615{ 8736, "ang", "angle, U+2220 ISOamso" },
1616{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1617{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1618{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1619{ 8746, "cup", "union = cup, U+222A ISOtech" },
1620{ 8747, "int", "integral, U+222B ISOtech" },
1621{ 8756, "there4","therefore, U+2234 ISOtech" },
1622{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1623{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1624{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1625{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1626{ 8801, "equiv","identical to, U+2261 ISOtech" },
1627{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1628{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1629{ 8834, "sub", "subset of, U+2282 ISOtech" },
1630{ 8835, "sup", "superset of, U+2283 ISOtech" },
1631{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1632{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1633{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1634{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1635{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1636{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1637{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1638{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1639{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1640{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1641{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1642{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1643{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1644{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1645
1646{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1647{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1648{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1649{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1650
1651};
1652
1653/************************************************************************
1654 * *
1655 * Commodity functions to handle entities *
1656 * *
1657 ************************************************************************/
1658
/*
 * Macro used to grow the current buffer.
 *
 * Expects two variables to be visible at the expansion site: `buffer`
 * (an xmlChar *) and `buffer_size` (its capacity). The capacity is
 * doubled and the buffer reallocated to the new size.
 *
 * On allocation failure an error is reported, the previous buffer is
 * released (the reallocation is done through a temporary so the old block
 * is not leaked) and the *enclosing function* returns NULL. The macro can
 * therefore only be used inside functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1670
1671/**
1672 * htmlEntityLookup:
1673 * @name: the entity name
1674 *
1675 * Lookup the given entity in EntitiesTable
1676 *
1677 * TODO: the linear scan is really ugly, an hash table is really needed.
1678 *
1679 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1680 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001681const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001682htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001683 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001684
1685 for (i = 0;i < (sizeof(html40EntitiesTable)/
1686 sizeof(html40EntitiesTable[0]));i++) {
1687 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1688#ifdef DEBUG
1689 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1690#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001691 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001692 }
1693 }
1694 return(NULL);
1695}
1696
1697/**
1698 * htmlEntityValueLookup:
1699 * @value: the entity's unicode value
1700 *
1701 * Lookup the given entity in EntitiesTable
1702 *
1703 * TODO: the linear scan is really ugly, an hash table is really needed.
1704 *
1705 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1706 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001707const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001708htmlEntityValueLookup(unsigned int value) {
1709 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001710#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001711 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001712#endif
1713
1714 for (i = 0;i < (sizeof(html40EntitiesTable)/
1715 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001716 if (html40EntitiesTable[i].value >= value) {
1717 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001718 break;
1719#ifdef DEBUG
1720 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1721#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001722 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001723 }
1724#ifdef DEBUG
1725 if (lv > html40EntitiesTable[i].value) {
1726 xmlGenericError(xmlGenericErrorContext,
1727 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1728 lv, html40EntitiesTable[i].value);
1729 }
1730 lv = html40EntitiesTable[i].value;
1731#endif
1732 }
1733 return(NULL);
1734}
1735
1736/**
1737 * UTF8ToHtml:
1738 * @out: a pointer to an array of bytes to store the result
1739 * @outlen: the length of @out
1740 * @in: a pointer to an array of UTF-8 chars
1741 * @inlen: the length of @in
1742 *
1743 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1744 * plus HTML entities block of chars out.
1745 *
1746 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1747 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001748 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001749 * The value of @outlen after return is the number of octets consumed.
1750 */
1751int
1752UTF8ToHtml(unsigned char* out, int *outlen,
1753 const unsigned char* in, int *inlen) {
1754 const unsigned char* processed = in;
1755 const unsigned char* outend;
1756 const unsigned char* outstart = out;
1757 const unsigned char* instart = in;
1758 const unsigned char* inend;
1759 unsigned int c, d;
1760 int trailing;
1761
1762 if (in == NULL) {
1763 /*
1764 * initialization nothing to do
1765 */
1766 *outlen = 0;
1767 *inlen = 0;
1768 return(0);
1769 }
1770 inend = in + (*inlen);
1771 outend = out + (*outlen);
1772 while (in < inend) {
1773 d = *in++;
1774 if (d < 0x80) { c= d; trailing= 0; }
1775 else if (d < 0xC0) {
1776 /* trailing byte in leading position */
1777 *outlen = out - outstart;
1778 *inlen = processed - instart;
1779 return(-2);
1780 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1781 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1782 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1783 else {
1784 /* no chance for this in Ascii */
1785 *outlen = out - outstart;
1786 *inlen = processed - instart;
1787 return(-2);
1788 }
1789
1790 if (inend - in < trailing) {
1791 break;
1792 }
1793
1794 for ( ; trailing; trailing--) {
1795 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1796 break;
1797 c <<= 6;
1798 c |= d & 0x3F;
1799 }
1800
1801 /* assertion: c is a single UTF-4 value */
1802 if (c < 0x80) {
1803 if (out + 1 >= outend)
1804 break;
1805 *out++ = c;
1806 } else {
1807 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001808 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001809
1810 /*
1811 * Try to lookup a predefined HTML entity for it
1812 */
1813
1814 ent = htmlEntityValueLookup(c);
1815 if (ent == NULL) {
1816 /* no chance for this in Ascii */
1817 *outlen = out - outstart;
1818 *inlen = processed - instart;
1819 return(-2);
1820 }
1821 len = strlen(ent->name);
1822 if (out + 2 + len >= outend)
1823 break;
1824 *out++ = '&';
1825 memcpy(out, ent->name, len);
1826 out += len;
1827 *out++ = ';';
1828 }
1829 processed = in;
1830 }
1831 *outlen = out - outstart;
1832 *inlen = processed - instart;
1833 return(0);
1834}
1835
1836/**
1837 * htmlEncodeEntities:
1838 * @out: a pointer to an array of bytes to store the result
1839 * @outlen: the length of @out
1840 * @in: a pointer to an array of UTF-8 chars
1841 * @inlen: the length of @in
1842 * @quoteChar: the quote character to escape (' or ") or zero.
1843 *
1844 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1845 * plus HTML entities block of chars out.
1846 *
1847 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1848 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001849 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001850 * The value of @outlen after return is the number of octets consumed.
1851 */
1852int
1853htmlEncodeEntities(unsigned char* out, int *outlen,
1854 const unsigned char* in, int *inlen, int quoteChar) {
1855 const unsigned char* processed = in;
1856 const unsigned char* outend = out + (*outlen);
1857 const unsigned char* outstart = out;
1858 const unsigned char* instart = in;
1859 const unsigned char* inend = in + (*inlen);
1860 unsigned int c, d;
1861 int trailing;
1862
1863 while (in < inend) {
1864 d = *in++;
1865 if (d < 0x80) { c= d; trailing= 0; }
1866 else if (d < 0xC0) {
1867 /* trailing byte in leading position */
1868 *outlen = out - outstart;
1869 *inlen = processed - instart;
1870 return(-2);
1871 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1872 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1873 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1874 else {
1875 /* no chance for this in Ascii */
1876 *outlen = out - outstart;
1877 *inlen = processed - instart;
1878 return(-2);
1879 }
1880
1881 if (inend - in < trailing)
1882 break;
1883
1884 while (trailing--) {
1885 if (((d= *in++) & 0xC0) != 0x80) {
1886 *outlen = out - outstart;
1887 *inlen = processed - instart;
1888 return(-2);
1889 }
1890 c <<= 6;
1891 c |= d & 0x3F;
1892 }
1893
1894 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001895 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1896 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001897 if (out >= outend)
1898 break;
1899 *out++ = c;
1900 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001901 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001902 const char *cp;
1903 char nbuf[16];
1904 int len;
1905
1906 /*
1907 * Try to lookup a predefined HTML entity for it
1908 */
1909 ent = htmlEntityValueLookup(c);
1910 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001911 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001912 cp = nbuf;
1913 }
1914 else
1915 cp = ent->name;
1916 len = strlen(cp);
1917 if (out + 2 + len > outend)
1918 break;
1919 *out++ = '&';
1920 memcpy(out, cp, len);
1921 out += len;
1922 *out++ = ';';
1923 }
1924 processed = in;
1925 }
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(0);
1929}
1930
1931/**
1932 * htmlDecodeEntities:
1933 * @ctxt: the parser context
1934 * @len: the len to decode (in bytes !), -1 for no size limit
1935 * @end: an end marker xmlChar, 0 if none
1936 * @end2: an end marker xmlChar, 0 if none
1937 * @end3: an end marker xmlChar, 0 if none
1938 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001939 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001940 *
1941 * DEPRECATED !!!!
1942 *
1943 * Returns A newly allocated string with the substitution done. The caller
1944 * must deallocate it !
1945 */
1946xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001947htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1948 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949 static int deprecated = 0;
1950 if (!deprecated) {
1951 xmlGenericError(xmlGenericErrorContext,
1952 "htmlDecodeEntities() deprecated function reached\n");
1953 deprecated = 1;
1954 }
1955 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001956}
1957
1958/************************************************************************
1959 * *
1960 * Commodity functions to handle streams *
1961 * *
1962 ************************************************************************/
1963
1964/**
Owen Taylor3473f882001-02-23 17:55:21 +00001965 * htmlNewInputStream:
1966 * @ctxt: an HTML parser context
1967 *
1968 * Create a new input stream structure
1969 * Returns the new input stream or NULL
1970 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001971static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001972htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1973 htmlParserInputPtr input;
1974
1975 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1976 if (input == NULL) {
1977 ctxt->errNo = XML_ERR_NO_MEMORY;
1978 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1979 ctxt->sax->error(ctxt->userData,
1980 "malloc: couldn't allocate a new input stream\n");
1981 return(NULL);
1982 }
1983 memset(input, 0, sizeof(htmlParserInput));
1984 input->filename = NULL;
1985 input->directory = NULL;
1986 input->base = NULL;
1987 input->cur = NULL;
1988 input->buf = NULL;
1989 input->line = 1;
1990 input->col = 1;
1991 input->buf = NULL;
1992 input->free = NULL;
1993 input->version = NULL;
1994 input->consumed = 0;
1995 input->length = 0;
1996 return(input);
1997}
1998
1999
2000/************************************************************************
2001 * *
2002 * Commodity functions, cleanup needed ? *
2003 * *
2004 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is read-only: both the strings and the array of pointers
 * are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002019
2020/**
2021 * areBlanks:
2022 * @ctxt: an HTML parser context
2023 * @str: a xmlChar *
2024 * @len: the size of @str
2025 *
2026 * Is this a sequence of blank chars that one can ignore ?
2027 *
2028 * Returns 1 if ignorable 0 otherwise.
2029 */
2030
2031static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002032 unsigned int i;
2033 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002034 xmlNodePtr lastChild;
2035
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002036 for (j = 0;j < len;j++)
2037 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002038
2039 if (CUR == 0) return(1);
2040 if (CUR != '<') return(0);
2041 if (ctxt->name == NULL)
2042 return(1);
2043 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2044 return(1);
2045 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2046 return(1);
2047 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2048 return(1);
2049 if (ctxt->node == NULL) return(0);
2050 lastChild = xmlGetLastChild(ctxt->node);
2051 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002052 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2053 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002054 /* keep ws in constructs like ...<b> </b>...
2055 for all tags "b" allowing PCDATA */
2056 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2057 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2058 return(0);
2059 }
2060 }
Owen Taylor3473f882001-02-23 17:55:21 +00002061 } else if (xmlNodeIsText(lastChild)) {
2062 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002063 } else {
2064 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2065 for all tags "p" allowing PCDATA */
2066 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2067 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2068 return(0);
2069 }
2070 }
Owen Taylor3473f882001-02-23 17:55:21 +00002071 }
2072 return(1);
2073}
2074
2075/**
Owen Taylor3473f882001-02-23 17:55:21 +00002076 * htmlNewDocNoDtD:
2077 * @URI: URI for the dtd, or NULL
2078 * @ExternalID: the external ID of the DTD, or NULL
2079 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002080 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2081 * are NULL
2082 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002083 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002084 */
2085htmlDocPtr
2086htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2087 xmlDocPtr cur;
2088
2089 /*
2090 * Allocate a new document and fill the fields.
2091 */
2092 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2093 if (cur == NULL) {
2094 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002095 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002096 return(NULL);
2097 }
2098 memset(cur, 0, sizeof(xmlDoc));
2099
2100 cur->type = XML_HTML_DOCUMENT_NODE;
2101 cur->version = NULL;
2102 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002103 cur->doc = cur;
2104 cur->name = NULL;
2105 cur->children = NULL;
2106 cur->extSubset = NULL;
2107 cur->oldNs = NULL;
2108 cur->encoding = NULL;
2109 cur->standalone = 1;
2110 cur->compression = 0;
2111 cur->ids = NULL;
2112 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002113 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002114 if ((ExternalID != NULL) ||
2115 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00002116 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002117 return(cur);
2118}
2119
2120/**
2121 * htmlNewDoc:
2122 * @URI: URI for the dtd, or NULL
2123 * @ExternalID: the external ID of the DTD, or NULL
2124 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002125 * Creates a new HTML document
2126 *
Owen Taylor3473f882001-02-23 17:55:21 +00002127 * Returns a new document
2128 */
2129htmlDocPtr
2130htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2131 if ((URI == NULL) && (ExternalID == NULL))
2132 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002133 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2134 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002135
2136 return(htmlNewDocNoDtD(URI, ExternalID));
2137}
2138
2139
2140/************************************************************************
2141 * *
2142 * The parser itself *
2143 * Relates to http://www.w3.org/TR/html40 *
2144 * *
2145 ************************************************************************/
2146
2147/************************************************************************
2148 * *
2149 * The parser itself *
2150 * *
2151 ************************************************************************/
2152
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002153static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2154
Owen Taylor3473f882001-02-23 17:55:21 +00002155/**
2156 * htmlParseHTMLName:
2157 * @ctxt: an HTML parser context
2158 *
2159 * parse an HTML tag or attribute name, note that we convert it to lowercase
2160 * since HTML names are not case-sensitive.
2161 *
2162 * Returns the Tag Name parsed or NULL
2163 */
2164
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002165static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002166htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2167 xmlChar *ret = NULL;
2168 int i = 0;
2169 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2170
2171 if (!IS_LETTER(CUR) && (CUR != '_') &&
2172 (CUR != ':')) return(NULL);
2173
2174 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2175 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2176 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2177 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2178 else loc[i] = CUR;
2179 i++;
2180
2181 NEXT;
2182 }
2183
2184 ret = xmlStrndup(loc, i);
2185
2186 return(ret);
2187}
2188
2189/**
2190 * htmlParseName:
2191 * @ctxt: an HTML parser context
2192 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002193 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002194 *
2195 * Returns the Name parsed or NULL
2196 */
2197
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002198static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002199htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002200 const xmlChar *in;
2201 xmlChar *ret;
2202 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002203
2204 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002205
2206 /*
2207 * Accelerator for simple ASCII names
2208 */
2209 in = ctxt->input->cur;
2210 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2211 ((*in >= 0x41) && (*in <= 0x5A)) ||
2212 (*in == '_') || (*in == ':')) {
2213 in++;
2214 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2215 ((*in >= 0x41) && (*in <= 0x5A)) ||
2216 ((*in >= 0x30) && (*in <= 0x39)) ||
2217 (*in == '_') || (*in == '-') ||
2218 (*in == ':') || (*in == '.'))
2219 in++;
2220 if ((*in > 0) && (*in < 0x80)) {
2221 count = in - ctxt->input->cur;
2222 ret = xmlStrndup(ctxt->input->cur, count);
2223 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002224 ctxt->nbChars += count;
2225 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002226 return(ret);
2227 }
2228 }
2229 return(htmlParseNameComplex(ctxt));
2230}
2231
2232static xmlChar *
2233htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2234 xmlChar buf[XML_MAX_NAMELEN + 5];
2235 int len = 0, l;
2236 int c;
2237 int count = 0;
2238
2239 /*
2240 * Handler for more complex cases
2241 */
2242 GROW;
2243 c = CUR_CHAR(l);
2244 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2245 (!IS_LETTER(c) && (c != '_') &&
2246 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002247 return(NULL);
2248 }
2249
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002250 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2251 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2252 (c == '.') || (c == '-') ||
2253 (c == '_') || (c == ':') ||
2254 (IS_COMBINING(c)) ||
2255 (IS_EXTENDER(c)))) {
2256 if (count++ > 100) {
2257 count = 0;
2258 GROW;
2259 }
2260 COPY_BUF(l,buf,len,c);
2261 NEXTL(l);
2262 c = CUR_CHAR(l);
2263 if (len >= XML_MAX_NAMELEN) {
2264 /*
2265 * Okay someone managed to make a huge name, so he's ready to pay
2266 * for the processing speed.
2267 */
2268 xmlChar *buffer;
2269 int max = len * 2;
2270
2271 buffer = (xmlChar *) xmlMalloc(max * sizeof(xmlChar));
2272 if (buffer == NULL) {
2273 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2274 ctxt->sax->error(ctxt->userData,
2275 "htmlParseNameComplex: out of memory\n");
2276 return(NULL);
2277 }
2278 memcpy(buffer, buf, len);
2279 while ((IS_LETTER(c)) || (IS_DIGIT(c)) || /* test bigname.xml */
2280 (c == '.') || (c == '-') ||
2281 (c == '_') || (c == ':') ||
2282 (IS_COMBINING(c)) ||
2283 (IS_EXTENDER(c))) {
2284 if (count++ > 100) {
2285 count = 0;
2286 GROW;
2287 }
2288 if (len + 10 > max) {
2289 max *= 2;
2290 buffer = (xmlChar *) xmlRealloc(buffer,
2291 max * sizeof(xmlChar));
2292 if (buffer == NULL) {
2293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2294 ctxt->sax->error(ctxt->userData,
2295 "htmlParseNameComplex: out of memory\n");
2296 return(NULL);
2297 }
2298 }
2299 COPY_BUF(l,buffer,len,c);
2300 NEXTL(l);
2301 c = CUR_CHAR(l);
2302 }
2303 buffer[len] = 0;
2304 return(buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00002305 }
2306 }
2307 return(xmlStrndup(buf, len));
2308}
2309
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002310
Owen Taylor3473f882001-02-23 17:55:21 +00002311/**
2312 * htmlParseHTMLAttribute:
2313 * @ctxt: an HTML parser context
2314 * @stop: a char stop value
2315 *
2316 * parse an HTML attribute value till the stop (quote), if
2317 * stop is 0 then it stops at the first space
2318 *
2319 * Returns the attribute parsed or NULL
2320 */
2321
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002322static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002323htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2324 xmlChar *buffer = NULL;
2325 int buffer_size = 0;
2326 xmlChar *out = NULL;
2327 xmlChar *name = NULL;
2328
2329 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002330 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002331
2332 /*
2333 * allocate a translation buffer.
2334 */
2335 buffer_size = HTML_PARSER_BUFFER_SIZE;
2336 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2337 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002338 xmlGenericError(xmlGenericErrorContext,
2339 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002340 return(NULL);
2341 }
2342 out = buffer;
2343
2344 /*
2345 * Ok loop until we reach one of the ending chars
2346 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002347 while ((CUR != 0) && (CUR != stop)) {
2348 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002349 if ((stop == 0) && (IS_BLANK(CUR))) break;
2350 if (CUR == '&') {
2351 if (NXT(1) == '#') {
2352 unsigned int c;
2353 int bits;
2354
2355 c = htmlParseCharRef(ctxt);
2356 if (c < 0x80)
2357 { *out++ = c; bits= -6; }
2358 else if (c < 0x800)
2359 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2360 else if (c < 0x10000)
2361 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2362 else
2363 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2364
2365 for ( ; bits >= 0; bits-= 6) {
2366 *out++ = ((c >> bits) & 0x3F) | 0x80;
2367 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002368
2369 if (out - buffer > buffer_size - 100) {
2370 int indx = out - buffer;
2371
2372 growBuffer(buffer);
2373 out = &buffer[indx];
2374 }
Owen Taylor3473f882001-02-23 17:55:21 +00002375 } else {
2376 ent = htmlParseEntityRef(ctxt, &name);
2377 if (name == NULL) {
2378 *out++ = '&';
2379 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002380 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002381
2382 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002383 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002384 }
2385 } else if (ent == NULL) {
2386 *out++ = '&';
2387 cur = name;
2388 while (*cur != 0) {
2389 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002390 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002391
2392 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002393 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002394 }
2395 *out++ = *cur++;
2396 }
2397 xmlFree(name);
2398 } else {
2399 unsigned int c;
2400 int bits;
2401
2402 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002403 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002404
2405 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002406 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002407 }
2408 c = (xmlChar)ent->value;
2409 if (c < 0x80)
2410 { *out++ = c; bits= -6; }
2411 else if (c < 0x800)
2412 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2413 else if (c < 0x10000)
2414 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2415 else
2416 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2417
2418 for ( ; bits >= 0; bits-= 6) {
2419 *out++ = ((c >> bits) & 0x3F) | 0x80;
2420 }
2421 xmlFree(name);
2422 }
2423 }
2424 } else {
2425 unsigned int c;
2426 int bits, l;
2427
2428 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002429 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002430
2431 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002432 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002433 }
2434 c = CUR_CHAR(l);
2435 if (c < 0x80)
2436 { *out++ = c; bits= -6; }
2437 else if (c < 0x800)
2438 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2439 else if (c < 0x10000)
2440 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2441 else
2442 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2443
2444 for ( ; bits >= 0; bits-= 6) {
2445 *out++ = ((c >> bits) & 0x3F) | 0x80;
2446 }
2447 NEXT;
2448 }
2449 }
2450 *out++ = 0;
2451 return(buffer);
2452}
2453
2454/**
Owen Taylor3473f882001-02-23 17:55:21 +00002455 * htmlParseEntityRef:
2456 * @ctxt: an HTML parser context
2457 * @str: location to store the entity name
2458 *
2459 * parse an HTML ENTITY references
2460 *
2461 * [68] EntityRef ::= '&' Name ';'
2462 *
2463 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2464 * if non-NULL *str will have to be freed by the caller.
2465 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002466const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002467htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2468 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002469 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002470 *str = NULL;
2471
2472 if (CUR == '&') {
2473 NEXT;
2474 name = htmlParseName(ctxt);
2475 if (name == NULL) {
2476 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2477 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2478 ctxt->wellFormed = 0;
2479 } else {
2480 GROW;
2481 if (CUR == ';') {
2482 *str = name;
2483
2484 /*
2485 * Lookup the entity in the table.
2486 */
2487 ent = htmlEntityLookup(name);
2488 if (ent != NULL) /* OK that's ugly !!! */
2489 NEXT;
2490 } else {
2491 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2492 ctxt->sax->error(ctxt->userData,
2493 "htmlParseEntityRef: expecting ';'\n");
2494 *str = name;
2495 }
2496 }
2497 }
2498 return(ent);
2499}
2500
2501/**
2502 * htmlParseAttValue:
2503 * @ctxt: an HTML parser context
2504 *
2505 * parse a value for an attribute
2506 * Note: the parser won't do substitution of entities here, this
2507 * will be handled later in xmlStringGetNodeList, unless it was
2508 * asked for ctxt->replaceEntities != 0
2509 *
2510 * Returns the AttValue parsed or NULL.
2511 */
2512
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002513static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002514htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2515 xmlChar *ret = NULL;
2516
2517 if (CUR == '"') {
2518 NEXT;
2519 ret = htmlParseHTMLAttribute(ctxt, '"');
2520 if (CUR != '"') {
2521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2522 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2523 ctxt->wellFormed = 0;
2524 } else
2525 NEXT;
2526 } else if (CUR == '\'') {
2527 NEXT;
2528 ret = htmlParseHTMLAttribute(ctxt, '\'');
2529 if (CUR != '\'') {
2530 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2531 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2532 ctxt->wellFormed = 0;
2533 } else
2534 NEXT;
2535 } else {
2536 /*
2537 * That's an HTMLism, the attribute value may not be quoted
2538 */
2539 ret = htmlParseHTMLAttribute(ctxt, 0);
2540 if (ret == NULL) {
2541 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2542 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2543 ctxt->wellFormed = 0;
2544 }
2545 }
2546 return(ret);
2547}
2548
2549/**
2550 * htmlParseSystemLiteral:
2551 * @ctxt: an HTML parser context
2552 *
2553 * parse an HTML Literal
2554 *
2555 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2556 *
2557 * Returns the SystemLiteral parsed or NULL
2558 */
2559
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002560static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002561htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2562 const xmlChar *q;
2563 xmlChar *ret = NULL;
2564
2565 if (CUR == '"') {
2566 NEXT;
2567 q = CUR_PTR;
2568 while ((IS_CHAR(CUR)) && (CUR != '"'))
2569 NEXT;
2570 if (!IS_CHAR(CUR)) {
2571 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2572 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2573 ctxt->wellFormed = 0;
2574 } else {
2575 ret = xmlStrndup(q, CUR_PTR - q);
2576 NEXT;
2577 }
2578 } else if (CUR == '\'') {
2579 NEXT;
2580 q = CUR_PTR;
2581 while ((IS_CHAR(CUR)) && (CUR != '\''))
2582 NEXT;
2583 if (!IS_CHAR(CUR)) {
2584 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2585 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2586 ctxt->wellFormed = 0;
2587 } else {
2588 ret = xmlStrndup(q, CUR_PTR - q);
2589 NEXT;
2590 }
2591 } else {
2592 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2593 ctxt->sax->error(ctxt->userData,
2594 "SystemLiteral \" or ' expected\n");
2595 ctxt->wellFormed = 0;
2596 }
2597
2598 return(ret);
2599}
2600
2601/**
2602 * htmlParsePubidLiteral:
2603 * @ctxt: an HTML parser context
2604 *
2605 * parse an HTML public literal
2606 *
2607 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2608 *
2609 * Returns the PubidLiteral parsed or NULL.
2610 */
2611
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002612static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002613htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2614 const xmlChar *q;
2615 xmlChar *ret = NULL;
2616 /*
2617 * Name ::= (Letter | '_') (NameChar)*
2618 */
2619 if (CUR == '"') {
2620 NEXT;
2621 q = CUR_PTR;
2622 while (IS_PUBIDCHAR(CUR)) NEXT;
2623 if (CUR != '"') {
2624 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2625 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2626 ctxt->wellFormed = 0;
2627 } else {
2628 ret = xmlStrndup(q, CUR_PTR - q);
2629 NEXT;
2630 }
2631 } else if (CUR == '\'') {
2632 NEXT;
2633 q = CUR_PTR;
Daniel Veillard6560a422003-03-27 21:25:38 +00002634 while ((IS_PUBIDCHAR(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002635 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002636 if (CUR != '\'') {
Owen Taylor3473f882001-02-23 17:55:21 +00002637 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2638 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2639 ctxt->wellFormed = 0;
2640 } else {
2641 ret = xmlStrndup(q, CUR_PTR - q);
2642 NEXT;
2643 }
2644 } else {
2645 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2646 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2647 ctxt->wellFormed = 0;
2648 }
2649
2650 return(ret);
2651}
2652
2653/**
2654 * htmlParseScript:
2655 * @ctxt: an HTML parser context
2656 *
2657 * parse the content of an HTML SCRIPT or STYLE element
2658 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2659 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2660 * http://www.w3.org/TR/html4/types.html#type-script
2661 * http://www.w3.org/TR/html4/types.html#h-6.15
2662 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2663 *
2664 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2665 * element and the value of intrinsic event attributes. User agents must
2666 * not evaluate script data as HTML markup but instead must pass it on as
2667 * data to a script engine.
2668 * NOTES:
2669 * - The content is passed like CDATA
2670 * - the attributes for style and scripting "onXXX" are also described
2671 * as CDATA but SGML allows entities references in attributes so their
2672 * processing is identical as other attributes
2673 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002674static void
Owen Taylor3473f882001-02-23 17:55:21 +00002675htmlParseScript(htmlParserCtxtPtr ctxt) {
2676 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2677 int nbchar = 0;
2678 xmlChar cur;
2679
2680 SHRINK;
2681 cur = CUR;
2682 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002683 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2684 (NXT(3) == '-')) {
2685 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2686 if (ctxt->sax->cdataBlock!= NULL) {
2687 /*
2688 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2689 */
2690 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2691 }
2692 }
2693 nbchar = 0;
2694 htmlParseComment(ctxt);
2695 cur = CUR;
2696 continue;
2697 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002698 /*
2699 * One should break here, the specification is clear:
2700 * Authors should therefore escape "</" within the content.
2701 * Escape mechanisms are specific to each scripting or
2702 * style sheet language.
2703 */
2704 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2705 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2706 break; /* while */
2707 }
2708 buf[nbchar++] = cur;
2709 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2710 if (ctxt->sax->cdataBlock!= NULL) {
2711 /*
2712 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2713 */
2714 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2715 }
2716 nbchar = 0;
2717 }
2718 NEXT;
2719 cur = CUR;
2720 }
2721 if (!(IS_CHAR(cur))) {
2722 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2723 ctxt->sax->error(ctxt->userData,
2724 "Invalid char in CDATA 0x%X\n", cur);
2725 ctxt->wellFormed = 0;
2726 NEXT;
2727 }
2728
2729 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2730 if (ctxt->sax->cdataBlock!= NULL) {
2731 /*
2732 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2733 */
2734 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2735 }
2736 }
2737}
2738
2739
2740/**
2741 * htmlParseCharData:
2742 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002743 *
2744 * parse a CharData section.
2745 * if we are within a CDATA section ']]>' marks an end of section.
2746 *
2747 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2748 */
2749
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002750static void
2751htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002752 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2753 int nbchar = 0;
2754 int cur, l;
2755
2756 SHRINK;
2757 cur = CUR_CHAR(l);
2758 while (((cur != '<') || (ctxt->token == '<')) &&
2759 ((cur != '&') || (ctxt->token == '&')) &&
2760 (IS_CHAR(cur))) {
2761 COPY_BUF(l,buf,nbchar,cur);
2762 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2763 /*
2764 * Ok the segment is to be consumed as chars.
2765 */
2766 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2767 if (areBlanks(ctxt, buf, nbchar)) {
2768 if (ctxt->sax->ignorableWhitespace != NULL)
2769 ctxt->sax->ignorableWhitespace(ctxt->userData,
2770 buf, nbchar);
2771 } else {
2772 htmlCheckParagraph(ctxt);
2773 if (ctxt->sax->characters != NULL)
2774 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2775 }
2776 }
2777 nbchar = 0;
2778 }
2779 NEXTL(l);
2780 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002781 if (cur == 0) {
2782 SHRINK;
2783 GROW;
2784 cur = CUR_CHAR(l);
2785 }
Owen Taylor3473f882001-02-23 17:55:21 +00002786 }
2787 if (nbchar != 0) {
2788 /*
2789 * Ok the segment is to be consumed as chars.
2790 */
2791 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2792 if (areBlanks(ctxt, buf, nbchar)) {
2793 if (ctxt->sax->ignorableWhitespace != NULL)
2794 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2795 } else {
2796 htmlCheckParagraph(ctxt);
2797 if (ctxt->sax->characters != NULL)
2798 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2799 }
2800 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002801 } else {
2802 /*
2803 * Loop detection
2804 */
2805 if (cur == 0)
2806 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002807 }
2808}
2809
2810/**
2811 * htmlParseExternalID:
2812 * @ctxt: an HTML parser context
2813 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002814 *
2815 * Parse an External ID or a Public ID
2816 *
Owen Taylor3473f882001-02-23 17:55:21 +00002817 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2818 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2819 *
2820 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2821 *
2822 * Returns the function returns SystemLiteral and in the second
2823 * case publicID receives PubidLiteral, is strict is off
2824 * it is possible to return NULL and have publicID set.
2825 */
2826
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002827static xmlChar *
2828htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002829 xmlChar *URI = NULL;
2830
2831 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2832 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2833 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2834 SKIP(6);
2835 if (!IS_BLANK(CUR)) {
2836 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2837 ctxt->sax->error(ctxt->userData,
2838 "Space required after 'SYSTEM'\n");
2839 ctxt->wellFormed = 0;
2840 }
2841 SKIP_BLANKS;
2842 URI = htmlParseSystemLiteral(ctxt);
2843 if (URI == NULL) {
2844 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2845 ctxt->sax->error(ctxt->userData,
2846 "htmlParseExternalID: SYSTEM, no URI\n");
2847 ctxt->wellFormed = 0;
2848 }
2849 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2850 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2851 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2852 SKIP(6);
2853 if (!IS_BLANK(CUR)) {
2854 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2855 ctxt->sax->error(ctxt->userData,
2856 "Space required after 'PUBLIC'\n");
2857 ctxt->wellFormed = 0;
2858 }
2859 SKIP_BLANKS;
2860 *publicID = htmlParsePubidLiteral(ctxt);
2861 if (*publicID == NULL) {
2862 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2863 ctxt->sax->error(ctxt->userData,
2864 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2865 ctxt->wellFormed = 0;
2866 }
2867 SKIP_BLANKS;
2868 if ((CUR == '"') || (CUR == '\'')) {
2869 URI = htmlParseSystemLiteral(ctxt);
2870 }
2871 }
2872 return(URI);
2873}
2874
2875/**
2876 * htmlParseComment:
2877 * @ctxt: an HTML parser context
2878 *
2879 * Parse an XML (SGML) comment <!-- .... -->
2880 *
2881 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2882 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002883static void
Owen Taylor3473f882001-02-23 17:55:21 +00002884htmlParseComment(htmlParserCtxtPtr ctxt) {
2885 xmlChar *buf = NULL;
2886 int len;
2887 int size = HTML_PARSER_BUFFER_SIZE;
2888 int q, ql;
2889 int r, rl;
2890 int cur, l;
2891 xmlParserInputState state;
2892
2893 /*
2894 * Check that there is a comment right here.
2895 */
2896 if ((RAW != '<') || (NXT(1) != '!') ||
2897 (NXT(2) != '-') || (NXT(3) != '-')) return;
2898
2899 state = ctxt->instate;
2900 ctxt->instate = XML_PARSER_COMMENT;
2901 SHRINK;
2902 SKIP(4);
2903 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2904 if (buf == NULL) {
2905 xmlGenericError(xmlGenericErrorContext,
2906 "malloc of %d byte failed\n", size);
2907 ctxt->instate = state;
2908 return;
2909 }
2910 q = CUR_CHAR(ql);
2911 NEXTL(ql);
2912 r = CUR_CHAR(rl);
2913 NEXTL(rl);
2914 cur = CUR_CHAR(l);
2915 len = 0;
2916 while (IS_CHAR(cur) &&
2917 ((cur != '>') ||
2918 (r != '-') || (q != '-'))) {
2919 if (len + 5 >= size) {
2920 size *= 2;
2921 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2922 if (buf == NULL) {
2923 xmlGenericError(xmlGenericErrorContext,
2924 "realloc of %d byte failed\n", size);
2925 ctxt->instate = state;
2926 return;
2927 }
2928 }
2929 COPY_BUF(ql,buf,len,q);
2930 q = r;
2931 ql = rl;
2932 r = cur;
2933 rl = l;
2934 NEXTL(l);
2935 cur = CUR_CHAR(l);
2936 if (cur == 0) {
2937 SHRINK;
2938 GROW;
2939 cur = CUR_CHAR(l);
2940 }
2941 }
2942 buf[len] = 0;
2943 if (!IS_CHAR(cur)) {
2944 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2945 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2946 ctxt->sax->error(ctxt->userData,
2947 "Comment not terminated \n<!--%.50s\n", buf);
2948 ctxt->wellFormed = 0;
2949 xmlFree(buf);
2950 } else {
2951 NEXT;
2952 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2953 (!ctxt->disableSAX))
2954 ctxt->sax->comment(ctxt->userData, buf);
2955 xmlFree(buf);
2956 }
2957 ctxt->instate = state;
2958}
2959
2960/**
2961 * htmlParseCharRef:
2962 * @ctxt: an HTML parser context
2963 *
2964 * parse Reference declarations
2965 *
2966 * [66] CharRef ::= '&#' [0-9]+ ';' |
2967 * '&#x' [0-9a-fA-F]+ ';'
2968 *
2969 * Returns the value parsed (as an int)
2970 */
2971int
2972htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2973 int val = 0;
2974
2975 if ((CUR == '&') && (NXT(1) == '#') &&
2976 (NXT(2) == 'x')) {
2977 SKIP(3);
2978 while (CUR != ';') {
2979 if ((CUR >= '0') && (CUR <= '9'))
2980 val = val * 16 + (CUR - '0');
2981 else if ((CUR >= 'a') && (CUR <= 'f'))
2982 val = val * 16 + (CUR - 'a') + 10;
2983 else if ((CUR >= 'A') && (CUR <= 'F'))
2984 val = val * 16 + (CUR - 'A') + 10;
2985 else {
2986 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2987 ctxt->sax->error(ctxt->userData,
2988 "htmlParseCharRef: invalid hexadecimal value\n");
2989 ctxt->wellFormed = 0;
2990 return(0);
2991 }
2992 NEXT;
2993 }
2994 if (CUR == ';')
2995 NEXT;
2996 } else if ((CUR == '&') && (NXT(1) == '#')) {
2997 SKIP(2);
2998 while (CUR != ';') {
2999 if ((CUR >= '0') && (CUR <= '9'))
3000 val = val * 10 + (CUR - '0');
3001 else {
3002 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3003 ctxt->sax->error(ctxt->userData,
3004 "htmlParseCharRef: invalid decimal value\n");
3005 ctxt->wellFormed = 0;
3006 return(0);
3007 }
3008 NEXT;
3009 }
3010 if (CUR == ';')
3011 NEXT;
3012 } else {
3013 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3014 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
3015 ctxt->wellFormed = 0;
3016 }
3017 /*
3018 * Check the value IS_CHAR ...
3019 */
3020 if (IS_CHAR(val)) {
3021 return(val);
3022 } else {
3023 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3024 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
3025 val);
3026 ctxt->wellFormed = 0;
3027 }
3028 return(0);
3029}
3030
3031
3032/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003033 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003034 * @ctxt: an HTML parser context
3035 *
3036 * parse a DOCTYPE declaration
3037 *
3038 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3039 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3040 */
3041
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003042static void
Owen Taylor3473f882001-02-23 17:55:21 +00003043htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3044 xmlChar *name;
3045 xmlChar *ExternalID = NULL;
3046 xmlChar *URI = NULL;
3047
3048 /*
3049 * We know that '<!DOCTYPE' has been detected.
3050 */
3051 SKIP(9);
3052
3053 SKIP_BLANKS;
3054
3055 /*
3056 * Parse the DOCTYPE name.
3057 */
3058 name = htmlParseName(ctxt);
3059 if (name == NULL) {
3060 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3061 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
3062 ctxt->wellFormed = 0;
3063 }
3064 /*
3065 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3066 */
3067
3068 SKIP_BLANKS;
3069
3070 /*
3071 * Check for SystemID and ExternalID
3072 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003073 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003074 SKIP_BLANKS;
3075
3076 /*
3077 * We should be at the end of the DOCTYPE declaration.
3078 */
3079 if (CUR != '>') {
3080 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00003081 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003082 ctxt->wellFormed = 0;
3083 /* We shouldn't try to resynchronize ... */
3084 }
3085 NEXT;
3086
3087 /*
3088 * Create or update the document accordingly to the DOCTYPE
3089 */
3090 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3091 (!ctxt->disableSAX))
3092 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3093
3094 /*
3095 * Cleanup, since we don't use all those identifiers
3096 */
3097 if (URI != NULL) xmlFree(URI);
3098 if (ExternalID != NULL) xmlFree(ExternalID);
3099 if (name != NULL) xmlFree(name);
3100}
3101
3102/**
3103 * htmlParseAttribute:
3104 * @ctxt: an HTML parser context
3105 * @value: a xmlChar ** used to store the value of the attribute
3106 *
3107 * parse an attribute
3108 *
3109 * [41] Attribute ::= Name Eq AttValue
3110 *
3111 * [25] Eq ::= S? '=' S?
3112 *
3113 * With namespace:
3114 *
3115 * [NS 11] Attribute ::= QName Eq AttValue
3116 *
3117 * Also the case QName == xmlns:??? is handled independently as a namespace
3118 * definition.
3119 *
3120 * Returns the attribute name, and the value in *value.
3121 */
3122
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003123static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003124htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3125 xmlChar *name, *val = NULL;
3126
3127 *value = NULL;
3128 name = htmlParseHTMLName(ctxt);
3129 if (name == NULL) {
3130 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3131 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3132 ctxt->wellFormed = 0;
3133 return(NULL);
3134 }
3135
3136 /*
3137 * read the value
3138 */
3139 SKIP_BLANKS;
3140 if (CUR == '=') {
3141 NEXT;
3142 SKIP_BLANKS;
3143 val = htmlParseAttValue(ctxt);
3144 /******
3145 } else {
3146 * TODO : some attribute must have values, some may not
3147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3148 ctxt->sax->warning(ctxt->userData,
3149 "No value for attribute %s\n", name); */
3150 }
3151
3152 *value = val;
3153 return(name);
3154}
3155
3156/**
3157 * htmlCheckEncoding:
3158 * @ctxt: an HTML parser context
3159 * @attvalue: the attribute value
3160 *
3161 * Checks an http-equiv attribute from a Meta tag to detect
3162 * the encoding
3163 * If a new encoding is detected the parser is switched to decode
3164 * it and pass UTF8
3165 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003166static void
Owen Taylor3473f882001-02-23 17:55:21 +00003167htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3168 const xmlChar *encoding;
3169
3170 if ((ctxt == NULL) || (attvalue == NULL))
3171 return;
3172
3173 /* do not change encoding */
3174 if (ctxt->input->encoding != NULL)
3175 return;
3176
3177 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3178 if (encoding != NULL) {
3179 encoding += 8;
3180 } else {
3181 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3182 if (encoding != NULL)
3183 encoding += 9;
3184 }
3185 if (encoding != NULL) {
3186 xmlCharEncoding enc;
3187 xmlCharEncodingHandlerPtr handler;
3188
3189 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3190
3191 if (ctxt->input->encoding != NULL)
3192 xmlFree((xmlChar *) ctxt->input->encoding);
3193 ctxt->input->encoding = xmlStrdup(encoding);
3194
3195 enc = xmlParseCharEncoding((const char *) encoding);
3196 /*
3197 * registered set of known encodings
3198 */
3199 if (enc != XML_CHAR_ENCODING_ERROR) {
3200 xmlSwitchEncoding(ctxt, enc);
3201 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3202 } else {
3203 /*
3204 * fallback for unknown encodings
3205 */
3206 handler = xmlFindCharEncodingHandler((const char *) encoding);
3207 if (handler != NULL) {
3208 xmlSwitchToEncoding(ctxt, handler);
3209 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3210 } else {
3211 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3212 }
3213 }
3214
3215 if ((ctxt->input->buf != NULL) &&
3216 (ctxt->input->buf->encoder != NULL) &&
3217 (ctxt->input->buf->raw != NULL) &&
3218 (ctxt->input->buf->buffer != NULL)) {
3219 int nbchars;
3220 int processed;
3221
3222 /*
3223 * convert as much as possible to the parser reading buffer.
3224 */
3225 processed = ctxt->input->cur - ctxt->input->base;
3226 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3227 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3228 ctxt->input->buf->buffer,
3229 ctxt->input->buf->raw);
3230 if (nbchars < 0) {
3231 ctxt->errNo = XML_ERR_INVALID_ENCODING;
3232 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3233 ctxt->sax->error(ctxt->userData,
3234 "htmlCheckEncoding: encoder error\n");
3235 }
3236 ctxt->input->base =
3237 ctxt->input->cur = ctxt->input->buf->buffer->content;
3238 }
3239 }
3240}
3241
3242/**
3243 * htmlCheckMeta:
3244 * @ctxt: an HTML parser context
3245 * @atts: the attributes values
3246 *
3247 * Checks an attributes from a Meta tag
3248 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003249static void
Owen Taylor3473f882001-02-23 17:55:21 +00003250htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3251 int i;
3252 const xmlChar *att, *value;
3253 int http = 0;
3254 const xmlChar *content = NULL;
3255
3256 if ((ctxt == NULL) || (atts == NULL))
3257 return;
3258
3259 i = 0;
3260 att = atts[i++];
3261 while (att != NULL) {
3262 value = atts[i++];
3263 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3264 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3265 http = 1;
3266 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3267 content = value;
3268 att = atts[i++];
3269 }
3270 if ((http) && (content != NULL))
3271 htmlCheckEncoding(ctxt, content);
3272
3273}
3274
3275/**
3276 * htmlParseStartTag:
3277 * @ctxt: an HTML parser context
3278 *
3279 * parse a start of tag either for rule element or
3280 * EmptyElement. In both case we don't parse the tag closing chars.
3281 *
3282 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3283 *
3284 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3285 *
3286 * With namespace:
3287 *
3288 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3289 *
3290 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3291 *
3292 */
3293
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003294static void
Owen Taylor3473f882001-02-23 17:55:21 +00003295htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3296 xmlChar *name;
3297 xmlChar *attname;
3298 xmlChar *attvalue;
3299 const xmlChar **atts = NULL;
3300 int nbatts = 0;
3301 int maxatts = 0;
3302 int meta = 0;
3303 int i;
3304
3305 if (CUR != '<') return;
3306 NEXT;
3307
3308 GROW;
3309 name = htmlParseHTMLName(ctxt);
3310 if (name == NULL) {
3311 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3312 ctxt->sax->error(ctxt->userData,
3313 "htmlParseStartTag: invalid element name\n");
3314 ctxt->wellFormed = 0;
3315 /* Dump the bogus tag like browsers do */
3316 while ((IS_CHAR(CUR)) && (CUR != '>'))
3317 NEXT;
3318 return;
3319 }
3320 if (xmlStrEqual(name, BAD_CAST"meta"))
3321 meta = 1;
3322
3323 /*
3324 * Check for auto-closure of HTML elements.
3325 */
3326 htmlAutoClose(ctxt, name);
3327
3328 /*
3329 * Check for implied HTML elements.
3330 */
3331 htmlCheckImplied(ctxt, name);
3332
3333 /*
3334 * Avoid html at any level > 0, head at any level != 1
3335 * or any attempt to recurse body
3336 */
3337 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3338 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3339 ctxt->sax->error(ctxt->userData,
3340 "htmlParseStartTag: misplaced <html> tag\n");
3341 ctxt->wellFormed = 0;
3342 xmlFree(name);
3343 return;
3344 }
3345 if ((ctxt->nameNr != 1) &&
3346 (xmlStrEqual(name, BAD_CAST"head"))) {
3347 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3348 ctxt->sax->error(ctxt->userData,
3349 "htmlParseStartTag: misplaced <head> tag\n");
3350 ctxt->wellFormed = 0;
3351 xmlFree(name);
3352 return;
3353 }
3354 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003355 int indx;
3356 for (indx = 0;indx < ctxt->nameNr;indx++) {
3357 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003358 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3359 ctxt->sax->error(ctxt->userData,
3360 "htmlParseStartTag: misplaced <body> tag\n");
3361 ctxt->wellFormed = 0;
3362 xmlFree(name);
3363 return;
3364 }
3365 }
3366 }
3367
3368 /*
3369 * Now parse the attributes, it ends up with the ending
3370 *
3371 * (S Attribute)* S?
3372 */
3373 SKIP_BLANKS;
3374 while ((IS_CHAR(CUR)) &&
3375 (CUR != '>') &&
3376 ((CUR != '/') || (NXT(1) != '>'))) {
3377 long cons = ctxt->nbChars;
3378
3379 GROW;
3380 attname = htmlParseAttribute(ctxt, &attvalue);
3381 if (attname != NULL) {
3382
3383 /*
3384 * Well formedness requires at most one declaration of an attribute
3385 */
3386 for (i = 0; i < nbatts;i += 2) {
3387 if (xmlStrEqual(atts[i], attname)) {
3388 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3389 ctxt->sax->error(ctxt->userData,
3390 "Attribute %s redefined\n",
3391 attname);
3392 ctxt->wellFormed = 0;
3393 xmlFree(attname);
3394 if (attvalue != NULL)
3395 xmlFree(attvalue);
3396 goto failed;
3397 }
3398 }
3399
3400 /*
3401 * Add the pair to atts
3402 */
3403 if (atts == NULL) {
3404 maxatts = 10;
3405 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3406 if (atts == NULL) {
3407 xmlGenericError(xmlGenericErrorContext,
3408 "malloc of %ld byte failed\n",
3409 maxatts * (long)sizeof(xmlChar *));
3410 if (name != NULL) xmlFree(name);
3411 return;
3412 }
3413 } else if (nbatts + 4 > maxatts) {
3414 maxatts *= 2;
3415 atts = (const xmlChar **) xmlRealloc((void *) atts,
3416 maxatts * sizeof(xmlChar *));
3417 if (atts == NULL) {
3418 xmlGenericError(xmlGenericErrorContext,
3419 "realloc of %ld byte failed\n",
3420 maxatts * (long)sizeof(xmlChar *));
3421 if (name != NULL) xmlFree(name);
3422 return;
3423 }
3424 }
3425 atts[nbatts++] = attname;
3426 atts[nbatts++] = attvalue;
3427 atts[nbatts] = NULL;
3428 atts[nbatts + 1] = NULL;
3429 }
3430 else {
3431 /* Dump the bogus attribute string up to the next blank or
3432 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003433 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3434 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003435 NEXT;
3436 }
3437
3438failed:
3439 SKIP_BLANKS;
3440 if (cons == ctxt->nbChars) {
3441 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3442 ctxt->sax->error(ctxt->userData,
3443 "htmlParseStartTag: problem parsing attributes\n");
3444 ctxt->wellFormed = 0;
3445 break;
3446 }
3447 }
3448
3449 /*
3450 * Handle specific association to the META tag
3451 */
3452 if (meta)
3453 htmlCheckMeta(ctxt, atts);
3454
3455 /*
3456 * SAX: Start of Element !
3457 */
3458 htmlnamePush(ctxt, xmlStrdup(name));
3459#ifdef DEBUG
3460 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3461#endif
3462 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3463 ctxt->sax->startElement(ctxt->userData, name, atts);
3464
3465 if (atts != NULL) {
3466 for (i = 0;i < nbatts;i++) {
3467 if (atts[i] != NULL)
3468 xmlFree((xmlChar *) atts[i]);
3469 }
3470 xmlFree((void *) atts);
3471 }
3472 if (name != NULL) xmlFree(name);
3473}
3474
3475/**
3476 * htmlParseEndTag:
3477 * @ctxt: an HTML parser context
3478 *
3479 * parse an end of tag
3480 *
3481 * [42] ETag ::= '</' Name S? '>'
3482 *
3483 * With namespace
3484 *
3485 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003486 *
3487 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003488 */
3489
Daniel Veillardf420ac52001-07-04 16:04:09 +00003490static int
Owen Taylor3473f882001-02-23 17:55:21 +00003491htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3492 xmlChar *name;
3493 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003494 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003495
3496 if ((CUR != '<') || (NXT(1) != '/')) {
3497 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3498 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3499 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003500 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003501 }
3502 SKIP(2);
3503
3504 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003505 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003506
3507 /*
3508 * We should definitely be at the ending "S? '>'" part
3509 */
3510 SKIP_BLANKS;
3511 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3512 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3513 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3514 ctxt->wellFormed = 0;
3515 } else
3516 NEXT;
3517
3518 /*
3519 * If the name read is not one of the element in the parsing stack
3520 * then return, it's just an error.
3521 */
3522 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3523 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3524 }
3525 if (i < 0) {
3526 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3527 ctxt->sax->error(ctxt->userData,
3528 "Unexpected end tag : %s\n", name);
3529 xmlFree(name);
3530 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003531 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003532 }
3533
3534
3535 /*
3536 * Check for auto-closure of HTML elements.
3537 */
3538
3539 htmlAutoCloseOnClose(ctxt, name);
3540
3541 /*
3542 * Well formedness constraints, opening and closing must match.
3543 * With the exception that the autoclose may have popped stuff out
3544 * of the stack.
3545 */
3546 if (!xmlStrEqual(name, ctxt->name)) {
3547#ifdef DEBUG
3548 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3549#endif
3550 if ((ctxt->name != NULL) &&
3551 (!xmlStrEqual(ctxt->name, name))) {
3552 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3553 ctxt->sax->error(ctxt->userData,
3554 "Opening and ending tag mismatch: %s and %s\n",
3555 name, ctxt->name);
3556 ctxt->wellFormed = 0;
3557 }
3558 }
3559
3560 /*
3561 * SAX: End of Tag
3562 */
3563 oldname = ctxt->name;
3564 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3565 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3566 ctxt->sax->endElement(ctxt->userData, name);
3567 oldname = htmlnamePop(ctxt);
3568 if (oldname != NULL) {
3569#ifdef DEBUG
3570 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3571#endif
3572 xmlFree(oldname);
3573#ifdef DEBUG
3574 } else {
3575 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3576#endif
3577 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003578 ret = 1;
3579 } else {
3580 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003581 }
3582
3583 if (name != NULL)
3584 xmlFree(name);
3585
Daniel Veillardf420ac52001-07-04 16:04:09 +00003586 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003587}
3588
3589
3590/**
3591 * htmlParseReference:
3592 * @ctxt: an HTML parser context
3593 *
3594 * parse and handle entity references in content,
3595 * this will end-up in a call to character() since this is either a
3596 * CharRef, or a predefined entity.
3597 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003598static void
Owen Taylor3473f882001-02-23 17:55:21 +00003599htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003600 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003601 xmlChar out[6];
3602 xmlChar *name;
3603 if (CUR != '&') return;
3604
3605 if (NXT(1) == '#') {
3606 unsigned int c;
3607 int bits, i = 0;
3608
3609 c = htmlParseCharRef(ctxt);
3610 if (c == 0)
3611 return;
3612
3613 if (c < 0x80) { out[i++]= c; bits= -6; }
3614 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3615 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3616 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3617
3618 for ( ; bits >= 0; bits-= 6) {
3619 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3620 }
3621 out[i] = 0;
3622
3623 htmlCheckParagraph(ctxt);
3624 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3625 ctxt->sax->characters(ctxt->userData, out, i);
3626 } else {
3627 ent = htmlParseEntityRef(ctxt, &name);
3628 if (name == NULL) {
3629 htmlCheckParagraph(ctxt);
3630 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3631 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3632 return;
3633 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003634 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003635 htmlCheckParagraph(ctxt);
3636 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3637 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3638 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3639 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3640 }
3641 } else {
3642 unsigned int c;
3643 int bits, i = 0;
3644
3645 c = ent->value;
3646 if (c < 0x80)
3647 { out[i++]= c; bits= -6; }
3648 else if (c < 0x800)
3649 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3650 else if (c < 0x10000)
3651 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3652 else
3653 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3654
3655 for ( ; bits >= 0; bits-= 6) {
3656 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3657 }
3658 out[i] = 0;
3659
3660 htmlCheckParagraph(ctxt);
3661 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3662 ctxt->sax->characters(ctxt->userData, out, i);
3663 }
3664 xmlFree(name);
3665 }
3666}
3667
3668/**
3669 * htmlParseContent:
3670 * @ctxt: an HTML parser context
3671 * @name: the node name
3672 *
3673 * Parse a content: comment, sub-element, reference or text.
3674 *
3675 */
3676
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003677static void
Owen Taylor3473f882001-02-23 17:55:21 +00003678htmlParseContent(htmlParserCtxtPtr ctxt) {
3679 xmlChar *currentNode;
3680 int depth;
3681
3682 currentNode = xmlStrdup(ctxt->name);
3683 depth = ctxt->nameNr;
3684 while (1) {
3685 long cons = ctxt->nbChars;
3686
3687 GROW;
3688 /*
3689 * Our tag or one of it's parent or children is ending.
3690 */
3691 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003692 if (htmlParseEndTag(ctxt) &&
3693 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3694 if (currentNode != NULL)
3695 xmlFree(currentNode);
3696 return;
3697 }
3698 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003699 }
3700
3701 /*
3702 * Has this node been popped out during parsing of
3703 * the next element
3704 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003705 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3706 (!xmlStrEqual(currentNode, ctxt->name)))
3707 {
Owen Taylor3473f882001-02-23 17:55:21 +00003708 if (currentNode != NULL) xmlFree(currentNode);
3709 return;
3710 }
3711
Daniel Veillardf9533d12001-03-03 10:04:57 +00003712 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3713 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003714 /*
3715 * Handle SCRIPT/STYLE separately
3716 */
3717 htmlParseScript(ctxt);
3718 } else {
3719 /*
3720 * Sometimes DOCTYPE arrives in the middle of the document
3721 */
3722 if ((CUR == '<') && (NXT(1) == '!') &&
3723 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3724 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3725 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3726 (UPP(8) == 'E')) {
3727 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3728 ctxt->sax->error(ctxt->userData,
3729 "Misplaced DOCTYPE declaration\n");
3730 ctxt->wellFormed = 0;
3731 htmlParseDocTypeDecl(ctxt);
3732 }
3733
3734 /*
3735 * First case : a comment
3736 */
3737 if ((CUR == '<') && (NXT(1) == '!') &&
3738 (NXT(2) == '-') && (NXT(3) == '-')) {
3739 htmlParseComment(ctxt);
3740 }
3741
3742 /*
3743 * Second case : a sub-element.
3744 */
3745 else if (CUR == '<') {
3746 htmlParseElement(ctxt);
3747 }
3748
3749 /*
3750 * Third case : a reference. If if has not been resolved,
3751 * parsing returns it's Name, create the node
3752 */
3753 else if (CUR == '&') {
3754 htmlParseReference(ctxt);
3755 }
3756
3757 /*
3758 * Fourth : end of the resource
3759 */
3760 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003761 htmlAutoCloseOnEnd(ctxt);
3762 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003763 }
3764
3765 /*
3766 * Last case, text. Note that References are handled directly.
3767 */
3768 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003769 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003770 }
3771
3772 if (cons == ctxt->nbChars) {
3773 if (ctxt->node != NULL) {
3774 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3775 ctxt->sax->error(ctxt->userData,
3776 "detected an error in element content\n");
3777 ctxt->wellFormed = 0;
3778 }
3779 break;
3780 }
3781 }
3782 GROW;
3783 }
3784 if (currentNode != NULL) xmlFree(currentNode);
3785}
3786
3787/**
3788 * htmlParseElement:
3789 * @ctxt: an HTML parser context
3790 *
3791 * parse an HTML element, this is highly recursive
3792 *
3793 * [39] element ::= EmptyElemTag | STag content ETag
3794 *
3795 * [41] Attribute ::= Name Eq AttValue
3796 */
3797
3798void
3799htmlParseElement(htmlParserCtxtPtr ctxt) {
3800 xmlChar *name;
3801 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003802 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003803 htmlParserNodeInfo node_info;
3804 xmlChar *oldname;
3805 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003806 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003807
3808 /* Capture start position */
3809 if (ctxt->record_info) {
3810 node_info.begin_pos = ctxt->input->consumed +
3811 (CUR_PTR - ctxt->input->base);
3812 node_info.begin_line = ctxt->input->line;
3813 }
3814
3815 oldname = xmlStrdup(ctxt->name);
3816 htmlParseStartTag(ctxt);
3817 name = ctxt->name;
3818#ifdef DEBUG
3819 if (oldname == NULL)
3820 xmlGenericError(xmlGenericErrorContext,
3821 "Start of element %s\n", name);
3822 else if (name == NULL)
3823 xmlGenericError(xmlGenericErrorContext,
3824 "Start of element failed, was %s\n", oldname);
3825 else
3826 xmlGenericError(xmlGenericErrorContext,
3827 "Start of element %s, was %s\n", name, oldname);
3828#endif
3829 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3830 (name == NULL)) {
3831 if (CUR == '>')
3832 NEXT;
3833 if (oldname != NULL)
3834 xmlFree(oldname);
3835 return;
3836 }
3837 if (oldname != NULL)
3838 xmlFree(oldname);
3839
3840 /*
3841 * Lookup the info for that element.
3842 */
3843 info = htmlTagLookup(name);
3844 if (info == NULL) {
3845 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3846 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3847 name);
3848 ctxt->wellFormed = 0;
3849 } else if (info->depr) {
3850/***************************
3851 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3852 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3853 name);
3854 ***************************/
3855 }
3856
3857 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003858 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003859 */
3860 if ((CUR == '/') && (NXT(1) == '>')) {
3861 SKIP(2);
3862 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3863 ctxt->sax->endElement(ctxt->userData, name);
3864 oldname = htmlnamePop(ctxt);
3865#ifdef DEBUG
3866 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3867#endif
3868 if (oldname != NULL)
3869 xmlFree(oldname);
3870 return;
3871 }
3872
3873 if (CUR == '>') {
3874 NEXT;
3875 } else {
3876 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3877 ctxt->sax->error(ctxt->userData,
3878 "Couldn't find end of Start Tag %s\n",
3879 name);
3880 ctxt->wellFormed = 0;
3881
3882 /*
3883 * end of parsing of this node.
3884 */
3885 if (xmlStrEqual(name, ctxt->name)) {
3886 nodePop(ctxt);
3887 oldname = htmlnamePop(ctxt);
3888#ifdef DEBUG
3889 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3890#endif
3891 if (oldname != NULL)
3892 xmlFree(oldname);
3893 }
3894
3895 /*
3896 * Capture end position and add node
3897 */
3898 if ( currentNode != NULL && ctxt->record_info ) {
3899 node_info.end_pos = ctxt->input->consumed +
3900 (CUR_PTR - ctxt->input->base);
3901 node_info.end_line = ctxt->input->line;
3902 node_info.node = ctxt->node;
3903 xmlParserAddNodeInfo(ctxt, &node_info);
3904 }
3905 return;
3906 }
3907
3908 /*
3909 * Check for an Empty Element from DTD definition
3910 */
3911 if ((info != NULL) && (info->empty)) {
3912 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3913 ctxt->sax->endElement(ctxt->userData, name);
3914 oldname = htmlnamePop(ctxt);
3915#ifdef DEBUG
3916 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3917#endif
3918 if (oldname != NULL)
3919 xmlFree(oldname);
3920 return;
3921 }
3922
3923 /*
3924 * Parse the content of the element:
3925 */
3926 currentNode = xmlStrdup(ctxt->name);
3927 depth = ctxt->nameNr;
3928 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003929 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003930 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003931 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003932 if (ctxt->nameNr < depth) break;
3933 }
3934
Owen Taylor3473f882001-02-23 17:55:21 +00003935 /*
3936 * Capture end position and add node
3937 */
3938 if ( currentNode != NULL && ctxt->record_info ) {
3939 node_info.end_pos = ctxt->input->consumed +
3940 (CUR_PTR - ctxt->input->base);
3941 node_info.end_line = ctxt->input->line;
3942 node_info.node = ctxt->node;
3943 xmlParserAddNodeInfo(ctxt, &node_info);
3944 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003945 if (!IS_CHAR(CUR)) {
3946 htmlAutoCloseOnEnd(ctxt);
3947 }
3948
Owen Taylor3473f882001-02-23 17:55:21 +00003949 if (currentNode != NULL)
3950 xmlFree(currentNode);
3951}
3952
3953/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003954 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003955 * @ctxt: an HTML parser context
3956 *
3957 * parse an HTML document (and build a tree if using the standard SAX
3958 * interface).
3959 *
3960 * Returns 0, -1 in case of error. the parser context is augmented
3961 * as a result of the parsing.
3962 */
3963
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003964int
Owen Taylor3473f882001-02-23 17:55:21 +00003965htmlParseDocument(htmlParserCtxtPtr ctxt) {
3966 xmlDtdPtr dtd;
3967
Daniel Veillardd0463562001-10-13 09:15:48 +00003968 xmlInitParser();
3969
Owen Taylor3473f882001-02-23 17:55:21 +00003970 htmlDefaultSAXHandlerInit();
3971 ctxt->html = 1;
3972
3973 GROW;
3974 /*
3975 * SAX: beginning of the document processing.
3976 */
3977 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3978 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3979
3980 /*
3981 * Wipe out everything which is before the first '<'
3982 */
3983 SKIP_BLANKS;
3984 if (CUR == 0) {
3985 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3986 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3987 ctxt->wellFormed = 0;
3988 }
3989
3990 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3991 ctxt->sax->startDocument(ctxt->userData);
3992
3993
3994 /*
3995 * Parse possible comments before any content
3996 */
3997 while ((CUR == '<') && (NXT(1) == '!') &&
3998 (NXT(2) == '-') && (NXT(3) == '-')) {
3999 htmlParseComment(ctxt);
4000 SKIP_BLANKS;
4001 }
4002
4003
4004 /*
4005 * Then possibly doc type declaration(s) and more Misc
4006 * (doctypedecl Misc*)?
4007 */
4008 if ((CUR == '<') && (NXT(1) == '!') &&
4009 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4010 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4011 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4012 (UPP(8) == 'E')) {
4013 htmlParseDocTypeDecl(ctxt);
4014 }
4015 SKIP_BLANKS;
4016
4017 /*
4018 * Parse possible comments before any content
4019 */
4020 while ((CUR == '<') && (NXT(1) == '!') &&
4021 (NXT(2) == '-') && (NXT(3) == '-')) {
4022 htmlParseComment(ctxt);
4023 SKIP_BLANKS;
4024 }
4025
4026 /*
4027 * Time to start parsing the tree itself
4028 */
4029 htmlParseContent(ctxt);
4030
4031 /*
4032 * autoclose
4033 */
4034 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004035 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004036
4037
4038 /*
4039 * SAX: end of the document processing.
4040 */
4041 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4042 ctxt->sax->endDocument(ctxt->userData);
4043
4044 if (ctxt->myDoc != NULL) {
4045 dtd = xmlGetIntSubset(ctxt->myDoc);
4046 if (dtd == NULL)
4047 ctxt->myDoc->intSubset =
4048 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4049 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4050 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4051 }
4052 if (! ctxt->wellFormed) return(-1);
4053 return(0);
4054}
4055
4056
4057/************************************************************************
4058 * *
4059 * Parser contexts handling *
4060 * *
4061 ************************************************************************/
4062
4063/**
4064 * xmlInitParserCtxt:
4065 * @ctxt: an HTML parser context
4066 *
4067 * Initialize a parser context
4068 */
4069
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004070static void
Owen Taylor3473f882001-02-23 17:55:21 +00004071htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4072{
4073 htmlSAXHandler *sax;
4074
4075 if (ctxt == NULL) return;
4076 memset(ctxt, 0, sizeof(htmlParserCtxt));
4077
4078 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4079 if (sax == NULL) {
4080 xmlGenericError(xmlGenericErrorContext,
4081 "htmlInitParserCtxt: out of memory\n");
4082 }
4083 else
4084 memset(sax, 0, sizeof(htmlSAXHandler));
4085
4086 /* Allocate the Input stack */
4087 ctxt->inputTab = (htmlParserInputPtr *)
4088 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4089 if (ctxt->inputTab == NULL) {
4090 xmlGenericError(xmlGenericErrorContext,
4091 "htmlInitParserCtxt: out of memory\n");
4092 ctxt->inputNr = 0;
4093 ctxt->inputMax = 0;
4094 ctxt->input = NULL;
4095 return;
4096 }
4097 ctxt->inputNr = 0;
4098 ctxt->inputMax = 5;
4099 ctxt->input = NULL;
4100 ctxt->version = NULL;
4101 ctxt->encoding = NULL;
4102 ctxt->standalone = -1;
4103 ctxt->instate = XML_PARSER_START;
4104
4105 /* Allocate the Node stack */
4106 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4107 if (ctxt->nodeTab == NULL) {
4108 xmlGenericError(xmlGenericErrorContext,
4109 "htmlInitParserCtxt: out of memory\n");
4110 ctxt->nodeNr = 0;
4111 ctxt->nodeMax = 0;
4112 ctxt->node = NULL;
4113 ctxt->inputNr = 0;
4114 ctxt->inputMax = 0;
4115 ctxt->input = NULL;
4116 return;
4117 }
4118 ctxt->nodeNr = 0;
4119 ctxt->nodeMax = 10;
4120 ctxt->node = NULL;
4121
4122 /* Allocate the Name stack */
4123 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4124 if (ctxt->nameTab == NULL) {
4125 xmlGenericError(xmlGenericErrorContext,
4126 "htmlInitParserCtxt: out of memory\n");
4127 ctxt->nameNr = 0;
4128 ctxt->nameMax = 10;
4129 ctxt->name = NULL;
4130 ctxt->nodeNr = 0;
4131 ctxt->nodeMax = 0;
4132 ctxt->node = NULL;
4133 ctxt->inputNr = 0;
4134 ctxt->inputMax = 0;
4135 ctxt->input = NULL;
4136 return;
4137 }
4138 ctxt->nameNr = 0;
4139 ctxt->nameMax = 10;
4140 ctxt->name = NULL;
4141
4142 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
4143 else {
4144 ctxt->sax = sax;
4145 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
4146 }
4147 ctxt->userData = ctxt;
4148 ctxt->myDoc = NULL;
4149 ctxt->wellFormed = 1;
4150 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004151 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004152 ctxt->html = 1;
4153 ctxt->record_info = 0;
4154 ctxt->validate = 0;
4155 ctxt->nbChars = 0;
4156 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004157 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004158 xmlInitNodeInfoSeq(&ctxt->node_seq);
4159}
4160
4161/**
4162 * htmlFreeParserCtxt:
4163 * @ctxt: an HTML parser context
4164 *
4165 * Free all the memory used by a parser context. However the parsed
4166 * document in ctxt->myDoc is not freed.
4167 */
4168
4169void
4170htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4171{
4172 xmlFreeParserCtxt(ctxt);
4173}
4174
4175/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004176 * htmlNewParserCtxt:
4177 *
4178 * Allocate and initialize a new parser context.
4179 *
4180 * Returns the xmlParserCtxtPtr or NULL
4181 */
4182
4183static htmlParserCtxtPtr
4184htmlNewParserCtxt(void)
4185{
4186 xmlParserCtxtPtr ctxt;
4187
4188 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4189 if (ctxt == NULL) {
4190 xmlGenericError(xmlGenericErrorContext,
4191 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004192 return(NULL);
4193 }
4194 memset(ctxt, 0, sizeof(xmlParserCtxt));
4195 htmlInitParserCtxt(ctxt);
4196 return(ctxt);
4197}
4198
4199/**
4200 * htmlCreateMemoryParserCtxt:
4201 * @buffer: a pointer to a char array
4202 * @size: the size of the array
4203 *
4204 * Create a parser context for an HTML in-memory document.
4205 *
4206 * Returns the new parser context or NULL
4207 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004208htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004209htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4210 xmlParserCtxtPtr ctxt;
4211 xmlParserInputPtr input;
4212 xmlParserInputBufferPtr buf;
4213
4214 if (buffer == NULL)
4215 return(NULL);
4216 if (size <= 0)
4217 return(NULL);
4218
4219 ctxt = htmlNewParserCtxt();
4220 if (ctxt == NULL)
4221 return(NULL);
4222
4223 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4224 if (buf == NULL) return(NULL);
4225
4226 input = xmlNewInputStream(ctxt);
4227 if (input == NULL) {
4228 xmlFreeParserCtxt(ctxt);
4229 return(NULL);
4230 }
4231
4232 input->filename = NULL;
4233 input->buf = buf;
4234 input->base = input->buf->buffer->content;
4235 input->cur = input->buf->buffer->content;
4236 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4237
4238 inputPush(ctxt, input);
4239 return(ctxt);
4240}
4241
4242/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004243 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004244 * @cur: a pointer to an array of xmlChar
4245 * @encoding: a free form C string describing the HTML document encoding, or NULL
4246 *
4247 * Create a parser context for an HTML document.
4248 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004249 * TODO: check the need to add encoding handling there
4250 *
Owen Taylor3473f882001-02-23 17:55:21 +00004251 * Returns the new parser context or NULL
4252 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004253static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004254htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004255 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004256 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004257
Daniel Veillard1d995272002-07-22 16:43:32 +00004258 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004259 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004260 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004261 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4262
4263 if (encoding != NULL) {
4264 xmlCharEncoding enc;
4265 xmlCharEncodingHandlerPtr handler;
4266
4267 if (ctxt->input->encoding != NULL)
4268 xmlFree((xmlChar *) ctxt->input->encoding);
4269 ctxt->input->encoding = (const xmlChar *) encoding;
4270
4271 enc = xmlParseCharEncoding(encoding);
4272 /*
4273 * registered set of known encodings
4274 */
4275 if (enc != XML_CHAR_ENCODING_ERROR) {
4276 xmlSwitchEncoding(ctxt, enc);
4277 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4278 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4279 ctxt->sax->error(ctxt->userData,
4280 "Unsupported encoding %s\n", encoding);
4281 ctxt->input->encoding = NULL;
4282 }
4283 } else {
4284 /*
4285 * fallback for unknown encodings
4286 */
4287 handler = xmlFindCharEncodingHandler((const char *) encoding);
4288 if (handler != NULL) {
4289 xmlSwitchToEncoding(ctxt, handler);
4290 } else {
4291 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
4292 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4293 ctxt->sax->error(ctxt->userData,
4294 "Unsupported encoding %s\n", encoding);
4295 }
4296 }
4297 }
4298 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004299}
4300
4301/************************************************************************
4302 * *
4303 * Progressive parsing interfaces *
4304 * *
4305 ************************************************************************/
4306
4307/**
4308 * htmlParseLookupSequence:
4309 * @ctxt: an HTML parser context
4310 * @first: the first char to lookup
4311 * @next: the next char to lookup or zero
4312 * @third: the next char to lookup or zero
4313 *
4314 * Try to find if a sequence (first, next, third) or just (first next) or
4315 * (first) is available in the input stream.
4316 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4317 * to avoid rescanning sequences of bytes, it DOES change the state of the
4318 * parser, do not use liberally.
4319 * This is basically similar to xmlParseLookupSequence()
4320 *
4321 * Returns the index to the current parsing point if the full sequence
4322 * is available, -1 otherwise.
4323 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004324static int
Owen Taylor3473f882001-02-23 17:55:21 +00004325htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4326 xmlChar next, xmlChar third) {
4327 int base, len;
4328 htmlParserInputPtr in;
4329 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004330 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004331
4332 in = ctxt->input;
4333 if (in == NULL) return(-1);
4334 base = in->cur - in->base;
4335 if (base < 0) return(-1);
4336 if (ctxt->checkIndex > base)
4337 base = ctxt->checkIndex;
4338 if (in->buf == NULL) {
4339 buf = in->base;
4340 len = in->length;
4341 } else {
4342 buf = in->buf->buffer->content;
4343 len = in->buf->buffer->use;
4344 }
4345 /* take into account the sequence length */
4346 if (third) len -= 2;
4347 else if (next) len --;
4348 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004349 if (!incomment && (base + 4 < len)) {
4350 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4351 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4352 incomment = 1;
4353 }
4354 /* do not increment base, some people use <!--> */
4355 }
4356 if (incomment) {
4357 if (base + 3 < len)
4358 return(-1);
4359 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4360 (buf[base + 2] == '>')) {
4361 incomment = 0;
4362 base += 2;
4363 }
4364 continue;
4365 }
Owen Taylor3473f882001-02-23 17:55:21 +00004366 if (buf[base] == first) {
4367 if (third != 0) {
4368 if ((buf[base + 1] != next) ||
4369 (buf[base + 2] != third)) continue;
4370 } else if (next != 0) {
4371 if (buf[base + 1] != next) continue;
4372 }
4373 ctxt->checkIndex = 0;
4374#ifdef DEBUG_PUSH
4375 if (next == 0)
4376 xmlGenericError(xmlGenericErrorContext,
4377 "HPP: lookup '%c' found at %d\n",
4378 first, base);
4379 else if (third == 0)
4380 xmlGenericError(xmlGenericErrorContext,
4381 "HPP: lookup '%c%c' found at %d\n",
4382 first, next, base);
4383 else
4384 xmlGenericError(xmlGenericErrorContext,
4385 "HPP: lookup '%c%c%c' found at %d\n",
4386 first, next, third, base);
4387#endif
4388 return(base - (in->cur - in->base));
4389 }
4390 }
4391 ctxt->checkIndex = base;
4392#ifdef DEBUG_PUSH
4393 if (next == 0)
4394 xmlGenericError(xmlGenericErrorContext,
4395 "HPP: lookup '%c' failed\n", first);
4396 else if (third == 0)
4397 xmlGenericError(xmlGenericErrorContext,
4398 "HPP: lookup '%c%c' failed\n", first, next);
4399 else
4400 xmlGenericError(xmlGenericErrorContext,
4401 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4402#endif
4403 return(-1);
4404}
4405
4406/**
4407 * htmlParseTryOrFinish:
4408 * @ctxt: an HTML parser context
4409 * @terminate: last chunk indicator
4410 *
4411 * Try to progress on parsing
4412 *
4413 * Returns zero if no parsing was possible
4414 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004415static int
Owen Taylor3473f882001-02-23 17:55:21 +00004416htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4417 int ret = 0;
4418 htmlParserInputPtr in;
4419 int avail = 0;
4420 xmlChar cur, next;
4421
4422#ifdef DEBUG_PUSH
4423 switch (ctxt->instate) {
4424 case XML_PARSER_EOF:
4425 xmlGenericError(xmlGenericErrorContext,
4426 "HPP: try EOF\n"); break;
4427 case XML_PARSER_START:
4428 xmlGenericError(xmlGenericErrorContext,
4429 "HPP: try START\n"); break;
4430 case XML_PARSER_MISC:
4431 xmlGenericError(xmlGenericErrorContext,
4432 "HPP: try MISC\n");break;
4433 case XML_PARSER_COMMENT:
4434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: try COMMENT\n");break;
4436 case XML_PARSER_PROLOG:
4437 xmlGenericError(xmlGenericErrorContext,
4438 "HPP: try PROLOG\n");break;
4439 case XML_PARSER_START_TAG:
4440 xmlGenericError(xmlGenericErrorContext,
4441 "HPP: try START_TAG\n");break;
4442 case XML_PARSER_CONTENT:
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: try CONTENT\n");break;
4445 case XML_PARSER_CDATA_SECTION:
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: try CDATA_SECTION\n");break;
4448 case XML_PARSER_END_TAG:
4449 xmlGenericError(xmlGenericErrorContext,
4450 "HPP: try END_TAG\n");break;
4451 case XML_PARSER_ENTITY_DECL:
4452 xmlGenericError(xmlGenericErrorContext,
4453 "HPP: try ENTITY_DECL\n");break;
4454 case XML_PARSER_ENTITY_VALUE:
4455 xmlGenericError(xmlGenericErrorContext,
4456 "HPP: try ENTITY_VALUE\n");break;
4457 case XML_PARSER_ATTRIBUTE_VALUE:
4458 xmlGenericError(xmlGenericErrorContext,
4459 "HPP: try ATTRIBUTE_VALUE\n");break;
4460 case XML_PARSER_DTD:
4461 xmlGenericError(xmlGenericErrorContext,
4462 "HPP: try DTD\n");break;
4463 case XML_PARSER_EPILOG:
4464 xmlGenericError(xmlGenericErrorContext,
4465 "HPP: try EPILOG\n");break;
4466 case XML_PARSER_PI:
4467 xmlGenericError(xmlGenericErrorContext,
4468 "HPP: try PI\n");break;
4469 case XML_PARSER_SYSTEM_LITERAL:
4470 xmlGenericError(xmlGenericErrorContext,
4471 "HPP: try SYSTEM_LITERAL\n");break;
4472 }
4473#endif
4474
4475 while (1) {
4476
4477 in = ctxt->input;
4478 if (in == NULL) break;
4479 if (in->buf == NULL)
4480 avail = in->length - (in->cur - in->base);
4481 else
4482 avail = in->buf->buffer->use - (in->cur - in->base);
4483 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004484 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004485 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4486 /*
4487 * SAX: end of the document processing.
4488 */
4489 ctxt->instate = XML_PARSER_EOF;
4490 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4491 ctxt->sax->endDocument(ctxt->userData);
4492 }
4493 }
4494 if (avail < 1)
4495 goto done;
4496 switch (ctxt->instate) {
4497 case XML_PARSER_EOF:
4498 /*
4499 * Document parsing is done !
4500 */
4501 goto done;
4502 case XML_PARSER_START:
4503 /*
4504 * Very first chars read from the document flow.
4505 */
4506 cur = in->cur[0];
4507 if (IS_BLANK(cur)) {
4508 SKIP_BLANKS;
4509 if (in->buf == NULL)
4510 avail = in->length - (in->cur - in->base);
4511 else
4512 avail = in->buf->buffer->use - (in->cur - in->base);
4513 }
4514 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4515 ctxt->sax->setDocumentLocator(ctxt->userData,
4516 &xmlDefaultSAXLocator);
4517 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4518 (!ctxt->disableSAX))
4519 ctxt->sax->startDocument(ctxt->userData);
4520
4521 cur = in->cur[0];
4522 next = in->cur[1];
4523 if ((cur == '<') && (next == '!') &&
4524 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4525 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4526 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4527 (UPP(8) == 'E')) {
4528 if ((!terminate) &&
4529 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4530 goto done;
4531#ifdef DEBUG_PUSH
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: Parsing internal subset\n");
4534#endif
4535 htmlParseDocTypeDecl(ctxt);
4536 ctxt->instate = XML_PARSER_PROLOG;
4537#ifdef DEBUG_PUSH
4538 xmlGenericError(xmlGenericErrorContext,
4539 "HPP: entering PROLOG\n");
4540#endif
4541 } else {
4542 ctxt->instate = XML_PARSER_MISC;
4543 }
4544#ifdef DEBUG_PUSH
4545 xmlGenericError(xmlGenericErrorContext,
4546 "HPP: entering MISC\n");
4547#endif
4548 break;
4549 case XML_PARSER_MISC:
4550 SKIP_BLANKS;
4551 if (in->buf == NULL)
4552 avail = in->length - (in->cur - in->base);
4553 else
4554 avail = in->buf->buffer->use - (in->cur - in->base);
4555 if (avail < 2)
4556 goto done;
4557 cur = in->cur[0];
4558 next = in->cur[1];
4559 if ((cur == '<') && (next == '!') &&
4560 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4561 if ((!terminate) &&
4562 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4563 goto done;
4564#ifdef DEBUG_PUSH
4565 xmlGenericError(xmlGenericErrorContext,
4566 "HPP: Parsing Comment\n");
4567#endif
4568 htmlParseComment(ctxt);
4569 ctxt->instate = XML_PARSER_MISC;
4570 } else if ((cur == '<') && (next == '!') &&
4571 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4572 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4573 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4574 (UPP(8) == 'E')) {
4575 if ((!terminate) &&
4576 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4577 goto done;
4578#ifdef DEBUG_PUSH
4579 xmlGenericError(xmlGenericErrorContext,
4580 "HPP: Parsing internal subset\n");
4581#endif
4582 htmlParseDocTypeDecl(ctxt);
4583 ctxt->instate = XML_PARSER_PROLOG;
4584#ifdef DEBUG_PUSH
4585 xmlGenericError(xmlGenericErrorContext,
4586 "HPP: entering PROLOG\n");
4587#endif
4588 } else if ((cur == '<') && (next == '!') &&
4589 (avail < 9)) {
4590 goto done;
4591 } else {
4592 ctxt->instate = XML_PARSER_START_TAG;
4593#ifdef DEBUG_PUSH
4594 xmlGenericError(xmlGenericErrorContext,
4595 "HPP: entering START_TAG\n");
4596#endif
4597 }
4598 break;
4599 case XML_PARSER_PROLOG:
4600 SKIP_BLANKS;
4601 if (in->buf == NULL)
4602 avail = in->length - (in->cur - in->base);
4603 else
4604 avail = in->buf->buffer->use - (in->cur - in->base);
4605 if (avail < 2)
4606 goto done;
4607 cur = in->cur[0];
4608 next = in->cur[1];
4609 if ((cur == '<') && (next == '!') &&
4610 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4611 if ((!terminate) &&
4612 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4613 goto done;
4614#ifdef DEBUG_PUSH
4615 xmlGenericError(xmlGenericErrorContext,
4616 "HPP: Parsing Comment\n");
4617#endif
4618 htmlParseComment(ctxt);
4619 ctxt->instate = XML_PARSER_PROLOG;
4620 } else if ((cur == '<') && (next == '!') &&
4621 (avail < 4)) {
4622 goto done;
4623 } else {
4624 ctxt->instate = XML_PARSER_START_TAG;
4625#ifdef DEBUG_PUSH
4626 xmlGenericError(xmlGenericErrorContext,
4627 "HPP: entering START_TAG\n");
4628#endif
4629 }
4630 break;
4631 case XML_PARSER_EPILOG:
4632 if (in->buf == NULL)
4633 avail = in->length - (in->cur - in->base);
4634 else
4635 avail = in->buf->buffer->use - (in->cur - in->base);
4636 if (avail < 1)
4637 goto done;
4638 cur = in->cur[0];
4639 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004640 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004641 goto done;
4642 }
4643 if (avail < 2)
4644 goto done;
4645 next = in->cur[1];
4646 if ((cur == '<') && (next == '!') &&
4647 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4648 if ((!terminate) &&
4649 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4650 goto done;
4651#ifdef DEBUG_PUSH
4652 xmlGenericError(xmlGenericErrorContext,
4653 "HPP: Parsing Comment\n");
4654#endif
4655 htmlParseComment(ctxt);
4656 ctxt->instate = XML_PARSER_EPILOG;
4657 } else if ((cur == '<') && (next == '!') &&
4658 (avail < 4)) {
4659 goto done;
4660 } else {
4661 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004662 ctxt->wellFormed = 0;
4663 ctxt->instate = XML_PARSER_EOF;
4664#ifdef DEBUG_PUSH
4665 xmlGenericError(xmlGenericErrorContext,
4666 "HPP: entering EOF\n");
4667#endif
4668 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4669 ctxt->sax->endDocument(ctxt->userData);
4670 goto done;
4671 }
4672 break;
4673 case XML_PARSER_START_TAG: {
4674 xmlChar *name, *oldname;
4675 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004676 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004677
4678 if (avail < 2)
4679 goto done;
4680 cur = in->cur[0];
4681 if (cur != '<') {
4682 ctxt->instate = XML_PARSER_CONTENT;
4683#ifdef DEBUG_PUSH
4684 xmlGenericError(xmlGenericErrorContext,
4685 "HPP: entering CONTENT\n");
4686#endif
4687 break;
4688 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004689 if (in->cur[1] == '/') {
4690 ctxt->instate = XML_PARSER_END_TAG;
4691 ctxt->checkIndex = 0;
4692#ifdef DEBUG_PUSH
4693 xmlGenericError(xmlGenericErrorContext,
4694 "HPP: entering END_TAG\n");
4695#endif
4696 break;
4697 }
Owen Taylor3473f882001-02-23 17:55:21 +00004698 if ((!terminate) &&
4699 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4700 goto done;
4701
4702 oldname = xmlStrdup(ctxt->name);
4703 htmlParseStartTag(ctxt);
4704 name = ctxt->name;
4705#ifdef DEBUG
4706 if (oldname == NULL)
4707 xmlGenericError(xmlGenericErrorContext,
4708 "Start of element %s\n", name);
4709 else if (name == NULL)
4710 xmlGenericError(xmlGenericErrorContext,
4711 "Start of element failed, was %s\n",
4712 oldname);
4713 else
4714 xmlGenericError(xmlGenericErrorContext,
4715 "Start of element %s, was %s\n",
4716 name, oldname);
4717#endif
4718 if (((depth == ctxt->nameNr) &&
4719 (xmlStrEqual(oldname, ctxt->name))) ||
4720 (name == NULL)) {
4721 if (CUR == '>')
4722 NEXT;
4723 if (oldname != NULL)
4724 xmlFree(oldname);
4725 break;
4726 }
4727 if (oldname != NULL)
4728 xmlFree(oldname);
4729
4730 /*
4731 * Lookup the info for that element.
4732 */
4733 info = htmlTagLookup(name);
4734 if (info == NULL) {
4735 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4736 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4737 name);
4738 ctxt->wellFormed = 0;
4739 } else if (info->depr) {
4740 /***************************
4741 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4742 ctxt->sax->warning(ctxt->userData,
4743 "Tag %s is deprecated\n",
4744 name);
4745 ***************************/
4746 }
4747
4748 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004749 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004750 */
4751 if ((CUR == '/') && (NXT(1) == '>')) {
4752 SKIP(2);
4753 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4754 ctxt->sax->endElement(ctxt->userData, name);
4755 oldname = htmlnamePop(ctxt);
4756#ifdef DEBUG
4757 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4758 oldname);
4759#endif
4760 if (oldname != NULL)
4761 xmlFree(oldname);
4762 ctxt->instate = XML_PARSER_CONTENT;
4763#ifdef DEBUG_PUSH
4764 xmlGenericError(xmlGenericErrorContext,
4765 "HPP: entering CONTENT\n");
4766#endif
4767 break;
4768 }
4769
4770 if (CUR == '>') {
4771 NEXT;
4772 } else {
4773 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4774 ctxt->sax->error(ctxt->userData,
4775 "Couldn't find end of Start Tag %s\n",
4776 name);
4777 ctxt->wellFormed = 0;
4778
4779 /*
4780 * end of parsing of this node.
4781 */
4782 if (xmlStrEqual(name, ctxt->name)) {
4783 nodePop(ctxt);
4784 oldname = htmlnamePop(ctxt);
4785#ifdef DEBUG
4786 xmlGenericError(xmlGenericErrorContext,
4787 "End of start tag problem: popping out %s\n", oldname);
4788#endif
4789 if (oldname != NULL)
4790 xmlFree(oldname);
4791 }
4792
4793 ctxt->instate = XML_PARSER_CONTENT;
4794#ifdef DEBUG_PUSH
4795 xmlGenericError(xmlGenericErrorContext,
4796 "HPP: entering CONTENT\n");
4797#endif
4798 break;
4799 }
4800
4801 /*
4802 * Check for an Empty Element from DTD definition
4803 */
4804 if ((info != NULL) && (info->empty)) {
4805 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4806 ctxt->sax->endElement(ctxt->userData, name);
4807 oldname = htmlnamePop(ctxt);
4808#ifdef DEBUG
4809 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4810#endif
4811 if (oldname != NULL)
4812 xmlFree(oldname);
4813 }
4814 ctxt->instate = XML_PARSER_CONTENT;
4815#ifdef DEBUG_PUSH
4816 xmlGenericError(xmlGenericErrorContext,
4817 "HPP: entering CONTENT\n");
4818#endif
4819 break;
4820 }
4821 case XML_PARSER_CONTENT: {
4822 long cons;
4823 /*
4824 * Handle preparsed entities and charRef
4825 */
4826 if (ctxt->token != 0) {
4827 xmlChar chr[2] = { 0 , 0 } ;
4828
4829 chr[0] = (xmlChar) ctxt->token;
4830 htmlCheckParagraph(ctxt);
4831 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4832 ctxt->sax->characters(ctxt->userData, chr, 1);
4833 ctxt->token = 0;
4834 ctxt->checkIndex = 0;
4835 }
4836 if ((avail == 1) && (terminate)) {
4837 cur = in->cur[0];
4838 if ((cur != '<') && (cur != '&')) {
4839 if (ctxt->sax != NULL) {
4840 if (IS_BLANK(cur)) {
4841 if (ctxt->sax->ignorableWhitespace != NULL)
4842 ctxt->sax->ignorableWhitespace(
4843 ctxt->userData, &cur, 1);
4844 } else {
4845 htmlCheckParagraph(ctxt);
4846 if (ctxt->sax->characters != NULL)
4847 ctxt->sax->characters(
4848 ctxt->userData, &cur, 1);
4849 }
4850 }
4851 ctxt->token = 0;
4852 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004853 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004854 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004855 }
Owen Taylor3473f882001-02-23 17:55:21 +00004856 }
4857 if (avail < 2)
4858 goto done;
4859 cur = in->cur[0];
4860 next = in->cur[1];
4861 cons = ctxt->nbChars;
4862 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4863 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4864 /*
4865 * Handle SCRIPT/STYLE separately
4866 */
4867 if ((!terminate) &&
4868 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4869 goto done;
4870 htmlParseScript(ctxt);
4871 if ((cur == '<') && (next == '/')) {
4872 ctxt->instate = XML_PARSER_END_TAG;
4873 ctxt->checkIndex = 0;
4874#ifdef DEBUG_PUSH
4875 xmlGenericError(xmlGenericErrorContext,
4876 "HPP: entering END_TAG\n");
4877#endif
4878 break;
4879 }
4880 } else {
4881 /*
4882 * Sometimes DOCTYPE arrives in the middle of the document
4883 */
4884 if ((cur == '<') && (next == '!') &&
4885 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4886 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4887 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4888 (UPP(8) == 'E')) {
4889 if ((!terminate) &&
4890 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4891 goto done;
4892 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4893 ctxt->sax->error(ctxt->userData,
4894 "Misplaced DOCTYPE declaration\n");
4895 ctxt->wellFormed = 0;
4896 htmlParseDocTypeDecl(ctxt);
4897 } else if ((cur == '<') && (next == '!') &&
4898 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4899 if ((!terminate) &&
4900 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4901 goto done;
4902#ifdef DEBUG_PUSH
4903 xmlGenericError(xmlGenericErrorContext,
4904 "HPP: Parsing Comment\n");
4905#endif
4906 htmlParseComment(ctxt);
4907 ctxt->instate = XML_PARSER_CONTENT;
4908 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4909 goto done;
4910 } else if ((cur == '<') && (next == '/')) {
4911 ctxt->instate = XML_PARSER_END_TAG;
4912 ctxt->checkIndex = 0;
4913#ifdef DEBUG_PUSH
4914 xmlGenericError(xmlGenericErrorContext,
4915 "HPP: entering END_TAG\n");
4916#endif
4917 break;
4918 } else if (cur == '<') {
4919 ctxt->instate = XML_PARSER_START_TAG;
4920 ctxt->checkIndex = 0;
4921#ifdef DEBUG_PUSH
4922 xmlGenericError(xmlGenericErrorContext,
4923 "HPP: entering START_TAG\n");
4924#endif
4925 break;
4926 } else if (cur == '&') {
4927 if ((!terminate) &&
4928 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4929 goto done;
4930#ifdef DEBUG_PUSH
4931 xmlGenericError(xmlGenericErrorContext,
4932 "HPP: Parsing Reference\n");
4933#endif
4934 /* TODO: check generation of subtrees if noent !!! */
4935 htmlParseReference(ctxt);
4936 } else {
4937 /* TODO Avoid the extra copy, handle directly !!!!!! */
4938 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004939 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004940 * - minimize calls to the SAX 'character' callback
4941 * when they are mergeable
4942 */
4943 if ((ctxt->inputNr == 1) &&
4944 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4945 if ((!terminate) &&
4946 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4947 goto done;
4948 }
4949 ctxt->checkIndex = 0;
4950#ifdef DEBUG_PUSH
4951 xmlGenericError(xmlGenericErrorContext,
4952 "HPP: Parsing char data\n");
4953#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004954 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004955 }
4956 }
4957 if (cons == ctxt->nbChars) {
4958 if (ctxt->node != NULL) {
4959 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4960 ctxt->sax->error(ctxt->userData,
4961 "detected an error in element content\n");
4962 ctxt->wellFormed = 0;
4963 }
4964 NEXT;
4965 break;
4966 }
4967
4968 break;
4969 }
4970 case XML_PARSER_END_TAG:
4971 if (avail < 2)
4972 goto done;
4973 if ((!terminate) &&
4974 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4975 goto done;
4976 htmlParseEndTag(ctxt);
4977 if (ctxt->nameNr == 0) {
4978 ctxt->instate = XML_PARSER_EPILOG;
4979 } else {
4980 ctxt->instate = XML_PARSER_CONTENT;
4981 }
4982 ctxt->checkIndex = 0;
4983#ifdef DEBUG_PUSH
4984 xmlGenericError(xmlGenericErrorContext,
4985 "HPP: entering CONTENT\n");
4986#endif
4987 break;
4988 case XML_PARSER_CDATA_SECTION:
4989 xmlGenericError(xmlGenericErrorContext,
4990 "HPP: internal error, state == CDATA\n");
4991 ctxt->instate = XML_PARSER_CONTENT;
4992 ctxt->checkIndex = 0;
4993#ifdef DEBUG_PUSH
4994 xmlGenericError(xmlGenericErrorContext,
4995 "HPP: entering CONTENT\n");
4996#endif
4997 break;
4998 case XML_PARSER_DTD:
4999 xmlGenericError(xmlGenericErrorContext,
5000 "HPP: internal error, state == DTD\n");
5001 ctxt->instate = XML_PARSER_CONTENT;
5002 ctxt->checkIndex = 0;
5003#ifdef DEBUG_PUSH
5004 xmlGenericError(xmlGenericErrorContext,
5005 "HPP: entering CONTENT\n");
5006#endif
5007 break;
5008 case XML_PARSER_COMMENT:
5009 xmlGenericError(xmlGenericErrorContext,
5010 "HPP: internal error, state == COMMENT\n");
5011 ctxt->instate = XML_PARSER_CONTENT;
5012 ctxt->checkIndex = 0;
5013#ifdef DEBUG_PUSH
5014 xmlGenericError(xmlGenericErrorContext,
5015 "HPP: entering CONTENT\n");
5016#endif
5017 break;
5018 case XML_PARSER_PI:
5019 xmlGenericError(xmlGenericErrorContext,
5020 "HPP: internal error, state == PI\n");
5021 ctxt->instate = XML_PARSER_CONTENT;
5022 ctxt->checkIndex = 0;
5023#ifdef DEBUG_PUSH
5024 xmlGenericError(xmlGenericErrorContext,
5025 "HPP: entering CONTENT\n");
5026#endif
5027 break;
5028 case XML_PARSER_ENTITY_DECL:
5029 xmlGenericError(xmlGenericErrorContext,
5030 "HPP: internal error, state == ENTITY_DECL\n");
5031 ctxt->instate = XML_PARSER_CONTENT;
5032 ctxt->checkIndex = 0;
5033#ifdef DEBUG_PUSH
5034 xmlGenericError(xmlGenericErrorContext,
5035 "HPP: entering CONTENT\n");
5036#endif
5037 break;
5038 case XML_PARSER_ENTITY_VALUE:
5039 xmlGenericError(xmlGenericErrorContext,
5040 "HPP: internal error, state == ENTITY_VALUE\n");
5041 ctxt->instate = XML_PARSER_CONTENT;
5042 ctxt->checkIndex = 0;
5043#ifdef DEBUG_PUSH
5044 xmlGenericError(xmlGenericErrorContext,
5045 "HPP: entering DTD\n");
5046#endif
5047 break;
5048 case XML_PARSER_ATTRIBUTE_VALUE:
5049 xmlGenericError(xmlGenericErrorContext,
5050 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5051 ctxt->instate = XML_PARSER_START_TAG;
5052 ctxt->checkIndex = 0;
5053#ifdef DEBUG_PUSH
5054 xmlGenericError(xmlGenericErrorContext,
5055 "HPP: entering START_TAG\n");
5056#endif
5057 break;
5058 case XML_PARSER_SYSTEM_LITERAL:
5059 xmlGenericError(xmlGenericErrorContext,
5060 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5061 ctxt->instate = XML_PARSER_CONTENT;
5062 ctxt->checkIndex = 0;
5063#ifdef DEBUG_PUSH
5064 xmlGenericError(xmlGenericErrorContext,
5065 "HPP: entering CONTENT\n");
5066#endif
5067 break;
5068 case XML_PARSER_IGNORE:
5069 xmlGenericError(xmlGenericErrorContext,
5070 "HPP: internal error, state == XML_PARSER_IGNORE\n");
5071 ctxt->instate = XML_PARSER_CONTENT;
5072 ctxt->checkIndex = 0;
5073#ifdef DEBUG_PUSH
5074 xmlGenericError(xmlGenericErrorContext,
5075 "HPP: entering CONTENT\n");
5076#endif
5077 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005078 case XML_PARSER_PUBLIC_LITERAL:
5079 xmlGenericError(xmlGenericErrorContext,
5080 "HPP: internal error, state == XML_PARSER_LITERAL\n");
5081 ctxt->instate = XML_PARSER_CONTENT;
5082 ctxt->checkIndex = 0;
5083#ifdef DEBUG_PUSH
5084 xmlGenericError(xmlGenericErrorContext,
5085 "HPP: entering CONTENT\n");
5086#endif
5087 break;
5088
Owen Taylor3473f882001-02-23 17:55:21 +00005089 }
5090 }
5091done:
5092 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005093 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005094 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5095 /*
5096 * SAX: end of the document processing.
5097 */
5098 ctxt->instate = XML_PARSER_EOF;
5099 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5100 ctxt->sax->endDocument(ctxt->userData);
5101 }
5102 }
5103 if ((ctxt->myDoc != NULL) &&
5104 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5105 (ctxt->instate == XML_PARSER_EPILOG))) {
5106 xmlDtdPtr dtd;
5107 dtd = xmlGetIntSubset(ctxt->myDoc);
5108 if (dtd == NULL)
5109 ctxt->myDoc->intSubset =
5110 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
5111 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5112 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5113 }
5114#ifdef DEBUG_PUSH
5115 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5116#endif
5117 return(ret);
5118}
5119
5120/**
Owen Taylor3473f882001-02-23 17:55:21 +00005121 * htmlParseChunk:
5122 * @ctxt: an XML parser context
5123 * @chunk: an char array
5124 * @size: the size in byte of the chunk
5125 * @terminate: last chunk indicator
5126 *
5127 * Parse a Chunk of memory
5128 *
5129 * Returns zero if no error, the xmlParserErrors otherwise.
5130 */
5131int
5132htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5133 int terminate) {
5134 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5135 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5136 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5137 int cur = ctxt->input->cur - ctxt->input->base;
5138
5139 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5140 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5141 ctxt->input->cur = ctxt->input->base + cur;
5142#ifdef DEBUG_PUSH
5143 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5144#endif
5145
5146 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5147 htmlParseTryOrFinish(ctxt, terminate);
5148 } else if (ctxt->instate != XML_PARSER_EOF) {
5149 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
5150 htmlParseTryOrFinish(ctxt, terminate);
5151 }
5152 if (terminate) {
5153 if ((ctxt->instate != XML_PARSER_EOF) &&
5154 (ctxt->instate != XML_PARSER_EPILOG) &&
5155 (ctxt->instate != XML_PARSER_MISC)) {
5156 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005157 ctxt->wellFormed = 0;
5158 }
5159 if (ctxt->instate != XML_PARSER_EOF) {
5160 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5161 ctxt->sax->endDocument(ctxt->userData);
5162 }
5163 ctxt->instate = XML_PARSER_EOF;
5164 }
5165 return((xmlParserErrors) ctxt->errNo);
5166}
5167
5168/************************************************************************
5169 * *
5170 * User entry points *
5171 * *
5172 ************************************************************************/
5173
5174/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005175 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005176 * @sax: a SAX handler
5177 * @user_data: The user data returned on SAX callbacks
5178 * @chunk: a pointer to an array of chars
5179 * @size: number of chars in the array
5180 * @filename: an optional file name or URI
5181 * @enc: an optional encoding
5182 *
5183 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005184 * The value of @filename is used for fetching external entities
5185 * and error/warning reports.
5186 *
5187 * Returns the new parser context or NULL
5188 */
5189htmlParserCtxtPtr
5190htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5191 const char *chunk, int size, const char *filename,
5192 xmlCharEncoding enc) {
5193 htmlParserCtxtPtr ctxt;
5194 htmlParserInputPtr inputStream;
5195 xmlParserInputBufferPtr buf;
5196
Daniel Veillardd0463562001-10-13 09:15:48 +00005197 xmlInitParser();
5198
Owen Taylor3473f882001-02-23 17:55:21 +00005199 buf = xmlAllocParserInputBuffer(enc);
5200 if (buf == NULL) return(NULL);
5201
5202 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5203 if (ctxt == NULL) {
5204 xmlFree(buf);
5205 return(NULL);
5206 }
5207 memset(ctxt, 0, sizeof(htmlParserCtxt));
5208 htmlInitParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005209 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5210 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005211 if (sax != NULL) {
5212 if (ctxt->sax != &htmlDefaultSAXHandler)
5213 xmlFree(ctxt->sax);
5214 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5215 if (ctxt->sax == NULL) {
5216 xmlFree(buf);
5217 xmlFree(ctxt);
5218 return(NULL);
5219 }
5220 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5221 if (user_data != NULL)
5222 ctxt->userData = user_data;
5223 }
5224 if (filename == NULL) {
5225 ctxt->directory = NULL;
5226 } else {
5227 ctxt->directory = xmlParserGetDirectory(filename);
5228 }
5229
5230 inputStream = htmlNewInputStream(ctxt);
5231 if (inputStream == NULL) {
5232 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005233 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005234 return(NULL);
5235 }
5236
5237 if (filename == NULL)
5238 inputStream->filename = NULL;
5239 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005240 inputStream->filename = (char *)
5241 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005242 inputStream->buf = buf;
5243 inputStream->base = inputStream->buf->buffer->content;
5244 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005245 inputStream->end =
5246 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005247
5248 inputPush(ctxt, inputStream);
5249
5250 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5251 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005252 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5253 int cur = ctxt->input->cur - ctxt->input->base;
5254
Owen Taylor3473f882001-02-23 17:55:21 +00005255 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005256
5257 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5258 ctxt->input->cur = ctxt->input->base + cur;
5259 ctxt->input->end =
5260 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005261#ifdef DEBUG_PUSH
5262 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5263#endif
5264 }
5265
5266 return(ctxt);
5267}
5268
5269/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005270 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005271 * @cur: a pointer to an array of xmlChar
5272 * @encoding: a free form C string describing the HTML document encoding, or NULL
5273 * @sax: the SAX handler block
5274 * @userData: if using SAX, this pointer will be provided on callbacks.
5275 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005276 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5277 * to handle parse events. If sax is NULL, fallback to the default DOM
5278 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005279 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005280 * Returns the resulting document tree unless SAX is NULL or the document is
5281 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005282 */
5283
5284htmlDocPtr
5285htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5286 htmlDocPtr ret;
5287 htmlParserCtxtPtr ctxt;
5288
Daniel Veillardd0463562001-10-13 09:15:48 +00005289 xmlInitParser();
5290
Owen Taylor3473f882001-02-23 17:55:21 +00005291 if (cur == NULL) return(NULL);
5292
5293
5294 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5295 if (ctxt == NULL) return(NULL);
5296 if (sax != NULL) {
5297 ctxt->sax = sax;
5298 ctxt->userData = userData;
5299 }
5300
5301 htmlParseDocument(ctxt);
5302 ret = ctxt->myDoc;
5303 if (sax != NULL) {
5304 ctxt->sax = NULL;
5305 ctxt->userData = NULL;
5306 }
5307 htmlFreeParserCtxt(ctxt);
5308
5309 return(ret);
5310}
5311
5312/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005313 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005314 * @cur: a pointer to an array of xmlChar
5315 * @encoding: a free form C string describing the HTML document encoding, or NULL
5316 *
5317 * parse an HTML in-memory document and build a tree.
5318 *
5319 * Returns the resulting document tree
5320 */
5321
5322htmlDocPtr
5323htmlParseDoc(xmlChar *cur, const char *encoding) {
5324 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5325}
5326
5327
5328/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005329 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005330 * @filename: the filename
5331 * @encoding: a free form C string describing the HTML document encoding, or NULL
5332 *
5333 * Create a parser context for a file content.
5334 * Automatic support for ZLIB/Compress compressed document is provided
5335 * by default if found at compile-time.
5336 *
5337 * Returns the new parser context or NULL
5338 */
5339htmlParserCtxtPtr
5340htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5341{
5342 htmlParserCtxtPtr ctxt;
5343 htmlParserInputPtr inputStream;
5344 xmlParserInputBufferPtr buf;
5345 /* htmlCharEncoding enc; */
5346 xmlChar *content, *content_line = (xmlChar *) "charset=";
5347
5348 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
5349 if (buf == NULL) return(NULL);
5350
5351 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5352 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005353 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005354 return(NULL);
5355 }
5356 memset(ctxt, 0, sizeof(htmlParserCtxt));
5357 htmlInitParserCtxt(ctxt);
5358 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
5359 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005360 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005361 xmlFree(ctxt);
5362 return(NULL);
5363 }
5364 memset(inputStream, 0, sizeof(htmlParserInput));
5365
Daniel Veillarda646cfd2002-09-17 21:50:03 +00005366 inputStream->filename = (char *)
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +00005367 xmlCanonicPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005368 inputStream->line = 1;
5369 inputStream->col = 1;
5370 inputStream->buf = buf;
5371 inputStream->directory = NULL;
5372
5373 inputStream->base = inputStream->buf->buffer->content;
5374 inputStream->cur = inputStream->buf->buffer->content;
5375 inputStream->free = NULL;
5376
5377 inputPush(ctxt, inputStream);
5378
5379 /* set encoding */
5380 if (encoding) {
5381 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
5382 if (content) {
5383 strcpy ((char *)content, (char *)content_line);
5384 strcat ((char *)content, (char *)encoding);
5385 htmlCheckEncoding (ctxt, content);
5386 xmlFree (content);
5387 }
5388 }
5389
5390 return(ctxt);
5391}
5392
5393/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005394 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005395 * @filename: the filename
5396 * @encoding: a free form C string describing the HTML document encoding, or NULL
5397 * @sax: the SAX handler block
5398 * @userData: if using SAX, this pointer will be provided on callbacks.
5399 *
5400 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5401 * compressed document is provided by default if found at compile-time.
5402 * It use the given SAX function block to handle the parsing callback.
5403 * If sax is NULL, fallback to the default DOM tree building routines.
5404 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005405 * Returns the resulting document tree unless SAX is NULL or the document is
5406 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005407 */
5408
5409htmlDocPtr
5410htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5411 void *userData) {
5412 htmlDocPtr ret;
5413 htmlParserCtxtPtr ctxt;
5414 htmlSAXHandlerPtr oldsax = NULL;
5415
Daniel Veillardd0463562001-10-13 09:15:48 +00005416 xmlInitParser();
5417
Owen Taylor3473f882001-02-23 17:55:21 +00005418 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5419 if (ctxt == NULL) return(NULL);
5420 if (sax != NULL) {
5421 oldsax = ctxt->sax;
5422 ctxt->sax = sax;
5423 ctxt->userData = userData;
5424 }
5425
5426 htmlParseDocument(ctxt);
5427
5428 ret = ctxt->myDoc;
5429 if (sax != NULL) {
5430 ctxt->sax = oldsax;
5431 ctxt->userData = NULL;
5432 }
5433 htmlFreeParserCtxt(ctxt);
5434
5435 return(ret);
5436}
5437
5438/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005439 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005440 * @filename: the filename
5441 * @encoding: a free form C string describing the HTML document encoding, or NULL
5442 *
5443 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5444 * compressed document is provided by default if found at compile-time.
5445 *
5446 * Returns the resulting document tree
5447 */
5448
5449htmlDocPtr
5450htmlParseFile(const char *filename, const char *encoding) {
5451 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5452}
5453
5454/**
5455 * htmlHandleOmittedElem:
5456 * @val: int 0 or 1
5457 *
5458 * Set and return the previous value for handling HTML omitted tags.
5459 *
5460 * Returns the last value for 0 for no handling, 1 for auto insertion.
5461 */
5462
5463int
5464htmlHandleOmittedElem(int val) {
5465 int old = htmlOmittedDefaultValue;
5466
5467 htmlOmittedDefaultValue = val;
5468 return(old);
5469}
5470
Daniel Veillard930dfb62003-02-05 10:17:38 +00005471/**
5472 * htmlElementAllowedHere:
5473 * @parent: HTML parent element
5474 * @elt: HTML element
5475 *
5476 * Checks whether an HTML element may be a direct child of a parent element.
5477 * Note - doesn't check for deprecated elements
5478 *
5479 * Returns 1 if allowed; 0 otherwise.
5480 */
5481int
5482htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5483 const char** p ;
5484
5485 if ( ! elt || ! parent || ! parent->subelts )
5486 return 0 ;
5487
5488 for ( p = parent->subelts; *p; ++p )
5489 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5490 return 1 ;
5491
5492 return 0 ;
5493}
5494/**
5495 * htmlElementStatusHere:
5496 * @parent: HTML parent element
5497 * @elt: HTML element
5498 *
5499 * Checks whether an HTML element may be a direct child of a parent element.
5500 * and if so whether it is valid or deprecated.
5501 *
5502 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5503 */
5504htmlStatus
5505htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5506 if ( ! parent || ! elt )
5507 return HTML_INVALID ;
5508 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5509 return HTML_INVALID ;
5510
5511 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5512}
5513/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005514 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005515 * @elt: HTML element
5516 * @attr: HTML attribute
5517 * @legacy: whether to allow deprecated attributes
5518 *
5519 * Checks whether an attribute is valid for an element
5520 * Has full knowledge of Required and Deprecated attributes
5521 *
5522 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5523 */
5524htmlStatus
5525htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5526 const char** p ;
5527
5528 if ( !elt || ! attr )
5529 return HTML_INVALID ;
5530
5531 if ( elt->attrs_req )
5532 for ( p = elt->attrs_req; *p; ++p)
5533 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5534 return HTML_REQUIRED ;
5535
5536 if ( elt->attrs_opt )
5537 for ( p = elt->attrs_opt; *p; ++p)
5538 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5539 return HTML_VALID ;
5540
5541 if ( legacy && elt->attrs_depr )
5542 for ( p = elt->attrs_depr; *p; ++p)
5543 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5544 return HTML_DEPRECATED ;
5545
5546 return HTML_INVALID ;
5547}
5548/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005549 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005550 * @node: an htmlNodePtr in a tree
5551 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005552 * for Element nodes)
5553 *
5554 * Checks whether the tree node is valid. Experimental (the author
5555 * only uses the HTML enhancements in a SAX parser)
5556 *
5557 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5558 * legacy allowed) or htmlElementStatusHere (otherwise).
5559 * for Attribute nodes, a return from htmlAttrAllowed
5560 * for other nodes, HTML_NA (no checks performed)
5561 */
5562htmlStatus
5563htmlNodeStatus(const htmlNodePtr node, int legacy) {
5564 if ( ! node )
5565 return HTML_INVALID ;
5566
5567 switch ( node->type ) {
5568 case XML_ELEMENT_NODE:
5569 return legacy
5570 ? ( htmlElementAllowedHere (
5571 htmlTagLookup(node->parent->name) , node->name
5572 ) ? HTML_VALID : HTML_INVALID )
5573 : htmlElementStatusHere(
5574 htmlTagLookup(node->parent->name) ,
5575 htmlTagLookup(node->name) )
5576 ;
5577 case XML_ATTRIBUTE_NODE:
5578 return htmlAttrAllowed(
5579 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5580 default: return HTML_NA ;
5581 }
5582}
Owen Taylor3473f882001-02-23 17:55:21 +00005583#endif /* LIBXML_HTML_ENABLED */