/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
46#define HTML_MAX_NAMELEN 1000
47#define HTML_PARSER_BIG_BUFFER_SIZE 1000
48#define HTML_PARSER_BUFFER_SIZE 100
49
50/* #define DEBUG */
51/* #define DEBUG_PUSH */
52
Daniel Veillard22090732001-07-16 00:06:07 +000053static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
Daniel Veillard1c732d22002-11-30 11:22:59 +000065/**
66 * htmlnamePush:
67 * @ctxt: an HTML parser context
68 * @value: the element name
69 *
70 * Pushes a new element name on top of the name stack
71 *
72 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000073 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000074static int
75htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
76{
77 if (ctxt->nameNr >= ctxt->nameMax) {
78 ctxt->nameMax *= 2;
79 ctxt->nameTab =
80 (xmlChar * *)xmlRealloc(ctxt->nameTab,
81 ctxt->nameMax *
82 sizeof(ctxt->nameTab[0]));
83 if (ctxt->nameTab == NULL) {
84 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
85 return (0);
86 }
87 }
88 ctxt->nameTab[ctxt->nameNr] = value;
89 ctxt->name = value;
90 return (ctxt->nameNr++);
91}
92/**
93 * htmlnamePop:
94 * @ctxt: an HTML parser context
95 *
96 * Pops the top element name from the name stack
97 *
98 * Returns the name just removed
99 */
100static xmlChar *
101htmlnamePop(htmlParserCtxtPtr ctxt)
102{
103 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000104
Daniel Veillard1c732d22002-11-30 11:22:59 +0000105 if (ctxt->nameNr <= 0)
106 return (0);
107 ctxt->nameNr--;
108 if (ctxt->nameNr < 0)
109 return (0);
110 if (ctxt->nameNr > 0)
111 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
112 else
113 ctxt->name = NULL;
114 ret = ctxt->nameTab[ctxt->nameNr];
115 ctxt->nameTab[ctxt->nameNr] = 0;
116 return (ret);
117}
Owen Taylor3473f882001-02-23 17:55:21 +0000118
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt' (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value, as an int. This should be used
 *            internally by the parser only to compare to ASCII values,
 *            otherwise it would break when running with UTF-8 encoding.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   UPPER    the current xmlChar converted to uppercase.
 *   SKIP(n)  advance the input pointer and the character counter by n;
 *            must only be used to skip ASCII defined strings.
 *   RAW      the current xmlChar, or -1 while ctxt->token is non-zero.
 *
 * Clean macros, not dependent on an ASCII context:
 *
 *   CURRENT      the current char value; it yields an int.
 *   NEXT         skip to the next character through xmlNextChar() and
 *                count it.
 *   NEXTL(l)     advance by l bytes (one character), keeping the line and
 *                column counters up to date and clearing ctxt->token.
 *   CUR_CHAR(l)  the current character via htmlCurrentChar(), storing its
 *                byte length into l.
 *   COPY_BUF(l,b,i,v)  append character v (of byte length l) to buffer b
 *                at index i and advance i accordingly.
 *
 * Every expansion is fully parenthesized (or wrapped in do/while(0) for the
 * statement-like ones) so that the macros compose safely with surrounding
 * operators and with if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
195
196/**
197 * htmlCurrentChar:
198 * @ctxt: the HTML parser context
199 * @len: pointer to the length of the char read
200 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000201 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000202 * bytes in the input buffer. Implement the end of line normalization:
203 * 2.11 End-of-Line Handling
204 * If the encoding is unspecified, in the case we find an ISO-Latin-1
205 * char, then the encoding converter is plugged in automatically.
206 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000207 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000208 */
209
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000210static int
Owen Taylor3473f882001-02-23 17:55:21 +0000211htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
212 if (ctxt->instate == XML_PARSER_EOF)
213 return(0);
214
215 if (ctxt->token != 0) {
216 *len = 0;
217 return(ctxt->token);
218 }
219 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
220 /*
221 * We are supposed to handle UTF8, check it's valid
222 * From rfc2044: encoding of the Unicode values on UTF-8:
223 *
224 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
225 * 0000 0000-0000 007F 0xxxxxxx
226 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
227 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
228 *
229 * Check for the 0x110000 limit too
230 */
231 const unsigned char *cur = ctxt->input->cur;
232 unsigned char c;
233 unsigned int val;
234
235 c = *cur;
236 if (c & 0x80) {
237 if (cur[1] == 0)
238 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
239 if ((cur[1] & 0xc0) != 0x80)
240 goto encoding_error;
241 if ((c & 0xe0) == 0xe0) {
242
243 if (cur[2] == 0)
244 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
245 if ((cur[2] & 0xc0) != 0x80)
246 goto encoding_error;
247 if ((c & 0xf0) == 0xf0) {
248 if (cur[3] == 0)
249 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
250 if (((c & 0xf8) != 0xf0) ||
251 ((cur[3] & 0xc0) != 0x80))
252 goto encoding_error;
253 /* 4-byte code */
254 *len = 4;
255 val = (cur[0] & 0x7) << 18;
256 val |= (cur[1] & 0x3f) << 12;
257 val |= (cur[2] & 0x3f) << 6;
258 val |= cur[3] & 0x3f;
259 } else {
260 /* 3-byte code */
261 *len = 3;
262 val = (cur[0] & 0xf) << 12;
263 val |= (cur[1] & 0x3f) << 6;
264 val |= cur[2] & 0x3f;
265 }
266 } else {
267 /* 2-byte code */
268 *len = 2;
269 val = (cur[0] & 0x1f) << 6;
270 val |= cur[1] & 0x3f;
271 }
272 if (!IS_CHAR(val)) {
273 ctxt->errNo = XML_ERR_INVALID_ENCODING;
274 if ((ctxt->sax != NULL) &&
275 (ctxt->sax->error != NULL))
276 ctxt->sax->error(ctxt->userData,
277 "Char 0x%X out of allowed range\n", val);
278 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000279 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000280 }
281 return(val);
282 } else {
283 /* 1-byte code */
284 *len = 1;
285 return((int) *ctxt->input->cur);
286 }
287 }
288 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000289 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000290 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000291 * XML constructs only use < 128 chars
292 */
293 *len = 1;
294 if ((int) *ctxt->input->cur < 0x80)
295 return((int) *ctxt->input->cur);
296
297 /*
298 * Humm this is bad, do an automatic flow conversion
299 */
300 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
301 ctxt->charset = XML_CHAR_ENCODING_UTF8;
302 return(xmlCurrentChar(ctxt, len));
303
304encoding_error:
305 /*
306 * If we detect an UTF8 error that probably mean that the
307 * input encoding didn't get properly advertized in the
308 * declaration header. Report the error and switch the encoding
309 * to ISO-Latin-1 (if you don't like this policy, just declare the
310 * encoding !)
311 */
312 ctxt->errNo = XML_ERR_INVALID_ENCODING;
313 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
314 ctxt->sax->error(ctxt->userData,
315 "Input is not proper UTF-8, indicate encoding !\n");
316 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
317 ctxt->input->cur[0], ctxt->input->cur[1],
318 ctxt->input->cur[2], ctxt->input->cur[3]);
319 }
320
321 ctxt->charset = XML_CHAR_ENCODING_8859_1;
322 *len = 1;
323 return((int) *ctxt->input->cur);
324}
325
326/**
Owen Taylor3473f882001-02-23 17:55:21 +0000327 * htmlSkipBlankChars:
328 * @ctxt: the HTML parser context
329 *
330 * skip all blanks character found at that point in the input streams.
331 *
332 * Returns the number of space chars skipped
333 */
334
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000335static int
Owen Taylor3473f882001-02-23 17:55:21 +0000336htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
337 int res = 0;
338
339 while (IS_BLANK(*(ctxt->input->cur))) {
340 if ((*ctxt->input->cur == 0) &&
341 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
342 xmlPopInput(ctxt);
343 } else {
344 if (*(ctxt->input->cur) == '\n') {
345 ctxt->input->line++; ctxt->input->col = 1;
346 } else ctxt->input->col++;
347 ctxt->input->cur++;
348 ctxt->nbChars++;
349 if (*ctxt->input->cur == 0)
350 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
351 }
352 res++;
353 }
354 return(res);
355}
356
357
358
359/************************************************************************
360 * *
361 * The list of HTML elements and their properties *
362 * *
363 ************************************************************************/
364
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000377
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each macro below expands to a comma separated run of string literals
 * WITHOUT a trailing comma, so that macros can be chained with explicit
 * commas inside array initializers. Never place two of them (or a macro
 * and a literal) side by side without a comma: adjacent string literals
 * are concatenated by the compiler and would silently fuse two names into
 * one bogus entry. PCDATA and MODIFIER expand to nothing and are pure
 * annotations.
 *
 * Every array is a NULL terminated list of names.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
/* must be NULL terminated like every other list in this section */
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;
509#define DECL (const char**)
510
Daniel Veillard22090732001-07-16 00:06:07 +0000511static const htmlElemDesc
512html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000513{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
514 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
515},
516{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
517 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
518},
519{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
520 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
521},
522{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
523 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
524},
525{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
526 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
527},
528{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
529 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
530},
531{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
532 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
533},
534{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
535 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
536},
537{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
538 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
539},
540{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
541 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
542},
543{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
544 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
545},
546{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
547 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
548},
549{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
550 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
551},
552{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
553 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
554},
555{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
556 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
557},
558{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
559 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
560},
561{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
562 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
563},
564{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
565 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
566},
567{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
568 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
569},
570{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
571 EMPTY , NULL , DECL col_attrs , NULL, NULL
572},
573{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
574 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
575},
576{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
577 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
578},
579{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
580 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
581},
582{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
583 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
584},
585{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
586 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
587},
588{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
589 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
590},
591{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
592 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
593},
594{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
595 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
596},
597{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
598 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
599},
600{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
601 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
602},
603{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
604 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
605},
606{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
607 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
608},
609{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
610 EMPTY, NULL, NULL, DECL frame_attrs, NULL
611},
612{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
613 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
614},
615{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
616 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
617},
618{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
619 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
620},
621{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
622 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
623},
624{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
625 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
626},
627{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
628 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
629},
630{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
631 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
632},
633{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
634 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
635},
636{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
637 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
638},
639{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
640 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
641},
642{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
643 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
644},
645{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
646 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
647},
648{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
649 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
650},
651{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
652 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
653},
654{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
655 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
656},
657{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
658 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
659},
660{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
661 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
662},
663{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
664 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
665},
666{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
667 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
668},
669{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
670 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
671},
672{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
673 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
674},
675{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
676 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
677},
678{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
679 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
680},
681{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
682 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
683},
684{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
685 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
686},
687{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
688 DECL html_flow, "div", DECL html_attrs, NULL, NULL
689},
690{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
691 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
692},
693{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
694 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
695},
696{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
697 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
698},
699{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
700 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
701},
702{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
703 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
704},
705{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
706 EMPTY, NULL, DECL param_attrs, NULL, name_attr
707},
708{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
709 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
710},
711{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
712 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
713},
714{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
715 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
716},
717{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
718 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
719},
720{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
721 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
722},
723{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
724 DECL select_content, NULL, DECL select_attrs, NULL, NULL
725},
726{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
727 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
728},
729{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
730 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
731},
732{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
733 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
734},
735{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
736 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
737},
738{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
739 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
740},
741{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
742 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
743},
744{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
745 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
746},
747{ "table", 0, 0, 0, 0, 0, 0, 0, "",
748 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
749},
750{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
751 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
752},
753{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
754 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
755},
756{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
757 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
758},
759{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
760 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
761},
762{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
763 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
764},
765{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
766 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
767},
768{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
769 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
770},
771{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
772 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
773},
774{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
775 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
776},
777{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
778 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
779},
780{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
781 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
782},
783{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
784 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
785}
Owen Taylor3473f882001-02-23 17:55:21 +0000786};
787
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. The first
 * string of a group is the name of a start tag; the strings that follow
 * it, up to the NULL, are the names of open elements which that start
 * tag closes implicitly. A lone NULL after the last group ends the table.
 */
static const char *htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",          "p", "head", NULL,
"noscript",     "p", "head", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
NULL
};
847
/*
 * The list of HTML elements which are not supposed to have CDATA
 * content directly and inside which a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html",
    "head",
    "body",
    NULL
};
861
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no terminator; its length is taken with sizeof.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",          /* was misspelled "onrest" and so never matched */
    "onchange",
    "onselect"
};
887
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

/*
 * The last entry has a NULL name: it terminates the table and its
 * priority is the one used for every element not listed above it.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000915
/*
 * Index into htmlStartClose: once htmlInitAutoClose() has run, each used
 * slot points at the first string of one group of that table.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
918
919/************************************************************************
920 * *
921 * functions to handle HTML specific data *
922 * *
923 ************************************************************************/
924
925/**
926 * htmlInitAutoClose:
927 *
928 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
929 * This is not reentrant. Call xmlInitParser() once before processing in
930 * case of use in multithreaded programs.
931 */
932void
933htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000934 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000935
936 if (htmlStartCloseIndexinitialized) return;
937
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000938 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
939 indx = 0;
940 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
941 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000942 while (htmlStartClose[i] != NULL) i++;
943 i++;
944 }
945 htmlStartCloseIndexinitialized = 1;
946}
947
948/**
949 * htmlTagLookup:
950 * @tag: The tag name in lowercase
951 *
952 * Lookup the HTML tag in the ElementTable
953 *
954 * Returns the related htmlElemDescPtr or NULL if not found.
955 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000956const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000957htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000958 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000959
960 for (i = 0; i < (sizeof(html40ElementTable) /
961 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000962 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000963 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000964 }
965 return(NULL);
966}
967
968/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000969 * htmlGetEndPriority:
970 * @name: The name of the element to look up the priority for.
971 *
972 * Return value: The "endtag" priority.
973 **/
974static int
975htmlGetEndPriority (const xmlChar *name) {
976 int i = 0;
977
978 while ((htmlEndPriority[i].name != NULL) &&
979 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
980 i++;
981
982 return(htmlEndPriority[i].priority);
983}
984
985/**
Owen Taylor3473f882001-02-23 17:55:21 +0000986 * htmlCheckAutoClose:
987 * @newtag: The new tag name
988 * @oldtag: The old tag name
989 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000990 * Checks whether the new tag is one of the registered valid tags for
991 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000992 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
993 *
994 * Returns 0 if no, 1 if yes.
995 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000996static int
Owen Taylor3473f882001-02-23 17:55:21 +0000997htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000998 int i, indx;
999 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001000
1001 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
1002
1003 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001004 for (indx = 0; indx < 100;indx++) {
1005 closed = htmlStartCloseIndex[indx];
1006 if (closed == NULL) return(0);
1007 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001008 }
1009
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001010 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001011 i++;
1012 while (htmlStartClose[i] != NULL) {
1013 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1014 return(1);
1015 }
1016 i++;
1017 }
1018 return(0);
1019}
1020
1021/**
1022 * htmlAutoCloseOnClose:
1023 * @ctxt: an HTML parser context
1024 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001025 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001026 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001027 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001028 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001029static void
Owen Taylor3473f882001-02-23 17:55:21 +00001030htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +00001031 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00001032 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001033 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001034
1035#ifdef DEBUG
1036 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
1037 for (i = 0;i < ctxt->nameNr;i++)
1038 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1039#endif
1040
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001041 priority = htmlGetEndPriority (newtag);
1042
Owen Taylor3473f882001-02-23 17:55:21 +00001043 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001044
Owen Taylor3473f882001-02-23 17:55:21 +00001045 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001046 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001047 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001048 * or equal priority, so if we find an element with higher
1049 * priority before we find an element with
1050 * matching name, we just ignore this endtag
1051 */
1052 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +00001053 }
1054 if (i < 0) return;
1055
1056 while (!xmlStrEqual(newtag, ctxt->name)) {
1057 info = htmlTagLookup(ctxt->name);
1058 if ((info == NULL) || (info->endTag == 1)) {
1059#ifdef DEBUG
1060 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
1061#endif
Daniel Veillard56098d42001-04-24 12:51:09 +00001062 } else if (info->endTag == 3) {
1063#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001064 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +00001065
Daniel Veillard56098d42001-04-24 12:51:09 +00001066#endif
1067 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1068 ctxt->sax->error(ctxt->userData,
1069 "Opening and ending tag mismatch: %s and %s\n",
1070 newtag, ctxt->name);
1071 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001072 }
1073 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1074 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1075 oldname = htmlnamePop(ctxt);
1076 if (oldname != NULL) {
1077#ifdef DEBUG
1078 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
1079#endif
1080 xmlFree(oldname);
1081 }
1082 }
1083}
1084
1085/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001086 * htmlAutoCloseOnEnd:
1087 * @ctxt: an HTML parser context
1088 *
1089 * Close all remaining tags at the end of the stream
1090 */
1091static void
1092htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
1093 xmlChar *oldname;
1094 int i;
1095
1096 if (ctxt->nameNr == 0)
1097 return;
1098#ifdef DEBUG
1099 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
1100#endif
1101
1102 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1103#ifdef DEBUG
1104 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
1105#endif
1106 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1107 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1108 oldname = htmlnamePop(ctxt);
1109 if (oldname != NULL) {
1110#ifdef DEBUG
1111 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
1112#endif
1113 xmlFree(oldname);
1114 }
1115 }
1116}
1117
1118/**
Owen Taylor3473f882001-02-23 17:55:21 +00001119 * htmlAutoClose:
1120 * @ctxt: an HTML parser context
1121 * @newtag: The new tag name or NULL
1122 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001123 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001124 * The list is kept in htmlStartClose array. This function is
1125 * called when a new tag has been detected and generates the
1126 * appropriates closes if possible/needed.
1127 * If newtag is NULL this mean we are at the end of the resource
1128 * and we should check
1129 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001130static void
Owen Taylor3473f882001-02-23 17:55:21 +00001131htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1132 xmlChar *oldname;
1133 while ((newtag != NULL) && (ctxt->name != NULL) &&
1134 (htmlCheckAutoClose(newtag, ctxt->name))) {
1135#ifdef DEBUG
1136 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1137#endif
1138 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1139 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1140 oldname = htmlnamePop(ctxt);
1141 if (oldname != NULL) {
1142#ifdef DEBUG
1143 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1144#endif
1145 xmlFree(oldname);
1146 }
1147 }
1148 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001149 htmlAutoCloseOnEnd(ctxt);
1150 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001151 }
1152 while ((newtag == NULL) && (ctxt->name != NULL) &&
1153 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
1154 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
1155 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
1156#ifdef DEBUG
1157 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
1158#endif
1159 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1160 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1161 oldname = htmlnamePop(ctxt);
1162 if (oldname != NULL) {
1163#ifdef DEBUG
1164 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
1165#endif
1166 xmlFree(oldname);
1167 }
1168 }
1169
1170}
1171
1172/**
1173 * htmlAutoCloseTag:
1174 * @doc: the HTML document
1175 * @name: The tag name
1176 * @elem: the HTML element
1177 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001178 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001179 * The list is kept in htmlStartClose array. This function checks
1180 * if the element or one of it's children would autoclose the
1181 * given tag.
1182 *
1183 * Returns 1 if autoclose, 0 otherwise
1184 */
1185int
1186htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1187 htmlNodePtr child;
1188
1189 if (elem == NULL) return(1);
1190 if (xmlStrEqual(name, elem->name)) return(0);
1191 if (htmlCheckAutoClose(elem->name, name)) return(1);
1192 child = elem->children;
1193 while (child != NULL) {
1194 if (htmlAutoCloseTag(doc, name, child)) return(1);
1195 child = child->next;
1196 }
1197 return(0);
1198}
1199
1200/**
1201 * htmlIsAutoClosed:
1202 * @doc: the HTML document
1203 * @elem: the HTML element
1204 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001205 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001206 * The list is kept in htmlStartClose array. This function checks
1207 * if a tag is autoclosed by one of it's child
1208 *
1209 * Returns 1 if autoclosed, 0 otherwise
1210 */
1211int
1212htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1213 htmlNodePtr child;
1214
1215 if (elem == NULL) return(1);
1216 child = elem->children;
1217 while (child != NULL) {
1218 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1219 child = child->next;
1220 }
1221 return(0);
1222}
1223
1224/**
1225 * htmlCheckImplied:
1226 * @ctxt: an HTML parser context
1227 * @newtag: The new tag name
1228 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001229 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001230 * called when a new tag has been detected and generates the
1231 * appropriates implicit tags if missing
1232 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001233static void
Owen Taylor3473f882001-02-23 17:55:21 +00001234htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1235 if (!htmlOmittedDefaultValue)
1236 return;
1237 if (xmlStrEqual(newtag, BAD_CAST"html"))
1238 return;
1239 if (ctxt->nameNr <= 0) {
1240#ifdef DEBUG
1241 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
1242#endif
1243 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
1244 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1245 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1246 }
1247 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1248 return;
1249 if ((ctxt->nameNr <= 1) &&
1250 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1251 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1252 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1253 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1254 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1255 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1256 /*
1257 * dropped OBJECT ... i you put it first BODY will be
1258 * assumed !
1259 */
1260#ifdef DEBUG
1261 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
1262#endif
1263 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
1264 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1265 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1266 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1267 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1268 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1269 int i;
1270 for (i = 0;i < ctxt->nameNr;i++) {
1271 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1272 return;
1273 }
1274 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1275 return;
1276 }
1277 }
1278
1279#ifdef DEBUG
1280 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
1281#endif
1282 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
1283 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1284 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1285 }
1286}
1287
1288/**
1289 * htmlCheckParagraph
1290 * @ctxt: an HTML parser context
1291 *
1292 * Check whether a p element need to be implied before inserting
1293 * characters in the current element.
1294 *
1295 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1296 * in case of error.
1297 */
1298
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001299static int
Owen Taylor3473f882001-02-23 17:55:21 +00001300htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1301 const xmlChar *tag;
1302 int i;
1303
1304 if (ctxt == NULL)
1305 return(-1);
1306 tag = ctxt->name;
1307 if (tag == NULL) {
1308 htmlAutoClose(ctxt, BAD_CAST"p");
1309 htmlCheckImplied(ctxt, BAD_CAST"p");
1310 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1311 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1312 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1313 return(1);
1314 }
1315 if (!htmlOmittedDefaultValue)
1316 return(0);
1317 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1318 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1319#ifdef DEBUG
1320 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1321#endif
1322 htmlAutoClose(ctxt, BAD_CAST"p");
1323 htmlCheckImplied(ctxt, BAD_CAST"p");
1324 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1325 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1326 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1327 return(1);
1328 }
1329 }
1330 return(0);
1331}
1332
1333/**
1334 * htmlIsScriptAttribute:
1335 * @name: an attribute name
1336 *
1337 * Check if an attribute is of content type Script
1338 *
1339 * Returns 1 is the attribute is a script 0 otherwise
1340 */
1341int
1342htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001343 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001344
1345 if (name == NULL)
1346 return(0);
1347 /*
1348 * all script attributes start with 'on'
1349 */
1350 if ((name[0] != 'o') || (name[1] != 'n'))
1351 return(0);
1352 for (i = 0;
1353 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1354 i++) {
1355 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1356 return(1);
1357 }
1358 return(0);
1359}
1360
1361/************************************************************************
1362 * *
1363 * The list of HTML predefined entities *
1364 * *
1365 ************************************************************************/
1366
1367
Daniel Veillard22090732001-07-16 00:06:07 +00001368static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001369/*
1370 * the 4 absolute ones, plus apostrophe.
1371 */
1372{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1373{ 38, "amp", "ampersand, U+0026 ISOnum" },
1374{ 39, "apos", "single quote" },
1375{ 60, "lt", "less-than sign, U+003C ISOnum" },
1376{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1377
1378/*
1379 * A bunch still in the 128-255 range
1380 * Replacing them depend really on the charset used.
1381 */
1382{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1383{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1384{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1385{ 163, "pound","pound sign, U+00A3 ISOnum" },
1386{ 164, "curren","currency sign, U+00A4 ISOnum" },
1387{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1388{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1389{ 167, "sect", "section sign, U+00A7 ISOnum" },
1390{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1391{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1392{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1393{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1394{ 172, "not", "not sign, U+00AC ISOnum" },
1395{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1396{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1397{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1398{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1399{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1400{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1401{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1402{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1403{ 181, "micro","micro sign, U+00B5 ISOnum" },
1404{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1405{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1406{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1407{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1408{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1409{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1410{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1411{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1412{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1413{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1414{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1415{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1416{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1417{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1418{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1419{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1420{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1421{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1422{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1423{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1424{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1425{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1426{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1427{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1428{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1429{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1430{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1431{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1432{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1433{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1434{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1435{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1436{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1437{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1438{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1439{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1440{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1441{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1442{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1443{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1444{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1445{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1446{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1447{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1448{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1449{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1450{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1451{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1452{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1453{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1454{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1455{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1456{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1457{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1458{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1459{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1460{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1461{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1462{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1463{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1464{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1465{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1466{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1467{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1468{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1469{ 247, "divide","division sign, U+00F7 ISOnum" },
1470{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1471{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1472{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1473{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1474{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1475{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1476{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1477{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1478
1479{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1480{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1481{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1482{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1483{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1484
1485/*
1486 * Anything below should really be kept as entities references
1487 */
1488{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1489
1490{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1491{ 732, "tilde","small tilde, U+02DC ISOdia" },
1492
1493{ 913, "Alpha","greek capital letter alpha, U+0391" },
1494{ 914, "Beta", "greek capital letter beta, U+0392" },
1495{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1496{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1497{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1498{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1499{ 919, "Eta", "greek capital letter eta, U+0397" },
1500{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1501{ 921, "Iota", "greek capital letter iota, U+0399" },
1502{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001503{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001504{ 924, "Mu", "greek capital letter mu, U+039C" },
1505{ 925, "Nu", "greek capital letter nu, U+039D" },
1506{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1507{ 927, "Omicron","greek capital letter omicron, U+039F" },
1508{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1509{ 929, "Rho", "greek capital letter rho, U+03A1" },
1510{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1511{ 932, "Tau", "greek capital letter tau, U+03A4" },
1512{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1513{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1514{ 935, "Chi", "greek capital letter chi, U+03A7" },
1515{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1516{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1517
1518{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1519{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1520{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1521{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1522{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1523{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1524{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1525{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1526{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1527{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1528{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1529{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1530{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1531{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1532{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1533{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1534{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1535{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1536{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1537{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1538{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1539{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1540{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1541{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1542{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1543{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1544{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1545{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1546
1547{ 8194, "ensp", "en space, U+2002 ISOpub" },
1548{ 8195, "emsp", "em space, U+2003 ISOpub" },
1549{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1550{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1551{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1552{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1553{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1554{ 8211, "ndash","en dash, U+2013 ISOpub" },
1555{ 8212, "mdash","em dash, U+2014 ISOpub" },
1556{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1557{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1558{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1559{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1560{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1561{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1562{ 8224, "dagger","dagger, U+2020 ISOpub" },
1563{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1564
1565{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1566{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1567
1568{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1569
1570{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1571{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1572
1573{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1574{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1575
1576{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1577{ 8260, "frasl","fraction slash, U+2044 NEW" },
1578
1579{ 8364, "euro", "euro sign, U+20AC NEW" },
1580
1581{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1582{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1583{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1584{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1585{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1586{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1587{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1588{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1589{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1590{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1591{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1592{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1593{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1594{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1595{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1596{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1597
1598{ 8704, "forall","for all, U+2200 ISOtech" },
1599{ 8706, "part", "partial differential, U+2202 ISOtech" },
1600{ 8707, "exist","there exists, U+2203 ISOtech" },
1601{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1602{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1603{ 8712, "isin", "element of, U+2208 ISOtech" },
1604{ 8713, "notin","not an element of, U+2209 ISOtech" },
1605{ 8715, "ni", "contains as member, U+220B ISOtech" },
1606{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001607{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001608{ 8722, "minus","minus sign, U+2212 ISOtech" },
1609{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1610{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1611{ 8733, "prop", "proportional to, U+221D ISOtech" },
1612{ 8734, "infin","infinity, U+221E ISOtech" },
1613{ 8736, "ang", "angle, U+2220 ISOamso" },
1614{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1615{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1616{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1617{ 8746, "cup", "union = cup, U+222A ISOtech" },
1618{ 8747, "int", "integral, U+222B ISOtech" },
1619{ 8756, "there4","therefore, U+2234 ISOtech" },
1620{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1621{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1622{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1623{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1624{ 8801, "equiv","identical to, U+2261 ISOtech" },
1625{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1626{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1627{ 8834, "sub", "subset of, U+2282 ISOtech" },
1628{ 8835, "sup", "superset of, U+2283 ISOtech" },
1629{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1630{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1631{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1632{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1633{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1634{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1635{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1636{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1637{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1638{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1639{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1640{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1641{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1642{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1643
1644{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1645{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1646{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1647{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1648
1649};
1650
1651/************************************************************************
1652 * *
1653 * Commodity functions to handle entities *
1654 * *
1655 ************************************************************************/
1656
/*
 * Macro used to grow the current buffer.
 *
 * Expects an xmlChar pointer named <buffer> and an integer named
 * <buffer>_size in the enclosing scope. The size is doubled and the
 * buffer reallocated to the new size. The reallocation goes through a
 * temporary so that, on failure, the previous allocation can still be
 * released instead of being leaked. On failure the enclosing function
 * returns NULL, so the macro is only usable in functions returning a
 * pointer. Any pointer into the old buffer has to be recomputed by the
 * caller after the macro has run.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                  buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
        xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1668
1669/**
1670 * htmlEntityLookup:
1671 * @name: the entity name
1672 *
1673 * Lookup the given entity in EntitiesTable
1674 *
1675 * TODO: the linear scan is really ugly, an hash table is really needed.
1676 *
1677 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1678 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001679const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001680htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001681 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001682
1683 for (i = 0;i < (sizeof(html40EntitiesTable)/
1684 sizeof(html40EntitiesTable[0]));i++) {
1685 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1686#ifdef DEBUG
1687 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1688#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001689 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001690 }
1691 }
1692 return(NULL);
1693}
1694
1695/**
1696 * htmlEntityValueLookup:
1697 * @value: the entity's unicode value
1698 *
1699 * Lookup the given entity in EntitiesTable
1700 *
1701 * TODO: the linear scan is really ugly, an hash table is really needed.
1702 *
1703 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1704 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001705const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001706htmlEntityValueLookup(unsigned int value) {
1707 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001708#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001709 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001710#endif
1711
1712 for (i = 0;i < (sizeof(html40EntitiesTable)/
1713 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001714 if (html40EntitiesTable[i].value >= value) {
1715 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001716 break;
1717#ifdef DEBUG
1718 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1719#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001720 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001721 }
1722#ifdef DEBUG
1723 if (lv > html40EntitiesTable[i].value) {
1724 xmlGenericError(xmlGenericErrorContext,
1725 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1726 lv, html40EntitiesTable[i].value);
1727 }
1728 lv = html40EntitiesTable[i].value;
1729#endif
1730 }
1731 return(NULL);
1732}
1733
1734/**
1735 * UTF8ToHtml:
1736 * @out: a pointer to an array of bytes to store the result
1737 * @outlen: the length of @out
1738 * @in: a pointer to an array of UTF-8 chars
1739 * @inlen: the length of @in
1740 *
1741 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1742 * plus HTML entities block of chars out.
1743 *
1744 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1745 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001746 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001747 * The value of @outlen after return is the number of octets consumed.
1748 */
1749int
1750UTF8ToHtml(unsigned char* out, int *outlen,
1751 const unsigned char* in, int *inlen) {
1752 const unsigned char* processed = in;
1753 const unsigned char* outend;
1754 const unsigned char* outstart = out;
1755 const unsigned char* instart = in;
1756 const unsigned char* inend;
1757 unsigned int c, d;
1758 int trailing;
1759
1760 if (in == NULL) {
1761 /*
1762 * initialization nothing to do
1763 */
1764 *outlen = 0;
1765 *inlen = 0;
1766 return(0);
1767 }
1768 inend = in + (*inlen);
1769 outend = out + (*outlen);
1770 while (in < inend) {
1771 d = *in++;
1772 if (d < 0x80) { c= d; trailing= 0; }
1773 else if (d < 0xC0) {
1774 /* trailing byte in leading position */
1775 *outlen = out - outstart;
1776 *inlen = processed - instart;
1777 return(-2);
1778 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1779 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1780 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1781 else {
1782 /* no chance for this in Ascii */
1783 *outlen = out - outstart;
1784 *inlen = processed - instart;
1785 return(-2);
1786 }
1787
1788 if (inend - in < trailing) {
1789 break;
1790 }
1791
1792 for ( ; trailing; trailing--) {
1793 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1794 break;
1795 c <<= 6;
1796 c |= d & 0x3F;
1797 }
1798
1799 /* assertion: c is a single UTF-4 value */
1800 if (c < 0x80) {
1801 if (out + 1 >= outend)
1802 break;
1803 *out++ = c;
1804 } else {
1805 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001806 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001807
1808 /*
1809 * Try to lookup a predefined HTML entity for it
1810 */
1811
1812 ent = htmlEntityValueLookup(c);
1813 if (ent == NULL) {
1814 /* no chance for this in Ascii */
1815 *outlen = out - outstart;
1816 *inlen = processed - instart;
1817 return(-2);
1818 }
1819 len = strlen(ent->name);
1820 if (out + 2 + len >= outend)
1821 break;
1822 *out++ = '&';
1823 memcpy(out, ent->name, len);
1824 out += len;
1825 *out++ = ';';
1826 }
1827 processed = in;
1828 }
1829 *outlen = out - outstart;
1830 *inlen = processed - instart;
1831 return(0);
1832}
1833
1834/**
1835 * htmlEncodeEntities:
1836 * @out: a pointer to an array of bytes to store the result
1837 * @outlen: the length of @out
1838 * @in: a pointer to an array of UTF-8 chars
1839 * @inlen: the length of @in
1840 * @quoteChar: the quote character to escape (' or ") or zero.
1841 *
1842 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1843 * plus HTML entities block of chars out.
1844 *
1845 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1846 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001847 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001848 * The value of @outlen after return is the number of octets consumed.
1849 */
1850int
1851htmlEncodeEntities(unsigned char* out, int *outlen,
1852 const unsigned char* in, int *inlen, int quoteChar) {
1853 const unsigned char* processed = in;
1854 const unsigned char* outend = out + (*outlen);
1855 const unsigned char* outstart = out;
1856 const unsigned char* instart = in;
1857 const unsigned char* inend = in + (*inlen);
1858 unsigned int c, d;
1859 int trailing;
1860
1861 while (in < inend) {
1862 d = *in++;
1863 if (d < 0x80) { c= d; trailing= 0; }
1864 else if (d < 0xC0) {
1865 /* trailing byte in leading position */
1866 *outlen = out - outstart;
1867 *inlen = processed - instart;
1868 return(-2);
1869 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1870 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1871 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1872 else {
1873 /* no chance for this in Ascii */
1874 *outlen = out - outstart;
1875 *inlen = processed - instart;
1876 return(-2);
1877 }
1878
1879 if (inend - in < trailing)
1880 break;
1881
1882 while (trailing--) {
1883 if (((d= *in++) & 0xC0) != 0x80) {
1884 *outlen = out - outstart;
1885 *inlen = processed - instart;
1886 return(-2);
1887 }
1888 c <<= 6;
1889 c |= d & 0x3F;
1890 }
1891
1892 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001893 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1894 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001895 if (out >= outend)
1896 break;
1897 *out++ = c;
1898 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001899 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001900 const char *cp;
1901 char nbuf[16];
1902 int len;
1903
1904 /*
1905 * Try to lookup a predefined HTML entity for it
1906 */
1907 ent = htmlEntityValueLookup(c);
1908 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001909 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001910 cp = nbuf;
1911 }
1912 else
1913 cp = ent->name;
1914 len = strlen(cp);
1915 if (out + 2 + len > outend)
1916 break;
1917 *out++ = '&';
1918 memcpy(out, cp, len);
1919 out += len;
1920 *out++ = ';';
1921 }
1922 processed = in;
1923 }
1924 *outlen = out - outstart;
1925 *inlen = processed - instart;
1926 return(0);
1927}
1928
1929/**
1930 * htmlDecodeEntities:
1931 * @ctxt: the parser context
1932 * @len: the len to decode (in bytes !), -1 for no size limit
1933 * @end: an end marker xmlChar, 0 if none
1934 * @end2: an end marker xmlChar, 0 if none
1935 * @end3: an end marker xmlChar, 0 if none
1936 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001937 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001938 *
1939 * DEPRECATED !!!!
1940 *
1941 * Returns A newly allocated string with the substitution done. The caller
1942 * must deallocate it !
1943 */
1944xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001945htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1946 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001947 static int deprecated = 0;
1948 if (!deprecated) {
1949 xmlGenericError(xmlGenericErrorContext,
1950 "htmlDecodeEntities() deprecated function reached\n");
1951 deprecated = 1;
1952 }
1953 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001954}
1955
1956/************************************************************************
1957 * *
1958 * Commodity functions to handle streams *
1959 * *
1960 ************************************************************************/
1961
1962/**
Owen Taylor3473f882001-02-23 17:55:21 +00001963 * htmlNewInputStream:
1964 * @ctxt: an HTML parser context
1965 *
1966 * Create a new input stream structure
1967 * Returns the new input stream or NULL
1968 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001969static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001970htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1971 htmlParserInputPtr input;
1972
1973 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1974 if (input == NULL) {
1975 ctxt->errNo = XML_ERR_NO_MEMORY;
1976 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1977 ctxt->sax->error(ctxt->userData,
1978 "malloc: couldn't allocate a new input stream\n");
1979 return(NULL);
1980 }
1981 memset(input, 0, sizeof(htmlParserInput));
1982 input->filename = NULL;
1983 input->directory = NULL;
1984 input->base = NULL;
1985 input->cur = NULL;
1986 input->buf = NULL;
1987 input->line = 1;
1988 input->col = 1;
1989 input->buf = NULL;
1990 input->free = NULL;
1991 input->version = NULL;
1992 input->consumed = 0;
1993 input->length = 0;
1994 return(input);
1995}
1996
1997
1998/************************************************************************
1999 * *
2000 * Commodity functions, cleanup needed ? *
2001 * *
2002 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is read-only: both the strings and the array of pointers
 * are const. Entries are kept in alphabetical order.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002017
2018/**
2019 * areBlanks:
2020 * @ctxt: an HTML parser context
2021 * @str: a xmlChar *
2022 * @len: the size of @str
2023 *
2024 * Is this a sequence of blank chars that one can ignore ?
2025 *
2026 * Returns 1 if ignorable 0 otherwise.
2027 */
2028
2029static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002030 unsigned int i;
2031 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002032 xmlNodePtr lastChild;
2033
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002034 for (j = 0;j < len;j++)
2035 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002036
2037 if (CUR == 0) return(1);
2038 if (CUR != '<') return(0);
2039 if (ctxt->name == NULL)
2040 return(1);
2041 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2042 return(1);
2043 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2044 return(1);
2045 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2046 return(1);
2047 if (ctxt->node == NULL) return(0);
2048 lastChild = xmlGetLastChild(ctxt->node);
2049 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002050 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2051 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002052 /* keep ws in constructs like ...<b> </b>...
2053 for all tags "b" allowing PCDATA */
2054 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2055 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2056 return(0);
2057 }
2058 }
Owen Taylor3473f882001-02-23 17:55:21 +00002059 } else if (xmlNodeIsText(lastChild)) {
2060 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002061 } else {
2062 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2063 for all tags "p" allowing PCDATA */
2064 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2065 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2066 return(0);
2067 }
2068 }
Owen Taylor3473f882001-02-23 17:55:21 +00002069 }
2070 return(1);
2071}
2072
2073/**
Owen Taylor3473f882001-02-23 17:55:21 +00002074 * htmlNewDocNoDtD:
2075 * @URI: URI for the dtd, or NULL
2076 * @ExternalID: the external ID of the DTD, or NULL
2077 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002078 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2079 * are NULL
2080 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002081 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002082 */
2083htmlDocPtr
2084htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2085 xmlDocPtr cur;
2086
2087 /*
2088 * Allocate a new document and fill the fields.
2089 */
2090 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2091 if (cur == NULL) {
2092 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002093 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002094 return(NULL);
2095 }
2096 memset(cur, 0, sizeof(xmlDoc));
2097
2098 cur->type = XML_HTML_DOCUMENT_NODE;
2099 cur->version = NULL;
2100 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002101 cur->doc = cur;
2102 cur->name = NULL;
2103 cur->children = NULL;
2104 cur->extSubset = NULL;
2105 cur->oldNs = NULL;
2106 cur->encoding = NULL;
2107 cur->standalone = 1;
2108 cur->compression = 0;
2109 cur->ids = NULL;
2110 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002111 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002112 if ((ExternalID != NULL) ||
2113 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00002114 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002115 return(cur);
2116}
2117
2118/**
2119 * htmlNewDoc:
2120 * @URI: URI for the dtd, or NULL
2121 * @ExternalID: the external ID of the DTD, or NULL
2122 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002123 * Creates a new HTML document
2124 *
Owen Taylor3473f882001-02-23 17:55:21 +00002125 * Returns a new document
2126 */
2127htmlDocPtr
2128htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2129 if ((URI == NULL) && (ExternalID == NULL))
2130 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002131 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2132 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002133
2134 return(htmlNewDocNoDtD(URI, ExternalID));
2135}
2136
2137
2138/************************************************************************
2139 * *
2140 * The parser itself *
2141 * Relates to http://www.w3.org/TR/html40 *
2142 * *
2143 ************************************************************************/
2144
2145/************************************************************************
2146 * *
2147 * The parser itself *
2148 * *
2149 ************************************************************************/
2150
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002151static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2152
Owen Taylor3473f882001-02-23 17:55:21 +00002153/**
2154 * htmlParseHTMLName:
2155 * @ctxt: an HTML parser context
2156 *
2157 * parse an HTML tag or attribute name, note that we convert it to lowercase
2158 * since HTML names are not case-sensitive.
2159 *
2160 * Returns the Tag Name parsed or NULL
2161 */
2162
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002163static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002164htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2165 xmlChar *ret = NULL;
2166 int i = 0;
2167 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2168
2169 if (!IS_LETTER(CUR) && (CUR != '_') &&
2170 (CUR != ':')) return(NULL);
2171
2172 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2173 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2174 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2175 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2176 else loc[i] = CUR;
2177 i++;
2178
2179 NEXT;
2180 }
2181
2182 ret = xmlStrndup(loc, i);
2183
2184 return(ret);
2185}
2186
2187/**
2188 * htmlParseName:
2189 * @ctxt: an HTML parser context
2190 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002191 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002192 *
2193 * Returns the Name parsed or NULL
2194 */
2195
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002196static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002197htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002198 const xmlChar *in;
2199 xmlChar *ret;
2200 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002201
2202 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002203
2204 /*
2205 * Accelerator for simple ASCII names
2206 */
2207 in = ctxt->input->cur;
2208 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2209 ((*in >= 0x41) && (*in <= 0x5A)) ||
2210 (*in == '_') || (*in == ':')) {
2211 in++;
2212 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2213 ((*in >= 0x41) && (*in <= 0x5A)) ||
2214 ((*in >= 0x30) && (*in <= 0x39)) ||
2215 (*in == '_') || (*in == '-') ||
2216 (*in == ':') || (*in == '.'))
2217 in++;
2218 if ((*in > 0) && (*in < 0x80)) {
2219 count = in - ctxt->input->cur;
2220 ret = xmlStrndup(ctxt->input->cur, count);
2221 ctxt->input->cur = in;
2222 return(ret);
2223 }
2224 }
2225 return(htmlParseNameComplex(ctxt));
2226}
2227
2228static xmlChar *
2229htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2230 xmlChar buf[XML_MAX_NAMELEN + 5];
2231 int len = 0, l;
2232 int c;
2233 int count = 0;
2234
2235 /*
2236 * Handler for more complex cases
2237 */
2238 GROW;
2239 c = CUR_CHAR(l);
2240 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2241 (!IS_LETTER(c) && (c != '_') &&
2242 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002243 return(NULL);
2244 }
2245
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002246 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2247 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2248 (c == '.') || (c == '-') ||
2249 (c == '_') || (c == ':') ||
2250 (IS_COMBINING(c)) ||
2251 (IS_EXTENDER(c)))) {
2252 if (count++ > 100) {
2253 count = 0;
2254 GROW;
2255 }
2256 COPY_BUF(l,buf,len,c);
2257 NEXTL(l);
2258 c = CUR_CHAR(l);
2259 if (len >= XML_MAX_NAMELEN) {
2260 /*
2261 * Okay someone managed to make a huge name, so he's ready to pay
2262 * for the processing speed.
2263 */
2264 xmlChar *buffer;
2265 int max = len * 2;
2266
2267 buffer = (xmlChar *) xmlMalloc(max * sizeof(xmlChar));
2268 if (buffer == NULL) {
2269 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2270 ctxt->sax->error(ctxt->userData,
2271 "htmlParseNameComplex: out of memory\n");
2272 return(NULL);
2273 }
2274 memcpy(buffer, buf, len);
2275 while ((IS_LETTER(c)) || (IS_DIGIT(c)) || /* test bigname.xml */
2276 (c == '.') || (c == '-') ||
2277 (c == '_') || (c == ':') ||
2278 (IS_COMBINING(c)) ||
2279 (IS_EXTENDER(c))) {
2280 if (count++ > 100) {
2281 count = 0;
2282 GROW;
2283 }
2284 if (len + 10 > max) {
2285 max *= 2;
2286 buffer = (xmlChar *) xmlRealloc(buffer,
2287 max * sizeof(xmlChar));
2288 if (buffer == NULL) {
2289 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2290 ctxt->sax->error(ctxt->userData,
2291 "htmlParseNameComplex: out of memory\n");
2292 return(NULL);
2293 }
2294 }
2295 COPY_BUF(l,buffer,len,c);
2296 NEXTL(l);
2297 c = CUR_CHAR(l);
2298 }
2299 buffer[len] = 0;
2300 return(buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00002301 }
2302 }
2303 return(xmlStrndup(buf, len));
2304}
2305
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002306
Owen Taylor3473f882001-02-23 17:55:21 +00002307/**
2308 * htmlParseHTMLAttribute:
2309 * @ctxt: an HTML parser context
2310 * @stop: a char stop value
2311 *
2312 * parse an HTML attribute value till the stop (quote), if
2313 * stop is 0 then it stops at the first space
2314 *
2315 * Returns the attribute parsed or NULL
2316 */
2317
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002318static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002319htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2320 xmlChar *buffer = NULL;
2321 int buffer_size = 0;
2322 xmlChar *out = NULL;
2323 xmlChar *name = NULL;
2324
2325 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002326 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002327
2328 /*
2329 * allocate a translation buffer.
2330 */
2331 buffer_size = HTML_PARSER_BUFFER_SIZE;
2332 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2333 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002334 xmlGenericError(xmlGenericErrorContext,
2335 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002336 return(NULL);
2337 }
2338 out = buffer;
2339
2340 /*
2341 * Ok loop until we reach one of the ending chars
2342 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002343 while ((CUR != 0) && (CUR != stop)) {
2344 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002345 if ((stop == 0) && (IS_BLANK(CUR))) break;
2346 if (CUR == '&') {
2347 if (NXT(1) == '#') {
2348 unsigned int c;
2349 int bits;
2350
2351 c = htmlParseCharRef(ctxt);
2352 if (c < 0x80)
2353 { *out++ = c; bits= -6; }
2354 else if (c < 0x800)
2355 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2356 else if (c < 0x10000)
2357 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2358 else
2359 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2360
2361 for ( ; bits >= 0; bits-= 6) {
2362 *out++ = ((c >> bits) & 0x3F) | 0x80;
2363 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002364
2365 if (out - buffer > buffer_size - 100) {
2366 int indx = out - buffer;
2367
2368 growBuffer(buffer);
2369 out = &buffer[indx];
2370 }
Owen Taylor3473f882001-02-23 17:55:21 +00002371 } else {
2372 ent = htmlParseEntityRef(ctxt, &name);
2373 if (name == NULL) {
2374 *out++ = '&';
2375 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002376 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002377
2378 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002379 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002380 }
2381 } else if (ent == NULL) {
2382 *out++ = '&';
2383 cur = name;
2384 while (*cur != 0) {
2385 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002386 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002387
2388 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002389 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002390 }
2391 *out++ = *cur++;
2392 }
2393 xmlFree(name);
2394 } else {
2395 unsigned int c;
2396 int bits;
2397
2398 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002399 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002400
2401 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002402 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002403 }
2404 c = (xmlChar)ent->value;
2405 if (c < 0x80)
2406 { *out++ = c; bits= -6; }
2407 else if (c < 0x800)
2408 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2409 else if (c < 0x10000)
2410 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2411 else
2412 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2413
2414 for ( ; bits >= 0; bits-= 6) {
2415 *out++ = ((c >> bits) & 0x3F) | 0x80;
2416 }
2417 xmlFree(name);
2418 }
2419 }
2420 } else {
2421 unsigned int c;
2422 int bits, l;
2423
2424 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002425 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002426
2427 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002428 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002429 }
2430 c = CUR_CHAR(l);
2431 if (c < 0x80)
2432 { *out++ = c; bits= -6; }
2433 else if (c < 0x800)
2434 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2435 else if (c < 0x10000)
2436 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2437 else
2438 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2439
2440 for ( ; bits >= 0; bits-= 6) {
2441 *out++ = ((c >> bits) & 0x3F) | 0x80;
2442 }
2443 NEXT;
2444 }
2445 }
2446 *out++ = 0;
2447 return(buffer);
2448}
2449
2450/**
Owen Taylor3473f882001-02-23 17:55:21 +00002451 * htmlParseEntityRef:
2452 * @ctxt: an HTML parser context
2453 * @str: location to store the entity name
2454 *
2455 * parse an HTML ENTITY references
2456 *
2457 * [68] EntityRef ::= '&' Name ';'
2458 *
2459 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2460 * if non-NULL *str will have to be freed by the caller.
2461 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002462const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002463htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2464 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002465 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002466 *str = NULL;
2467
2468 if (CUR == '&') {
2469 NEXT;
2470 name = htmlParseName(ctxt);
2471 if (name == NULL) {
2472 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2473 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2474 ctxt->wellFormed = 0;
2475 } else {
2476 GROW;
2477 if (CUR == ';') {
2478 *str = name;
2479
2480 /*
2481 * Lookup the entity in the table.
2482 */
2483 ent = htmlEntityLookup(name);
2484 if (ent != NULL) /* OK that's ugly !!! */
2485 NEXT;
2486 } else {
2487 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2488 ctxt->sax->error(ctxt->userData,
2489 "htmlParseEntityRef: expecting ';'\n");
2490 *str = name;
2491 }
2492 }
2493 }
2494 return(ent);
2495}
2496
2497/**
2498 * htmlParseAttValue:
2499 * @ctxt: an HTML parser context
2500 *
2501 * parse a value for an attribute
2502 * Note: the parser won't do substitution of entities here, this
2503 * will be handled later in xmlStringGetNodeList, unless it was
2504 * asked for ctxt->replaceEntities != 0
2505 *
2506 * Returns the AttValue parsed or NULL.
2507 */
2508
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002509static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002510htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2511 xmlChar *ret = NULL;
2512
2513 if (CUR == '"') {
2514 NEXT;
2515 ret = htmlParseHTMLAttribute(ctxt, '"');
2516 if (CUR != '"') {
2517 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2518 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2519 ctxt->wellFormed = 0;
2520 } else
2521 NEXT;
2522 } else if (CUR == '\'') {
2523 NEXT;
2524 ret = htmlParseHTMLAttribute(ctxt, '\'');
2525 if (CUR != '\'') {
2526 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2527 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2528 ctxt->wellFormed = 0;
2529 } else
2530 NEXT;
2531 } else {
2532 /*
2533 * That's an HTMLism, the attribute value may not be quoted
2534 */
2535 ret = htmlParseHTMLAttribute(ctxt, 0);
2536 if (ret == NULL) {
2537 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2538 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2539 ctxt->wellFormed = 0;
2540 }
2541 }
2542 return(ret);
2543}
2544
2545/**
2546 * htmlParseSystemLiteral:
2547 * @ctxt: an HTML parser context
2548 *
2549 * parse an HTML Literal
2550 *
2551 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2552 *
2553 * Returns the SystemLiteral parsed or NULL
2554 */
2555
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002556static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002557htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2558 const xmlChar *q;
2559 xmlChar *ret = NULL;
2560
2561 if (CUR == '"') {
2562 NEXT;
2563 q = CUR_PTR;
2564 while ((IS_CHAR(CUR)) && (CUR != '"'))
2565 NEXT;
2566 if (!IS_CHAR(CUR)) {
2567 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2568 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2569 ctxt->wellFormed = 0;
2570 } else {
2571 ret = xmlStrndup(q, CUR_PTR - q);
2572 NEXT;
2573 }
2574 } else if (CUR == '\'') {
2575 NEXT;
2576 q = CUR_PTR;
2577 while ((IS_CHAR(CUR)) && (CUR != '\''))
2578 NEXT;
2579 if (!IS_CHAR(CUR)) {
2580 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2581 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2582 ctxt->wellFormed = 0;
2583 } else {
2584 ret = xmlStrndup(q, CUR_PTR - q);
2585 NEXT;
2586 }
2587 } else {
2588 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2589 ctxt->sax->error(ctxt->userData,
2590 "SystemLiteral \" or ' expected\n");
2591 ctxt->wellFormed = 0;
2592 }
2593
2594 return(ret);
2595}
2596
2597/**
2598 * htmlParsePubidLiteral:
2599 * @ctxt: an HTML parser context
2600 *
2601 * parse an HTML public literal
2602 *
2603 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2604 *
2605 * Returns the PubidLiteral parsed or NULL.
2606 */
2607
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002608static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002609htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2610 const xmlChar *q;
2611 xmlChar *ret = NULL;
2612 /*
2613 * Name ::= (Letter | '_') (NameChar)*
2614 */
2615 if (CUR == '"') {
2616 NEXT;
2617 q = CUR_PTR;
2618 while (IS_PUBIDCHAR(CUR)) NEXT;
2619 if (CUR != '"') {
2620 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2621 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2622 ctxt->wellFormed = 0;
2623 } else {
2624 ret = xmlStrndup(q, CUR_PTR - q);
2625 NEXT;
2626 }
2627 } else if (CUR == '\'') {
2628 NEXT;
2629 q = CUR_PTR;
2630 while ((IS_LETTER(CUR)) && (CUR != '\''))
2631 NEXT;
2632 if (!IS_LETTER(CUR)) {
2633 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2634 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2635 ctxt->wellFormed = 0;
2636 } else {
2637 ret = xmlStrndup(q, CUR_PTR - q);
2638 NEXT;
2639 }
2640 } else {
2641 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2642 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2643 ctxt->wellFormed = 0;
2644 }
2645
2646 return(ret);
2647}
2648
2649/**
2650 * htmlParseScript:
2651 * @ctxt: an HTML parser context
2652 *
2653 * parse the content of an HTML SCRIPT or STYLE element
2654 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2655 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2656 * http://www.w3.org/TR/html4/types.html#type-script
2657 * http://www.w3.org/TR/html4/types.html#h-6.15
2658 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2659 *
2660 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2661 * element and the value of intrinsic event attributes. User agents must
2662 * not evaluate script data as HTML markup but instead must pass it on as
2663 * data to a script engine.
2664 * NOTES:
2665 * - The content is passed like CDATA
2666 * - the attributes for style and scripting "onXXX" are also described
2667 * as CDATA but SGML allows entities references in attributes so their
2668 * processing is identical as other attributes
2669 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002670static void
Owen Taylor3473f882001-02-23 17:55:21 +00002671htmlParseScript(htmlParserCtxtPtr ctxt) {
2672 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2673 int nbchar = 0;
2674 xmlChar cur;
2675
2676 SHRINK;
2677 cur = CUR;
2678 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002679 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2680 (NXT(3) == '-')) {
2681 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2682 if (ctxt->sax->cdataBlock!= NULL) {
2683 /*
2684 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2685 */
2686 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2687 }
2688 }
2689 nbchar = 0;
2690 htmlParseComment(ctxt);
2691 cur = CUR;
2692 continue;
2693 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002694 /*
2695 * One should break here, the specification is clear:
2696 * Authors should therefore escape "</" within the content.
2697 * Escape mechanisms are specific to each scripting or
2698 * style sheet language.
2699 */
2700 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2701 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2702 break; /* while */
2703 }
2704 buf[nbchar++] = cur;
2705 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2706 if (ctxt->sax->cdataBlock!= NULL) {
2707 /*
2708 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2709 */
2710 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2711 }
2712 nbchar = 0;
2713 }
2714 NEXT;
2715 cur = CUR;
2716 }
2717 if (!(IS_CHAR(cur))) {
2718 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2719 ctxt->sax->error(ctxt->userData,
2720 "Invalid char in CDATA 0x%X\n", cur);
2721 ctxt->wellFormed = 0;
2722 NEXT;
2723 }
2724
2725 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2726 if (ctxt->sax->cdataBlock!= NULL) {
2727 /*
2728 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2729 */
2730 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2731 }
2732 }
2733}
2734
2735
2736/**
2737 * htmlParseCharData:
2738 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002739 *
2740 * parse a CharData section.
2741 * if we are within a CDATA section ']]>' marks an end of section.
2742 *
2743 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2744 */
2745
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002746static void
2747htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002748 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2749 int nbchar = 0;
2750 int cur, l;
2751
2752 SHRINK;
2753 cur = CUR_CHAR(l);
2754 while (((cur != '<') || (ctxt->token == '<')) &&
2755 ((cur != '&') || (ctxt->token == '&')) &&
2756 (IS_CHAR(cur))) {
2757 COPY_BUF(l,buf,nbchar,cur);
2758 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2759 /*
2760 * Ok the segment is to be consumed as chars.
2761 */
2762 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2763 if (areBlanks(ctxt, buf, nbchar)) {
2764 if (ctxt->sax->ignorableWhitespace != NULL)
2765 ctxt->sax->ignorableWhitespace(ctxt->userData,
2766 buf, nbchar);
2767 } else {
2768 htmlCheckParagraph(ctxt);
2769 if (ctxt->sax->characters != NULL)
2770 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2771 }
2772 }
2773 nbchar = 0;
2774 }
2775 NEXTL(l);
2776 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002777 if (cur == 0) {
2778 SHRINK;
2779 GROW;
2780 cur = CUR_CHAR(l);
2781 }
Owen Taylor3473f882001-02-23 17:55:21 +00002782 }
2783 if (nbchar != 0) {
2784 /*
2785 * Ok the segment is to be consumed as chars.
2786 */
2787 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2788 if (areBlanks(ctxt, buf, nbchar)) {
2789 if (ctxt->sax->ignorableWhitespace != NULL)
2790 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2791 } else {
2792 htmlCheckParagraph(ctxt);
2793 if (ctxt->sax->characters != NULL)
2794 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2795 }
2796 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002797 } else {
2798 /*
2799 * Loop detection
2800 */
2801 if (cur == 0)
2802 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002803 }
2804}
2805
2806/**
2807 * htmlParseExternalID:
2808 * @ctxt: an HTML parser context
2809 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002810 *
2811 * Parse an External ID or a Public ID
2812 *
Owen Taylor3473f882001-02-23 17:55:21 +00002813 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2814 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2815 *
2816 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2817 *
2818 * Returns the function returns SystemLiteral and in the second
2819 * case publicID receives PubidLiteral, is strict is off
2820 * it is possible to return NULL and have publicID set.
2821 */
2822
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002823static xmlChar *
2824htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002825 xmlChar *URI = NULL;
2826
2827 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2828 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2829 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2830 SKIP(6);
2831 if (!IS_BLANK(CUR)) {
2832 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2833 ctxt->sax->error(ctxt->userData,
2834 "Space required after 'SYSTEM'\n");
2835 ctxt->wellFormed = 0;
2836 }
2837 SKIP_BLANKS;
2838 URI = htmlParseSystemLiteral(ctxt);
2839 if (URI == NULL) {
2840 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2841 ctxt->sax->error(ctxt->userData,
2842 "htmlParseExternalID: SYSTEM, no URI\n");
2843 ctxt->wellFormed = 0;
2844 }
2845 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2846 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2847 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2848 SKIP(6);
2849 if (!IS_BLANK(CUR)) {
2850 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2851 ctxt->sax->error(ctxt->userData,
2852 "Space required after 'PUBLIC'\n");
2853 ctxt->wellFormed = 0;
2854 }
2855 SKIP_BLANKS;
2856 *publicID = htmlParsePubidLiteral(ctxt);
2857 if (*publicID == NULL) {
2858 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2859 ctxt->sax->error(ctxt->userData,
2860 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2861 ctxt->wellFormed = 0;
2862 }
2863 SKIP_BLANKS;
2864 if ((CUR == '"') || (CUR == '\'')) {
2865 URI = htmlParseSystemLiteral(ctxt);
2866 }
2867 }
2868 return(URI);
2869}
2870
2871/**
2872 * htmlParseComment:
2873 * @ctxt: an HTML parser context
2874 *
2875 * Parse an XML (SGML) comment <!-- .... -->
2876 *
2877 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2878 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002879static void
Owen Taylor3473f882001-02-23 17:55:21 +00002880htmlParseComment(htmlParserCtxtPtr ctxt) {
2881 xmlChar *buf = NULL;
2882 int len;
2883 int size = HTML_PARSER_BUFFER_SIZE;
2884 int q, ql;
2885 int r, rl;
2886 int cur, l;
2887 xmlParserInputState state;
2888
2889 /*
2890 * Check that there is a comment right here.
2891 */
2892 if ((RAW != '<') || (NXT(1) != '!') ||
2893 (NXT(2) != '-') || (NXT(3) != '-')) return;
2894
2895 state = ctxt->instate;
2896 ctxt->instate = XML_PARSER_COMMENT;
2897 SHRINK;
2898 SKIP(4);
2899 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2900 if (buf == NULL) {
2901 xmlGenericError(xmlGenericErrorContext,
2902 "malloc of %d byte failed\n", size);
2903 ctxt->instate = state;
2904 return;
2905 }
2906 q = CUR_CHAR(ql);
2907 NEXTL(ql);
2908 r = CUR_CHAR(rl);
2909 NEXTL(rl);
2910 cur = CUR_CHAR(l);
2911 len = 0;
2912 while (IS_CHAR(cur) &&
2913 ((cur != '>') ||
2914 (r != '-') || (q != '-'))) {
2915 if (len + 5 >= size) {
2916 size *= 2;
2917 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2918 if (buf == NULL) {
2919 xmlGenericError(xmlGenericErrorContext,
2920 "realloc of %d byte failed\n", size);
2921 ctxt->instate = state;
2922 return;
2923 }
2924 }
2925 COPY_BUF(ql,buf,len,q);
2926 q = r;
2927 ql = rl;
2928 r = cur;
2929 rl = l;
2930 NEXTL(l);
2931 cur = CUR_CHAR(l);
2932 if (cur == 0) {
2933 SHRINK;
2934 GROW;
2935 cur = CUR_CHAR(l);
2936 }
2937 }
2938 buf[len] = 0;
2939 if (!IS_CHAR(cur)) {
2940 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2941 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2942 ctxt->sax->error(ctxt->userData,
2943 "Comment not terminated \n<!--%.50s\n", buf);
2944 ctxt->wellFormed = 0;
2945 xmlFree(buf);
2946 } else {
2947 NEXT;
2948 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2949 (!ctxt->disableSAX))
2950 ctxt->sax->comment(ctxt->userData, buf);
2951 xmlFree(buf);
2952 }
2953 ctxt->instate = state;
2954}
2955
2956/**
2957 * htmlParseCharRef:
2958 * @ctxt: an HTML parser context
2959 *
2960 * parse Reference declarations
2961 *
2962 * [66] CharRef ::= '&#' [0-9]+ ';' |
2963 * '&#x' [0-9a-fA-F]+ ';'
2964 *
2965 * Returns the value parsed (as an int)
2966 */
2967int
2968htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2969 int val = 0;
2970
2971 if ((CUR == '&') && (NXT(1) == '#') &&
2972 (NXT(2) == 'x')) {
2973 SKIP(3);
2974 while (CUR != ';') {
2975 if ((CUR >= '0') && (CUR <= '9'))
2976 val = val * 16 + (CUR - '0');
2977 else if ((CUR >= 'a') && (CUR <= 'f'))
2978 val = val * 16 + (CUR - 'a') + 10;
2979 else if ((CUR >= 'A') && (CUR <= 'F'))
2980 val = val * 16 + (CUR - 'A') + 10;
2981 else {
2982 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2983 ctxt->sax->error(ctxt->userData,
2984 "htmlParseCharRef: invalid hexadecimal value\n");
2985 ctxt->wellFormed = 0;
2986 return(0);
2987 }
2988 NEXT;
2989 }
2990 if (CUR == ';')
2991 NEXT;
2992 } else if ((CUR == '&') && (NXT(1) == '#')) {
2993 SKIP(2);
2994 while (CUR != ';') {
2995 if ((CUR >= '0') && (CUR <= '9'))
2996 val = val * 10 + (CUR - '0');
2997 else {
2998 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2999 ctxt->sax->error(ctxt->userData,
3000 "htmlParseCharRef: invalid decimal value\n");
3001 ctxt->wellFormed = 0;
3002 return(0);
3003 }
3004 NEXT;
3005 }
3006 if (CUR == ';')
3007 NEXT;
3008 } else {
3009 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3010 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
3011 ctxt->wellFormed = 0;
3012 }
3013 /*
3014 * Check the value IS_CHAR ...
3015 */
3016 if (IS_CHAR(val)) {
3017 return(val);
3018 } else {
3019 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3020 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
3021 val);
3022 ctxt->wellFormed = 0;
3023 }
3024 return(0);
3025}
3026
3027
3028/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003029 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003030 * @ctxt: an HTML parser context
3031 *
3032 * parse a DOCTYPE declaration
3033 *
3034 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3035 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3036 */
3037
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003038static void
Owen Taylor3473f882001-02-23 17:55:21 +00003039htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3040 xmlChar *name;
3041 xmlChar *ExternalID = NULL;
3042 xmlChar *URI = NULL;
3043
3044 /*
3045 * We know that '<!DOCTYPE' has been detected.
3046 */
3047 SKIP(9);
3048
3049 SKIP_BLANKS;
3050
3051 /*
3052 * Parse the DOCTYPE name.
3053 */
3054 name = htmlParseName(ctxt);
3055 if (name == NULL) {
3056 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3057 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
3058 ctxt->wellFormed = 0;
3059 }
3060 /*
3061 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3062 */
3063
3064 SKIP_BLANKS;
3065
3066 /*
3067 * Check for SystemID and ExternalID
3068 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003069 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003070 SKIP_BLANKS;
3071
3072 /*
3073 * We should be at the end of the DOCTYPE declaration.
3074 */
3075 if (CUR != '>') {
3076 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00003077 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003078 ctxt->wellFormed = 0;
3079 /* We shouldn't try to resynchronize ... */
3080 }
3081 NEXT;
3082
3083 /*
3084 * Create or update the document accordingly to the DOCTYPE
3085 */
3086 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3087 (!ctxt->disableSAX))
3088 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3089
3090 /*
3091 * Cleanup, since we don't use all those identifiers
3092 */
3093 if (URI != NULL) xmlFree(URI);
3094 if (ExternalID != NULL) xmlFree(ExternalID);
3095 if (name != NULL) xmlFree(name);
3096}
3097
3098/**
3099 * htmlParseAttribute:
3100 * @ctxt: an HTML parser context
3101 * @value: a xmlChar ** used to store the value of the attribute
3102 *
3103 * parse an attribute
3104 *
3105 * [41] Attribute ::= Name Eq AttValue
3106 *
3107 * [25] Eq ::= S? '=' S?
3108 *
3109 * With namespace:
3110 *
3111 * [NS 11] Attribute ::= QName Eq AttValue
3112 *
3113 * Also the case QName == xmlns:??? is handled independently as a namespace
3114 * definition.
3115 *
3116 * Returns the attribute name, and the value in *value.
3117 */
3118
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003119static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003120htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3121 xmlChar *name, *val = NULL;
3122
3123 *value = NULL;
3124 name = htmlParseHTMLName(ctxt);
3125 if (name == NULL) {
3126 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3127 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3128 ctxt->wellFormed = 0;
3129 return(NULL);
3130 }
3131
3132 /*
3133 * read the value
3134 */
3135 SKIP_BLANKS;
3136 if (CUR == '=') {
3137 NEXT;
3138 SKIP_BLANKS;
3139 val = htmlParseAttValue(ctxt);
3140 /******
3141 } else {
3142 * TODO : some attribute must have values, some may not
3143 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3144 ctxt->sax->warning(ctxt->userData,
3145 "No value for attribute %s\n", name); */
3146 }
3147
3148 *value = val;
3149 return(name);
3150}
3151
/**
 * htmlCheckEncoding:
 * @ctxt:  an HTML parser context
 * @attvalue: the attribute value
 *
 * Checks an http-equiv attribute from a Meta tag to detect
 * the encoding
 * If a new encoding is detected the parser is switched to decode
 * it and pass UTF8
 *
 * Nothing is done when @ctxt or @attvalue is NULL, when the current
 * input already has an encoding recorded, or when @attvalue holds
 * neither "charset=" nor "charset =" (searched case-insensitively).
 */
static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
    const xmlChar *encoding;

    if ((ctxt == NULL) || (attvalue == NULL))
        return;

    /* do not change encoding */
    if (ctxt->input->encoding != NULL)
        return;

    /* locate the charset parameter and step over "charset=" (8 bytes) */
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
    if (encoding != NULL) {
        encoding += 8;
    } else {
        /* tolerate one space before the '=' ("charset =", 9 bytes) */
        encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
        if (encoding != NULL)
            encoding += 9;
    }
    if (encoding != NULL) {
        xmlCharEncoding enc;
        xmlCharEncodingHandlerPtr handler;

        /* skip blanks between the '=' and the encoding name */
        while ((*encoding == ' ') || (*encoding == '\t')) encoding++;

        /*
         * NOTE(review): input->encoding was checked to be NULL on entry,
         * so this release looks unreachable - confirm before removing.
         */
        if (ctxt->input->encoding != NULL)
            xmlFree((xmlChar *) ctxt->input->encoding);
        /* record the remainder of the attribute value as encoding name */
        ctxt->input->encoding = xmlStrdup(encoding);

        enc = xmlParseCharEncoding((const char *) encoding);
        /*
         * registered set of known encodings
         */
        if (enc != XML_CHAR_ENCODING_ERROR) {
            xmlSwitchEncoding(ctxt, enc);
            ctxt->charset = XML_CHAR_ENCODING_UTF8;
        } else {
            /*
             * fallback for unknown encodings
             */
            handler = xmlFindCharEncodingHandler((const char *) encoding);
            if (handler != NULL) {
                xmlSwitchToEncoding(ctxt, handler);
                ctxt->charset = XML_CHAR_ENCODING_UTF8;
            } else {
                /* no handler available: only the error code is recorded */
                ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
            }
        }

        /* only when an encoder and a raw input buffer are in place */
        if ((ctxt->input->buf != NULL) &&
            (ctxt->input->buf->encoder != NULL) &&
            (ctxt->input->buf->raw != NULL) &&
            (ctxt->input->buf->buffer != NULL)) {
            int nbchars;
            int processed;

            /*
             * convert as much as possible to the parser reading buffer.
             */
            /* drop what lies before the cursor, then convert raw input */
            processed = ctxt->input->cur - ctxt->input->base;
            xmlBufferShrink(ctxt->input->buf->buffer, processed);
            nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
                                       ctxt->input->buf->buffer,
                                       ctxt->input->buf->raw);
            if (nbchars < 0) {
                ctxt->errNo = XML_ERR_INVALID_ENCODING;
                if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                    ctxt->sax->error(ctxt->userData,
                                     "htmlCheckEncoding: encoder error\n");
            }
            /* restart reading from the beginning of the buffer content */
            ctxt->input->base =
            ctxt->input->cur = ctxt->input->buf->buffer->content;
        }
    }
}
3237
3238/**
3239 * htmlCheckMeta:
3240 * @ctxt: an HTML parser context
3241 * @atts: the attributes values
3242 *
3243 * Checks an attributes from a Meta tag
3244 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003245static void
Owen Taylor3473f882001-02-23 17:55:21 +00003246htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3247 int i;
3248 const xmlChar *att, *value;
3249 int http = 0;
3250 const xmlChar *content = NULL;
3251
3252 if ((ctxt == NULL) || (atts == NULL))
3253 return;
3254
3255 i = 0;
3256 att = atts[i++];
3257 while (att != NULL) {
3258 value = atts[i++];
3259 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3260 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3261 http = 1;
3262 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3263 content = value;
3264 att = atts[i++];
3265 }
3266 if ((http) && (content != NULL))
3267 htmlCheckEncoding(ctxt, content);
3268
3269}
3270
3271/**
3272 * htmlParseStartTag:
3273 * @ctxt: an HTML parser context
3274 *
3275 * parse a start of tag either for rule element or
3276 * EmptyElement. In both case we don't parse the tag closing chars.
3277 *
3278 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3279 *
3280 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3281 *
3282 * With namespace:
3283 *
3284 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3285 *
3286 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3287 *
3288 */
3289
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003290static void
Owen Taylor3473f882001-02-23 17:55:21 +00003291htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3292 xmlChar *name;
3293 xmlChar *attname;
3294 xmlChar *attvalue;
3295 const xmlChar **atts = NULL;
3296 int nbatts = 0;
3297 int maxatts = 0;
3298 int meta = 0;
3299 int i;
3300
3301 if (CUR != '<') return;
3302 NEXT;
3303
3304 GROW;
3305 name = htmlParseHTMLName(ctxt);
3306 if (name == NULL) {
3307 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3308 ctxt->sax->error(ctxt->userData,
3309 "htmlParseStartTag: invalid element name\n");
3310 ctxt->wellFormed = 0;
3311 /* Dump the bogus tag like browsers do */
3312 while ((IS_CHAR(CUR)) && (CUR != '>'))
3313 NEXT;
3314 return;
3315 }
3316 if (xmlStrEqual(name, BAD_CAST"meta"))
3317 meta = 1;
3318
3319 /*
3320 * Check for auto-closure of HTML elements.
3321 */
3322 htmlAutoClose(ctxt, name);
3323
3324 /*
3325 * Check for implied HTML elements.
3326 */
3327 htmlCheckImplied(ctxt, name);
3328
3329 /*
3330 * Avoid html at any level > 0, head at any level != 1
3331 * or any attempt to recurse body
3332 */
3333 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3334 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3335 ctxt->sax->error(ctxt->userData,
3336 "htmlParseStartTag: misplaced <html> tag\n");
3337 ctxt->wellFormed = 0;
3338 xmlFree(name);
3339 return;
3340 }
3341 if ((ctxt->nameNr != 1) &&
3342 (xmlStrEqual(name, BAD_CAST"head"))) {
3343 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3344 ctxt->sax->error(ctxt->userData,
3345 "htmlParseStartTag: misplaced <head> tag\n");
3346 ctxt->wellFormed = 0;
3347 xmlFree(name);
3348 return;
3349 }
3350 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003351 int indx;
3352 for (indx = 0;indx < ctxt->nameNr;indx++) {
3353 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003354 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3355 ctxt->sax->error(ctxt->userData,
3356 "htmlParseStartTag: misplaced <body> tag\n");
3357 ctxt->wellFormed = 0;
3358 xmlFree(name);
3359 return;
3360 }
3361 }
3362 }
3363
3364 /*
3365 * Now parse the attributes, it ends up with the ending
3366 *
3367 * (S Attribute)* S?
3368 */
3369 SKIP_BLANKS;
3370 while ((IS_CHAR(CUR)) &&
3371 (CUR != '>') &&
3372 ((CUR != '/') || (NXT(1) != '>'))) {
3373 long cons = ctxt->nbChars;
3374
3375 GROW;
3376 attname = htmlParseAttribute(ctxt, &attvalue);
3377 if (attname != NULL) {
3378
3379 /*
3380 * Well formedness requires at most one declaration of an attribute
3381 */
3382 for (i = 0; i < nbatts;i += 2) {
3383 if (xmlStrEqual(atts[i], attname)) {
3384 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3385 ctxt->sax->error(ctxt->userData,
3386 "Attribute %s redefined\n",
3387 attname);
3388 ctxt->wellFormed = 0;
3389 xmlFree(attname);
3390 if (attvalue != NULL)
3391 xmlFree(attvalue);
3392 goto failed;
3393 }
3394 }
3395
3396 /*
3397 * Add the pair to atts
3398 */
3399 if (atts == NULL) {
3400 maxatts = 10;
3401 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3402 if (atts == NULL) {
3403 xmlGenericError(xmlGenericErrorContext,
3404 "malloc of %ld byte failed\n",
3405 maxatts * (long)sizeof(xmlChar *));
3406 if (name != NULL) xmlFree(name);
3407 return;
3408 }
3409 } else if (nbatts + 4 > maxatts) {
3410 maxatts *= 2;
3411 atts = (const xmlChar **) xmlRealloc((void *) atts,
3412 maxatts * sizeof(xmlChar *));
3413 if (atts == NULL) {
3414 xmlGenericError(xmlGenericErrorContext,
3415 "realloc of %ld byte failed\n",
3416 maxatts * (long)sizeof(xmlChar *));
3417 if (name != NULL) xmlFree(name);
3418 return;
3419 }
3420 }
3421 atts[nbatts++] = attname;
3422 atts[nbatts++] = attvalue;
3423 atts[nbatts] = NULL;
3424 atts[nbatts + 1] = NULL;
3425 }
3426 else {
3427 /* Dump the bogus attribute string up to the next blank or
3428 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003429 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3430 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003431 NEXT;
3432 }
3433
3434failed:
3435 SKIP_BLANKS;
3436 if (cons == ctxt->nbChars) {
3437 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3438 ctxt->sax->error(ctxt->userData,
3439 "htmlParseStartTag: problem parsing attributes\n");
3440 ctxt->wellFormed = 0;
3441 break;
3442 }
3443 }
3444
3445 /*
3446 * Handle specific association to the META tag
3447 */
3448 if (meta)
3449 htmlCheckMeta(ctxt, atts);
3450
3451 /*
3452 * SAX: Start of Element !
3453 */
3454 htmlnamePush(ctxt, xmlStrdup(name));
3455#ifdef DEBUG
3456 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3457#endif
3458 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3459 ctxt->sax->startElement(ctxt->userData, name, atts);
3460
3461 if (atts != NULL) {
3462 for (i = 0;i < nbatts;i++) {
3463 if (atts[i] != NULL)
3464 xmlFree((xmlChar *) atts[i]);
3465 }
3466 xmlFree((void *) atts);
3467 }
3468 if (name != NULL) xmlFree(name);
3469}
3470
3471/**
3472 * htmlParseEndTag:
3473 * @ctxt: an HTML parser context
3474 *
3475 * parse an end of tag
3476 *
3477 * [42] ETag ::= '</' Name S? '>'
3478 *
3479 * With namespace
3480 *
3481 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003482 *
3483 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003484 */
3485
Daniel Veillardf420ac52001-07-04 16:04:09 +00003486static int
Owen Taylor3473f882001-02-23 17:55:21 +00003487htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3488 xmlChar *name;
3489 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003490 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003491
3492 if ((CUR != '<') || (NXT(1) != '/')) {
3493 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3494 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3495 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003496 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003497 }
3498 SKIP(2);
3499
3500 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003501 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003502
3503 /*
3504 * We should definitely be at the ending "S? '>'" part
3505 */
3506 SKIP_BLANKS;
3507 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3508 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3509 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3510 ctxt->wellFormed = 0;
3511 } else
3512 NEXT;
3513
3514 /*
3515 * If the name read is not one of the element in the parsing stack
3516 * then return, it's just an error.
3517 */
3518 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3519 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3520 }
3521 if (i < 0) {
3522 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3523 ctxt->sax->error(ctxt->userData,
3524 "Unexpected end tag : %s\n", name);
3525 xmlFree(name);
3526 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003527 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003528 }
3529
3530
3531 /*
3532 * Check for auto-closure of HTML elements.
3533 */
3534
3535 htmlAutoCloseOnClose(ctxt, name);
3536
3537 /*
3538 * Well formedness constraints, opening and closing must match.
3539 * With the exception that the autoclose may have popped stuff out
3540 * of the stack.
3541 */
3542 if (!xmlStrEqual(name, ctxt->name)) {
3543#ifdef DEBUG
3544 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3545#endif
3546 if ((ctxt->name != NULL) &&
3547 (!xmlStrEqual(ctxt->name, name))) {
3548 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3549 ctxt->sax->error(ctxt->userData,
3550 "Opening and ending tag mismatch: %s and %s\n",
3551 name, ctxt->name);
3552 ctxt->wellFormed = 0;
3553 }
3554 }
3555
3556 /*
3557 * SAX: End of Tag
3558 */
3559 oldname = ctxt->name;
3560 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3561 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3562 ctxt->sax->endElement(ctxt->userData, name);
3563 oldname = htmlnamePop(ctxt);
3564 if (oldname != NULL) {
3565#ifdef DEBUG
3566 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3567#endif
3568 xmlFree(oldname);
3569#ifdef DEBUG
3570 } else {
3571 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3572#endif
3573 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003574 ret = 1;
3575 } else {
3576 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003577 }
3578
3579 if (name != NULL)
3580 xmlFree(name);
3581
Daniel Veillardf420ac52001-07-04 16:04:09 +00003582 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003583}
3584
3585
3586/**
3587 * htmlParseReference:
3588 * @ctxt: an HTML parser context
3589 *
3590 * parse and handle entity references in content,
3591 * this will end-up in a call to character() since this is either a
3592 * CharRef, or a predefined entity.
3593 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003594static void
Owen Taylor3473f882001-02-23 17:55:21 +00003595htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003596 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003597 xmlChar out[6];
3598 xmlChar *name;
3599 if (CUR != '&') return;
3600
3601 if (NXT(1) == '#') {
3602 unsigned int c;
3603 int bits, i = 0;
3604
3605 c = htmlParseCharRef(ctxt);
3606 if (c == 0)
3607 return;
3608
3609 if (c < 0x80) { out[i++]= c; bits= -6; }
3610 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3611 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3612 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3613
3614 for ( ; bits >= 0; bits-= 6) {
3615 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3616 }
3617 out[i] = 0;
3618
3619 htmlCheckParagraph(ctxt);
3620 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3621 ctxt->sax->characters(ctxt->userData, out, i);
3622 } else {
3623 ent = htmlParseEntityRef(ctxt, &name);
3624 if (name == NULL) {
3625 htmlCheckParagraph(ctxt);
3626 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3627 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3628 return;
3629 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003630 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003631 htmlCheckParagraph(ctxt);
3632 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3633 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3634 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3635 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3636 }
3637 } else {
3638 unsigned int c;
3639 int bits, i = 0;
3640
3641 c = ent->value;
3642 if (c < 0x80)
3643 { out[i++]= c; bits= -6; }
3644 else if (c < 0x800)
3645 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3646 else if (c < 0x10000)
3647 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3648 else
3649 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3650
3651 for ( ; bits >= 0; bits-= 6) {
3652 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3653 }
3654 out[i] = 0;
3655
3656 htmlCheckParagraph(ctxt);
3657 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3658 ctxt->sax->characters(ctxt->userData, out, i);
3659 }
3660 xmlFree(name);
3661 }
3662}
3663
3664/**
3665 * htmlParseContent:
3666 * @ctxt: an HTML parser context
3667 * @name: the node name
3668 *
3669 * Parse a content: comment, sub-element, reference or text.
3670 *
3671 */
3672
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003673static void
Owen Taylor3473f882001-02-23 17:55:21 +00003674htmlParseContent(htmlParserCtxtPtr ctxt) {
3675 xmlChar *currentNode;
3676 int depth;
3677
3678 currentNode = xmlStrdup(ctxt->name);
3679 depth = ctxt->nameNr;
3680 while (1) {
3681 long cons = ctxt->nbChars;
3682
3683 GROW;
3684 /*
3685 * Our tag or one of it's parent or children is ending.
3686 */
3687 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003688 if (htmlParseEndTag(ctxt) &&
3689 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3690 if (currentNode != NULL)
3691 xmlFree(currentNode);
3692 return;
3693 }
3694 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003695 }
3696
3697 /*
3698 * Has this node been popped out during parsing of
3699 * the next element
3700 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003701 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3702 (!xmlStrEqual(currentNode, ctxt->name)))
3703 {
Owen Taylor3473f882001-02-23 17:55:21 +00003704 if (currentNode != NULL) xmlFree(currentNode);
3705 return;
3706 }
3707
Daniel Veillardf9533d12001-03-03 10:04:57 +00003708 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3709 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003710 /*
3711 * Handle SCRIPT/STYLE separately
3712 */
3713 htmlParseScript(ctxt);
3714 } else {
3715 /*
3716 * Sometimes DOCTYPE arrives in the middle of the document
3717 */
3718 if ((CUR == '<') && (NXT(1) == '!') &&
3719 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3720 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3721 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3722 (UPP(8) == 'E')) {
3723 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3724 ctxt->sax->error(ctxt->userData,
3725 "Misplaced DOCTYPE declaration\n");
3726 ctxt->wellFormed = 0;
3727 htmlParseDocTypeDecl(ctxt);
3728 }
3729
3730 /*
3731 * First case : a comment
3732 */
3733 if ((CUR == '<') && (NXT(1) == '!') &&
3734 (NXT(2) == '-') && (NXT(3) == '-')) {
3735 htmlParseComment(ctxt);
3736 }
3737
3738 /*
3739 * Second case : a sub-element.
3740 */
3741 else if (CUR == '<') {
3742 htmlParseElement(ctxt);
3743 }
3744
3745 /*
3746 * Third case : a reference. If if has not been resolved,
3747 * parsing returns it's Name, create the node
3748 */
3749 else if (CUR == '&') {
3750 htmlParseReference(ctxt);
3751 }
3752
3753 /*
3754 * Fourth : end of the resource
3755 */
3756 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003757 htmlAutoCloseOnEnd(ctxt);
3758 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003759 }
3760
3761 /*
3762 * Last case, text. Note that References are handled directly.
3763 */
3764 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003765 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003766 }
3767
3768 if (cons == ctxt->nbChars) {
3769 if (ctxt->node != NULL) {
3770 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3771 ctxt->sax->error(ctxt->userData,
3772 "detected an error in element content\n");
3773 ctxt->wellFormed = 0;
3774 }
3775 break;
3776 }
3777 }
3778 GROW;
3779 }
3780 if (currentNode != NULL) xmlFree(currentNode);
3781}
3782
3783/**
3784 * htmlParseElement:
3785 * @ctxt: an HTML parser context
3786 *
3787 * parse an HTML element, this is highly recursive
3788 *
3789 * [39] element ::= EmptyElemTag | STag content ETag
3790 *
3791 * [41] Attribute ::= Name Eq AttValue
3792 */
3793
3794void
3795htmlParseElement(htmlParserCtxtPtr ctxt) {
3796 xmlChar *name;
3797 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003798 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003799 htmlParserNodeInfo node_info;
3800 xmlChar *oldname;
3801 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003802 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003803
3804 /* Capture start position */
3805 if (ctxt->record_info) {
3806 node_info.begin_pos = ctxt->input->consumed +
3807 (CUR_PTR - ctxt->input->base);
3808 node_info.begin_line = ctxt->input->line;
3809 }
3810
3811 oldname = xmlStrdup(ctxt->name);
3812 htmlParseStartTag(ctxt);
3813 name = ctxt->name;
3814#ifdef DEBUG
3815 if (oldname == NULL)
3816 xmlGenericError(xmlGenericErrorContext,
3817 "Start of element %s\n", name);
3818 else if (name == NULL)
3819 xmlGenericError(xmlGenericErrorContext,
3820 "Start of element failed, was %s\n", oldname);
3821 else
3822 xmlGenericError(xmlGenericErrorContext,
3823 "Start of element %s, was %s\n", name, oldname);
3824#endif
3825 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3826 (name == NULL)) {
3827 if (CUR == '>')
3828 NEXT;
3829 if (oldname != NULL)
3830 xmlFree(oldname);
3831 return;
3832 }
3833 if (oldname != NULL)
3834 xmlFree(oldname);
3835
3836 /*
3837 * Lookup the info for that element.
3838 */
3839 info = htmlTagLookup(name);
3840 if (info == NULL) {
3841 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3842 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3843 name);
3844 ctxt->wellFormed = 0;
3845 } else if (info->depr) {
3846/***************************
3847 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3848 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3849 name);
3850 ***************************/
3851 }
3852
3853 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003854 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003855 */
3856 if ((CUR == '/') && (NXT(1) == '>')) {
3857 SKIP(2);
3858 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3859 ctxt->sax->endElement(ctxt->userData, name);
3860 oldname = htmlnamePop(ctxt);
3861#ifdef DEBUG
3862 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3863#endif
3864 if (oldname != NULL)
3865 xmlFree(oldname);
3866 return;
3867 }
3868
3869 if (CUR == '>') {
3870 NEXT;
3871 } else {
3872 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3873 ctxt->sax->error(ctxt->userData,
3874 "Couldn't find end of Start Tag %s\n",
3875 name);
3876 ctxt->wellFormed = 0;
3877
3878 /*
3879 * end of parsing of this node.
3880 */
3881 if (xmlStrEqual(name, ctxt->name)) {
3882 nodePop(ctxt);
3883 oldname = htmlnamePop(ctxt);
3884#ifdef DEBUG
3885 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3886#endif
3887 if (oldname != NULL)
3888 xmlFree(oldname);
3889 }
3890
3891 /*
3892 * Capture end position and add node
3893 */
3894 if ( currentNode != NULL && ctxt->record_info ) {
3895 node_info.end_pos = ctxt->input->consumed +
3896 (CUR_PTR - ctxt->input->base);
3897 node_info.end_line = ctxt->input->line;
3898 node_info.node = ctxt->node;
3899 xmlParserAddNodeInfo(ctxt, &node_info);
3900 }
3901 return;
3902 }
3903
3904 /*
3905 * Check for an Empty Element from DTD definition
3906 */
3907 if ((info != NULL) && (info->empty)) {
3908 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3909 ctxt->sax->endElement(ctxt->userData, name);
3910 oldname = htmlnamePop(ctxt);
3911#ifdef DEBUG
3912 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3913#endif
3914 if (oldname != NULL)
3915 xmlFree(oldname);
3916 return;
3917 }
3918
3919 /*
3920 * Parse the content of the element:
3921 */
3922 currentNode = xmlStrdup(ctxt->name);
3923 depth = ctxt->nameNr;
3924 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003925 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003926 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003927 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003928 if (ctxt->nameNr < depth) break;
3929 }
3930
Owen Taylor3473f882001-02-23 17:55:21 +00003931 /*
3932 * Capture end position and add node
3933 */
3934 if ( currentNode != NULL && ctxt->record_info ) {
3935 node_info.end_pos = ctxt->input->consumed +
3936 (CUR_PTR - ctxt->input->base);
3937 node_info.end_line = ctxt->input->line;
3938 node_info.node = ctxt->node;
3939 xmlParserAddNodeInfo(ctxt, &node_info);
3940 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003941 if (!IS_CHAR(CUR)) {
3942 htmlAutoCloseOnEnd(ctxt);
3943 }
3944
Owen Taylor3473f882001-02-23 17:55:21 +00003945 if (currentNode != NULL)
3946 xmlFree(currentNode);
3947}
3948
3949/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003950 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003951 * @ctxt: an HTML parser context
3952 *
3953 * parse an HTML document (and build a tree if using the standard SAX
3954 * interface).
3955 *
3956 * Returns 0, -1 in case of error. the parser context is augmented
3957 * as a result of the parsing.
3958 */
3959
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003960int
Owen Taylor3473f882001-02-23 17:55:21 +00003961htmlParseDocument(htmlParserCtxtPtr ctxt) {
3962 xmlDtdPtr dtd;
3963
Daniel Veillardd0463562001-10-13 09:15:48 +00003964 xmlInitParser();
3965
Owen Taylor3473f882001-02-23 17:55:21 +00003966 htmlDefaultSAXHandlerInit();
3967 ctxt->html = 1;
3968
3969 GROW;
3970 /*
3971 * SAX: beginning of the document processing.
3972 */
3973 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3974 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3975
3976 /*
3977 * Wipe out everything which is before the first '<'
3978 */
3979 SKIP_BLANKS;
3980 if (CUR == 0) {
3981 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3982 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3983 ctxt->wellFormed = 0;
3984 }
3985
3986 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3987 ctxt->sax->startDocument(ctxt->userData);
3988
3989
3990 /*
3991 * Parse possible comments before any content
3992 */
3993 while ((CUR == '<') && (NXT(1) == '!') &&
3994 (NXT(2) == '-') && (NXT(3) == '-')) {
3995 htmlParseComment(ctxt);
3996 SKIP_BLANKS;
3997 }
3998
3999
4000 /*
4001 * Then possibly doc type declaration(s) and more Misc
4002 * (doctypedecl Misc*)?
4003 */
4004 if ((CUR == '<') && (NXT(1) == '!') &&
4005 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4006 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4007 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4008 (UPP(8) == 'E')) {
4009 htmlParseDocTypeDecl(ctxt);
4010 }
4011 SKIP_BLANKS;
4012
4013 /*
4014 * Parse possible comments before any content
4015 */
4016 while ((CUR == '<') && (NXT(1) == '!') &&
4017 (NXT(2) == '-') && (NXT(3) == '-')) {
4018 htmlParseComment(ctxt);
4019 SKIP_BLANKS;
4020 }
4021
4022 /*
4023 * Time to start parsing the tree itself
4024 */
4025 htmlParseContent(ctxt);
4026
4027 /*
4028 * autoclose
4029 */
4030 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004031 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004032
4033
4034 /*
4035 * SAX: end of the document processing.
4036 */
4037 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4038 ctxt->sax->endDocument(ctxt->userData);
4039
4040 if (ctxt->myDoc != NULL) {
4041 dtd = xmlGetIntSubset(ctxt->myDoc);
4042 if (dtd == NULL)
4043 ctxt->myDoc->intSubset =
4044 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4045 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4046 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4047 }
4048 if (! ctxt->wellFormed) return(-1);
4049 return(0);
4050}
4051
4052
4053/************************************************************************
4054 * *
4055 * Parser contexts handling *
4056 * *
4057 ************************************************************************/
4058
4059/**
4060 * xmlInitParserCtxt:
4061 * @ctxt: an HTML parser context
4062 *
4063 * Initialize a parser context
4064 */
4065
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004066static void
Owen Taylor3473f882001-02-23 17:55:21 +00004067htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4068{
4069 htmlSAXHandler *sax;
4070
4071 if (ctxt == NULL) return;
4072 memset(ctxt, 0, sizeof(htmlParserCtxt));
4073
4074 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4075 if (sax == NULL) {
4076 xmlGenericError(xmlGenericErrorContext,
4077 "htmlInitParserCtxt: out of memory\n");
4078 }
4079 else
4080 memset(sax, 0, sizeof(htmlSAXHandler));
4081
4082 /* Allocate the Input stack */
4083 ctxt->inputTab = (htmlParserInputPtr *)
4084 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4085 if (ctxt->inputTab == NULL) {
4086 xmlGenericError(xmlGenericErrorContext,
4087 "htmlInitParserCtxt: out of memory\n");
4088 ctxt->inputNr = 0;
4089 ctxt->inputMax = 0;
4090 ctxt->input = NULL;
4091 return;
4092 }
4093 ctxt->inputNr = 0;
4094 ctxt->inputMax = 5;
4095 ctxt->input = NULL;
4096 ctxt->version = NULL;
4097 ctxt->encoding = NULL;
4098 ctxt->standalone = -1;
4099 ctxt->instate = XML_PARSER_START;
4100
4101 /* Allocate the Node stack */
4102 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4103 if (ctxt->nodeTab == NULL) {
4104 xmlGenericError(xmlGenericErrorContext,
4105 "htmlInitParserCtxt: out of memory\n");
4106 ctxt->nodeNr = 0;
4107 ctxt->nodeMax = 0;
4108 ctxt->node = NULL;
4109 ctxt->inputNr = 0;
4110 ctxt->inputMax = 0;
4111 ctxt->input = NULL;
4112 return;
4113 }
4114 ctxt->nodeNr = 0;
4115 ctxt->nodeMax = 10;
4116 ctxt->node = NULL;
4117
4118 /* Allocate the Name stack */
4119 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4120 if (ctxt->nameTab == NULL) {
4121 xmlGenericError(xmlGenericErrorContext,
4122 "htmlInitParserCtxt: out of memory\n");
4123 ctxt->nameNr = 0;
4124 ctxt->nameMax = 10;
4125 ctxt->name = NULL;
4126 ctxt->nodeNr = 0;
4127 ctxt->nodeMax = 0;
4128 ctxt->node = NULL;
4129 ctxt->inputNr = 0;
4130 ctxt->inputMax = 0;
4131 ctxt->input = NULL;
4132 return;
4133 }
4134 ctxt->nameNr = 0;
4135 ctxt->nameMax = 10;
4136 ctxt->name = NULL;
4137
4138 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
4139 else {
4140 ctxt->sax = sax;
4141 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
4142 }
4143 ctxt->userData = ctxt;
4144 ctxt->myDoc = NULL;
4145 ctxt->wellFormed = 1;
4146 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004147 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004148 ctxt->html = 1;
4149 ctxt->record_info = 0;
4150 ctxt->validate = 0;
4151 ctxt->nbChars = 0;
4152 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004153 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004154 xmlInitNodeInfoSeq(&ctxt->node_seq);
4155}
4156
4157/**
4158 * htmlFreeParserCtxt:
4159 * @ctxt: an HTML parser context
4160 *
4161 * Free all the memory used by a parser context. However the parsed
4162 * document in ctxt->myDoc is not freed.
4163 */
4164
4165void
4166htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4167{
4168 xmlFreeParserCtxt(ctxt);
4169}
4170
4171/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004172 * htmlNewParserCtxt:
4173 *
4174 * Allocate and initialize a new parser context.
4175 *
4176 * Returns the xmlParserCtxtPtr or NULL
4177 */
4178
4179static htmlParserCtxtPtr
4180htmlNewParserCtxt(void)
4181{
4182 xmlParserCtxtPtr ctxt;
4183
4184 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4185 if (ctxt == NULL) {
4186 xmlGenericError(xmlGenericErrorContext,
4187 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004188 return(NULL);
4189 }
4190 memset(ctxt, 0, sizeof(xmlParserCtxt));
4191 htmlInitParserCtxt(ctxt);
4192 return(ctxt);
4193}
4194
4195/**
4196 * htmlCreateMemoryParserCtxt:
4197 * @buffer: a pointer to a char array
4198 * @size: the size of the array
4199 *
4200 * Create a parser context for an HTML in-memory document.
4201 *
4202 * Returns the new parser context or NULL
4203 */
4204static htmlParserCtxtPtr
4205htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4206 xmlParserCtxtPtr ctxt;
4207 xmlParserInputPtr input;
4208 xmlParserInputBufferPtr buf;
4209
4210 if (buffer == NULL)
4211 return(NULL);
4212 if (size <= 0)
4213 return(NULL);
4214
4215 ctxt = htmlNewParserCtxt();
4216 if (ctxt == NULL)
4217 return(NULL);
4218
4219 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4220 if (buf == NULL) return(NULL);
4221
4222 input = xmlNewInputStream(ctxt);
4223 if (input == NULL) {
4224 xmlFreeParserCtxt(ctxt);
4225 return(NULL);
4226 }
4227
4228 input->filename = NULL;
4229 input->buf = buf;
4230 input->base = input->buf->buffer->content;
4231 input->cur = input->buf->buffer->content;
4232 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4233
4234 inputPush(ctxt, input);
4235 return(ctxt);
4236}
4237
4238/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004239 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004240 * @cur: a pointer to an array of xmlChar
4241 * @encoding: a free form C string describing the HTML document encoding, or NULL
4242 *
4243 * Create a parser context for an HTML document.
4244 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004245 * TODO: check the need to add encoding handling there
4246 *
Owen Taylor3473f882001-02-23 17:55:21 +00004247 * Returns the new parser context or NULL
4248 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004249static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004250htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004251 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004252 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004253
Daniel Veillard1d995272002-07-22 16:43:32 +00004254 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004255 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004256 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004257 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4258
4259 if (encoding != NULL) {
4260 xmlCharEncoding enc;
4261 xmlCharEncodingHandlerPtr handler;
4262
4263 if (ctxt->input->encoding != NULL)
4264 xmlFree((xmlChar *) ctxt->input->encoding);
4265 ctxt->input->encoding = (const xmlChar *) encoding;
4266
4267 enc = xmlParseCharEncoding(encoding);
4268 /*
4269 * registered set of known encodings
4270 */
4271 if (enc != XML_CHAR_ENCODING_ERROR) {
4272 xmlSwitchEncoding(ctxt, enc);
4273 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4274 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4275 ctxt->sax->error(ctxt->userData,
4276 "Unsupported encoding %s\n", encoding);
4277 ctxt->input->encoding = NULL;
4278 }
4279 } else {
4280 /*
4281 * fallback for unknown encodings
4282 */
4283 handler = xmlFindCharEncodingHandler((const char *) encoding);
4284 if (handler != NULL) {
4285 xmlSwitchToEncoding(ctxt, handler);
4286 } else {
4287 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
4288 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4289 ctxt->sax->error(ctxt->userData,
4290 "Unsupported encoding %s\n", encoding);
4291 }
4292 }
4293 }
4294 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004295}
4296
4297/************************************************************************
4298 * *
4299 * Progressive parsing interfaces *
4300 * *
4301 ************************************************************************/
4302
4303/**
4304 * htmlParseLookupSequence:
4305 * @ctxt: an HTML parser context
4306 * @first: the first char to lookup
4307 * @next: the next char to lookup or zero
4308 * @third: the next char to lookup or zero
4309 *
4310 * Try to find if a sequence (first, next, third) or just (first next) or
4311 * (first) is available in the input stream.
4312 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4313 * to avoid rescanning sequences of bytes, it DOES change the state of the
4314 * parser, do not use liberally.
4315 * This is basically similar to xmlParseLookupSequence()
4316 *
4317 * Returns the index to the current parsing point if the full sequence
4318 * is available, -1 otherwise.
4319 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004320static int
Owen Taylor3473f882001-02-23 17:55:21 +00004321htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4322 xmlChar next, xmlChar third) {
4323 int base, len;
4324 htmlParserInputPtr in;
4325 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004326 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004327
4328 in = ctxt->input;
4329 if (in == NULL) return(-1);
4330 base = in->cur - in->base;
4331 if (base < 0) return(-1);
4332 if (ctxt->checkIndex > base)
4333 base = ctxt->checkIndex;
4334 if (in->buf == NULL) {
4335 buf = in->base;
4336 len = in->length;
4337 } else {
4338 buf = in->buf->buffer->content;
4339 len = in->buf->buffer->use;
4340 }
4341 /* take into account the sequence length */
4342 if (third) len -= 2;
4343 else if (next) len --;
4344 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004345 if (!incomment && (base + 4 < len)) {
4346 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4347 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4348 incomment = 1;
4349 }
4350 /* do not increment base, some people use <!--> */
4351 }
4352 if (incomment) {
4353 if (base + 3 < len)
4354 return(-1);
4355 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4356 (buf[base + 2] == '>')) {
4357 incomment = 0;
4358 base += 2;
4359 }
4360 continue;
4361 }
Owen Taylor3473f882001-02-23 17:55:21 +00004362 if (buf[base] == first) {
4363 if (third != 0) {
4364 if ((buf[base + 1] != next) ||
4365 (buf[base + 2] != third)) continue;
4366 } else if (next != 0) {
4367 if (buf[base + 1] != next) continue;
4368 }
4369 ctxt->checkIndex = 0;
4370#ifdef DEBUG_PUSH
4371 if (next == 0)
4372 xmlGenericError(xmlGenericErrorContext,
4373 "HPP: lookup '%c' found at %d\n",
4374 first, base);
4375 else if (third == 0)
4376 xmlGenericError(xmlGenericErrorContext,
4377 "HPP: lookup '%c%c' found at %d\n",
4378 first, next, base);
4379 else
4380 xmlGenericError(xmlGenericErrorContext,
4381 "HPP: lookup '%c%c%c' found at %d\n",
4382 first, next, third, base);
4383#endif
4384 return(base - (in->cur - in->base));
4385 }
4386 }
4387 ctxt->checkIndex = base;
4388#ifdef DEBUG_PUSH
4389 if (next == 0)
4390 xmlGenericError(xmlGenericErrorContext,
4391 "HPP: lookup '%c' failed\n", first);
4392 else if (third == 0)
4393 xmlGenericError(xmlGenericErrorContext,
4394 "HPP: lookup '%c%c' failed\n", first, next);
4395 else
4396 xmlGenericError(xmlGenericErrorContext,
4397 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4398#endif
4399 return(-1);
4400}
4401
4402/**
4403 * htmlParseTryOrFinish:
4404 * @ctxt: an HTML parser context
4405 * @terminate: last chunk indicator
4406 *
4407 * Try to progress on parsing
4408 *
4409 * Returns zero if no parsing was possible
4410 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004411static int
Owen Taylor3473f882001-02-23 17:55:21 +00004412htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4413 int ret = 0;
4414 htmlParserInputPtr in;
4415 int avail = 0;
4416 xmlChar cur, next;
4417
4418#ifdef DEBUG_PUSH
4419 switch (ctxt->instate) {
4420 case XML_PARSER_EOF:
4421 xmlGenericError(xmlGenericErrorContext,
4422 "HPP: try EOF\n"); break;
4423 case XML_PARSER_START:
4424 xmlGenericError(xmlGenericErrorContext,
4425 "HPP: try START\n"); break;
4426 case XML_PARSER_MISC:
4427 xmlGenericError(xmlGenericErrorContext,
4428 "HPP: try MISC\n");break;
4429 case XML_PARSER_COMMENT:
4430 xmlGenericError(xmlGenericErrorContext,
4431 "HPP: try COMMENT\n");break;
4432 case XML_PARSER_PROLOG:
4433 xmlGenericError(xmlGenericErrorContext,
4434 "HPP: try PROLOG\n");break;
4435 case XML_PARSER_START_TAG:
4436 xmlGenericError(xmlGenericErrorContext,
4437 "HPP: try START_TAG\n");break;
4438 case XML_PARSER_CONTENT:
4439 xmlGenericError(xmlGenericErrorContext,
4440 "HPP: try CONTENT\n");break;
4441 case XML_PARSER_CDATA_SECTION:
4442 xmlGenericError(xmlGenericErrorContext,
4443 "HPP: try CDATA_SECTION\n");break;
4444 case XML_PARSER_END_TAG:
4445 xmlGenericError(xmlGenericErrorContext,
4446 "HPP: try END_TAG\n");break;
4447 case XML_PARSER_ENTITY_DECL:
4448 xmlGenericError(xmlGenericErrorContext,
4449 "HPP: try ENTITY_DECL\n");break;
4450 case XML_PARSER_ENTITY_VALUE:
4451 xmlGenericError(xmlGenericErrorContext,
4452 "HPP: try ENTITY_VALUE\n");break;
4453 case XML_PARSER_ATTRIBUTE_VALUE:
4454 xmlGenericError(xmlGenericErrorContext,
4455 "HPP: try ATTRIBUTE_VALUE\n");break;
4456 case XML_PARSER_DTD:
4457 xmlGenericError(xmlGenericErrorContext,
4458 "HPP: try DTD\n");break;
4459 case XML_PARSER_EPILOG:
4460 xmlGenericError(xmlGenericErrorContext,
4461 "HPP: try EPILOG\n");break;
4462 case XML_PARSER_PI:
4463 xmlGenericError(xmlGenericErrorContext,
4464 "HPP: try PI\n");break;
4465 case XML_PARSER_SYSTEM_LITERAL:
4466 xmlGenericError(xmlGenericErrorContext,
4467 "HPP: try SYSTEM_LITERAL\n");break;
4468 }
4469#endif
4470
4471 while (1) {
4472
4473 in = ctxt->input;
4474 if (in == NULL) break;
4475 if (in->buf == NULL)
4476 avail = in->length - (in->cur - in->base);
4477 else
4478 avail = in->buf->buffer->use - (in->cur - in->base);
4479 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004480 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004481 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4482 /*
4483 * SAX: end of the document processing.
4484 */
4485 ctxt->instate = XML_PARSER_EOF;
4486 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4487 ctxt->sax->endDocument(ctxt->userData);
4488 }
4489 }
4490 if (avail < 1)
4491 goto done;
4492 switch (ctxt->instate) {
4493 case XML_PARSER_EOF:
4494 /*
4495 * Document parsing is done !
4496 */
4497 goto done;
4498 case XML_PARSER_START:
4499 /*
4500 * Very first chars read from the document flow.
4501 */
4502 cur = in->cur[0];
4503 if (IS_BLANK(cur)) {
4504 SKIP_BLANKS;
4505 if (in->buf == NULL)
4506 avail = in->length - (in->cur - in->base);
4507 else
4508 avail = in->buf->buffer->use - (in->cur - in->base);
4509 }
4510 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4511 ctxt->sax->setDocumentLocator(ctxt->userData,
4512 &xmlDefaultSAXLocator);
4513 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4514 (!ctxt->disableSAX))
4515 ctxt->sax->startDocument(ctxt->userData);
4516
4517 cur = in->cur[0];
4518 next = in->cur[1];
4519 if ((cur == '<') && (next == '!') &&
4520 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4521 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4522 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4523 (UPP(8) == 'E')) {
4524 if ((!terminate) &&
4525 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4526 goto done;
4527#ifdef DEBUG_PUSH
4528 xmlGenericError(xmlGenericErrorContext,
4529 "HPP: Parsing internal subset\n");
4530#endif
4531 htmlParseDocTypeDecl(ctxt);
4532 ctxt->instate = XML_PARSER_PROLOG;
4533#ifdef DEBUG_PUSH
4534 xmlGenericError(xmlGenericErrorContext,
4535 "HPP: entering PROLOG\n");
4536#endif
4537 } else {
4538 ctxt->instate = XML_PARSER_MISC;
4539 }
4540#ifdef DEBUG_PUSH
4541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: entering MISC\n");
4543#endif
4544 break;
4545 case XML_PARSER_MISC:
4546 SKIP_BLANKS;
4547 if (in->buf == NULL)
4548 avail = in->length - (in->cur - in->base);
4549 else
4550 avail = in->buf->buffer->use - (in->cur - in->base);
4551 if (avail < 2)
4552 goto done;
4553 cur = in->cur[0];
4554 next = in->cur[1];
4555 if ((cur == '<') && (next == '!') &&
4556 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4557 if ((!terminate) &&
4558 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4559 goto done;
4560#ifdef DEBUG_PUSH
4561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: Parsing Comment\n");
4563#endif
4564 htmlParseComment(ctxt);
4565 ctxt->instate = XML_PARSER_MISC;
4566 } else if ((cur == '<') && (next == '!') &&
4567 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4568 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4569 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4570 (UPP(8) == 'E')) {
4571 if ((!terminate) &&
4572 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4573 goto done;
4574#ifdef DEBUG_PUSH
4575 xmlGenericError(xmlGenericErrorContext,
4576 "HPP: Parsing internal subset\n");
4577#endif
4578 htmlParseDocTypeDecl(ctxt);
4579 ctxt->instate = XML_PARSER_PROLOG;
4580#ifdef DEBUG_PUSH
4581 xmlGenericError(xmlGenericErrorContext,
4582 "HPP: entering PROLOG\n");
4583#endif
4584 } else if ((cur == '<') && (next == '!') &&
4585 (avail < 9)) {
4586 goto done;
4587 } else {
4588 ctxt->instate = XML_PARSER_START_TAG;
4589#ifdef DEBUG_PUSH
4590 xmlGenericError(xmlGenericErrorContext,
4591 "HPP: entering START_TAG\n");
4592#endif
4593 }
4594 break;
4595 case XML_PARSER_PROLOG:
4596 SKIP_BLANKS;
4597 if (in->buf == NULL)
4598 avail = in->length - (in->cur - in->base);
4599 else
4600 avail = in->buf->buffer->use - (in->cur - in->base);
4601 if (avail < 2)
4602 goto done;
4603 cur = in->cur[0];
4604 next = in->cur[1];
4605 if ((cur == '<') && (next == '!') &&
4606 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4607 if ((!terminate) &&
4608 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4609 goto done;
4610#ifdef DEBUG_PUSH
4611 xmlGenericError(xmlGenericErrorContext,
4612 "HPP: Parsing Comment\n");
4613#endif
4614 htmlParseComment(ctxt);
4615 ctxt->instate = XML_PARSER_PROLOG;
4616 } else if ((cur == '<') && (next == '!') &&
4617 (avail < 4)) {
4618 goto done;
4619 } else {
4620 ctxt->instate = XML_PARSER_START_TAG;
4621#ifdef DEBUG_PUSH
4622 xmlGenericError(xmlGenericErrorContext,
4623 "HPP: entering START_TAG\n");
4624#endif
4625 }
4626 break;
4627 case XML_PARSER_EPILOG:
4628 if (in->buf == NULL)
4629 avail = in->length - (in->cur - in->base);
4630 else
4631 avail = in->buf->buffer->use - (in->cur - in->base);
4632 if (avail < 1)
4633 goto done;
4634 cur = in->cur[0];
4635 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004636 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004637 goto done;
4638 }
4639 if (avail < 2)
4640 goto done;
4641 next = in->cur[1];
4642 if ((cur == '<') && (next == '!') &&
4643 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4644 if ((!terminate) &&
4645 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4646 goto done;
4647#ifdef DEBUG_PUSH
4648 xmlGenericError(xmlGenericErrorContext,
4649 "HPP: Parsing Comment\n");
4650#endif
4651 htmlParseComment(ctxt);
4652 ctxt->instate = XML_PARSER_EPILOG;
4653 } else if ((cur == '<') && (next == '!') &&
4654 (avail < 4)) {
4655 goto done;
4656 } else {
4657 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004658 ctxt->wellFormed = 0;
4659 ctxt->instate = XML_PARSER_EOF;
4660#ifdef DEBUG_PUSH
4661 xmlGenericError(xmlGenericErrorContext,
4662 "HPP: entering EOF\n");
4663#endif
4664 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4665 ctxt->sax->endDocument(ctxt->userData);
4666 goto done;
4667 }
4668 break;
4669 case XML_PARSER_START_TAG: {
4670 xmlChar *name, *oldname;
4671 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004672 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004673
4674 if (avail < 2)
4675 goto done;
4676 cur = in->cur[0];
4677 if (cur != '<') {
4678 ctxt->instate = XML_PARSER_CONTENT;
4679#ifdef DEBUG_PUSH
4680 xmlGenericError(xmlGenericErrorContext,
4681 "HPP: entering CONTENT\n");
4682#endif
4683 break;
4684 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004685 if (in->cur[1] == '/') {
4686 ctxt->instate = XML_PARSER_END_TAG;
4687 ctxt->checkIndex = 0;
4688#ifdef DEBUG_PUSH
4689 xmlGenericError(xmlGenericErrorContext,
4690 "HPP: entering END_TAG\n");
4691#endif
4692 break;
4693 }
Owen Taylor3473f882001-02-23 17:55:21 +00004694 if ((!terminate) &&
4695 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4696 goto done;
4697
4698 oldname = xmlStrdup(ctxt->name);
4699 htmlParseStartTag(ctxt);
4700 name = ctxt->name;
4701#ifdef DEBUG
4702 if (oldname == NULL)
4703 xmlGenericError(xmlGenericErrorContext,
4704 "Start of element %s\n", name);
4705 else if (name == NULL)
4706 xmlGenericError(xmlGenericErrorContext,
4707 "Start of element failed, was %s\n",
4708 oldname);
4709 else
4710 xmlGenericError(xmlGenericErrorContext,
4711 "Start of element %s, was %s\n",
4712 name, oldname);
4713#endif
4714 if (((depth == ctxt->nameNr) &&
4715 (xmlStrEqual(oldname, ctxt->name))) ||
4716 (name == NULL)) {
4717 if (CUR == '>')
4718 NEXT;
4719 if (oldname != NULL)
4720 xmlFree(oldname);
4721 break;
4722 }
4723 if (oldname != NULL)
4724 xmlFree(oldname);
4725
4726 /*
4727 * Lookup the info for that element.
4728 */
4729 info = htmlTagLookup(name);
4730 if (info == NULL) {
4731 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4732 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4733 name);
4734 ctxt->wellFormed = 0;
4735 } else if (info->depr) {
4736 /***************************
4737 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4738 ctxt->sax->warning(ctxt->userData,
4739 "Tag %s is deprecated\n",
4740 name);
4741 ***************************/
4742 }
4743
4744 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004745 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004746 */
4747 if ((CUR == '/') && (NXT(1) == '>')) {
4748 SKIP(2);
4749 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4750 ctxt->sax->endElement(ctxt->userData, name);
4751 oldname = htmlnamePop(ctxt);
4752#ifdef DEBUG
4753 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4754 oldname);
4755#endif
4756 if (oldname != NULL)
4757 xmlFree(oldname);
4758 ctxt->instate = XML_PARSER_CONTENT;
4759#ifdef DEBUG_PUSH
4760 xmlGenericError(xmlGenericErrorContext,
4761 "HPP: entering CONTENT\n");
4762#endif
4763 break;
4764 }
4765
4766 if (CUR == '>') {
4767 NEXT;
4768 } else {
4769 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4770 ctxt->sax->error(ctxt->userData,
4771 "Couldn't find end of Start Tag %s\n",
4772 name);
4773 ctxt->wellFormed = 0;
4774
4775 /*
4776 * end of parsing of this node.
4777 */
4778 if (xmlStrEqual(name, ctxt->name)) {
4779 nodePop(ctxt);
4780 oldname = htmlnamePop(ctxt);
4781#ifdef DEBUG
4782 xmlGenericError(xmlGenericErrorContext,
4783 "End of start tag problem: popping out %s\n", oldname);
4784#endif
4785 if (oldname != NULL)
4786 xmlFree(oldname);
4787 }
4788
4789 ctxt->instate = XML_PARSER_CONTENT;
4790#ifdef DEBUG_PUSH
4791 xmlGenericError(xmlGenericErrorContext,
4792 "HPP: entering CONTENT\n");
4793#endif
4794 break;
4795 }
4796
4797 /*
4798 * Check for an Empty Element from DTD definition
4799 */
4800 if ((info != NULL) && (info->empty)) {
4801 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4802 ctxt->sax->endElement(ctxt->userData, name);
4803 oldname = htmlnamePop(ctxt);
4804#ifdef DEBUG
4805 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4806#endif
4807 if (oldname != NULL)
4808 xmlFree(oldname);
4809 }
4810 ctxt->instate = XML_PARSER_CONTENT;
4811#ifdef DEBUG_PUSH
4812 xmlGenericError(xmlGenericErrorContext,
4813 "HPP: entering CONTENT\n");
4814#endif
4815 break;
4816 }
4817 case XML_PARSER_CONTENT: {
4818 long cons;
4819 /*
4820 * Handle preparsed entities and charRef
4821 */
4822 if (ctxt->token != 0) {
4823 xmlChar chr[2] = { 0 , 0 } ;
4824
4825 chr[0] = (xmlChar) ctxt->token;
4826 htmlCheckParagraph(ctxt);
4827 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4828 ctxt->sax->characters(ctxt->userData, chr, 1);
4829 ctxt->token = 0;
4830 ctxt->checkIndex = 0;
4831 }
4832 if ((avail == 1) && (terminate)) {
4833 cur = in->cur[0];
4834 if ((cur != '<') && (cur != '&')) {
4835 if (ctxt->sax != NULL) {
4836 if (IS_BLANK(cur)) {
4837 if (ctxt->sax->ignorableWhitespace != NULL)
4838 ctxt->sax->ignorableWhitespace(
4839 ctxt->userData, &cur, 1);
4840 } else {
4841 htmlCheckParagraph(ctxt);
4842 if (ctxt->sax->characters != NULL)
4843 ctxt->sax->characters(
4844 ctxt->userData, &cur, 1);
4845 }
4846 }
4847 ctxt->token = 0;
4848 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004849 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004850 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004851 }
Owen Taylor3473f882001-02-23 17:55:21 +00004852 }
4853 if (avail < 2)
4854 goto done;
4855 cur = in->cur[0];
4856 next = in->cur[1];
4857 cons = ctxt->nbChars;
4858 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4859 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4860 /*
4861 * Handle SCRIPT/STYLE separately
4862 */
4863 if ((!terminate) &&
4864 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4865 goto done;
4866 htmlParseScript(ctxt);
4867 if ((cur == '<') && (next == '/')) {
4868 ctxt->instate = XML_PARSER_END_TAG;
4869 ctxt->checkIndex = 0;
4870#ifdef DEBUG_PUSH
4871 xmlGenericError(xmlGenericErrorContext,
4872 "HPP: entering END_TAG\n");
4873#endif
4874 break;
4875 }
4876 } else {
4877 /*
4878 * Sometimes DOCTYPE arrives in the middle of the document
4879 */
4880 if ((cur == '<') && (next == '!') &&
4881 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4882 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4883 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4884 (UPP(8) == 'E')) {
4885 if ((!terminate) &&
4886 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4887 goto done;
4888 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4889 ctxt->sax->error(ctxt->userData,
4890 "Misplaced DOCTYPE declaration\n");
4891 ctxt->wellFormed = 0;
4892 htmlParseDocTypeDecl(ctxt);
4893 } else if ((cur == '<') && (next == '!') &&
4894 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4895 if ((!terminate) &&
4896 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4897 goto done;
4898#ifdef DEBUG_PUSH
4899 xmlGenericError(xmlGenericErrorContext,
4900 "HPP: Parsing Comment\n");
4901#endif
4902 htmlParseComment(ctxt);
4903 ctxt->instate = XML_PARSER_CONTENT;
4904 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4905 goto done;
4906 } else if ((cur == '<') && (next == '/')) {
4907 ctxt->instate = XML_PARSER_END_TAG;
4908 ctxt->checkIndex = 0;
4909#ifdef DEBUG_PUSH
4910 xmlGenericError(xmlGenericErrorContext,
4911 "HPP: entering END_TAG\n");
4912#endif
4913 break;
4914 } else if (cur == '<') {
4915 ctxt->instate = XML_PARSER_START_TAG;
4916 ctxt->checkIndex = 0;
4917#ifdef DEBUG_PUSH
4918 xmlGenericError(xmlGenericErrorContext,
4919 "HPP: entering START_TAG\n");
4920#endif
4921 break;
4922 } else if (cur == '&') {
4923 if ((!terminate) &&
4924 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4925 goto done;
4926#ifdef DEBUG_PUSH
4927 xmlGenericError(xmlGenericErrorContext,
4928 "HPP: Parsing Reference\n");
4929#endif
4930 /* TODO: check generation of subtrees if noent !!! */
4931 htmlParseReference(ctxt);
4932 } else {
4933 /* TODO Avoid the extra copy, handle directly !!!!!! */
4934 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004935 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004936 * - minimize calls to the SAX 'character' callback
4937 * when they are mergeable
4938 */
4939 if ((ctxt->inputNr == 1) &&
4940 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4941 if ((!terminate) &&
4942 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4943 goto done;
4944 }
4945 ctxt->checkIndex = 0;
4946#ifdef DEBUG_PUSH
4947 xmlGenericError(xmlGenericErrorContext,
4948 "HPP: Parsing char data\n");
4949#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004950 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004951 }
4952 }
4953 if (cons == ctxt->nbChars) {
4954 if (ctxt->node != NULL) {
4955 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4956 ctxt->sax->error(ctxt->userData,
4957 "detected an error in element content\n");
4958 ctxt->wellFormed = 0;
4959 }
4960 NEXT;
4961 break;
4962 }
4963
4964 break;
4965 }
4966 case XML_PARSER_END_TAG:
4967 if (avail < 2)
4968 goto done;
4969 if ((!terminate) &&
4970 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4971 goto done;
4972 htmlParseEndTag(ctxt);
4973 if (ctxt->nameNr == 0) {
4974 ctxt->instate = XML_PARSER_EPILOG;
4975 } else {
4976 ctxt->instate = XML_PARSER_CONTENT;
4977 }
4978 ctxt->checkIndex = 0;
4979#ifdef DEBUG_PUSH
4980 xmlGenericError(xmlGenericErrorContext,
4981 "HPP: entering CONTENT\n");
4982#endif
4983 break;
4984 case XML_PARSER_CDATA_SECTION:
4985 xmlGenericError(xmlGenericErrorContext,
4986 "HPP: internal error, state == CDATA\n");
4987 ctxt->instate = XML_PARSER_CONTENT;
4988 ctxt->checkIndex = 0;
4989#ifdef DEBUG_PUSH
4990 xmlGenericError(xmlGenericErrorContext,
4991 "HPP: entering CONTENT\n");
4992#endif
4993 break;
4994 case XML_PARSER_DTD:
4995 xmlGenericError(xmlGenericErrorContext,
4996 "HPP: internal error, state == DTD\n");
4997 ctxt->instate = XML_PARSER_CONTENT;
4998 ctxt->checkIndex = 0;
4999#ifdef DEBUG_PUSH
5000 xmlGenericError(xmlGenericErrorContext,
5001 "HPP: entering CONTENT\n");
5002#endif
5003 break;
5004 case XML_PARSER_COMMENT:
5005 xmlGenericError(xmlGenericErrorContext,
5006 "HPP: internal error, state == COMMENT\n");
5007 ctxt->instate = XML_PARSER_CONTENT;
5008 ctxt->checkIndex = 0;
5009#ifdef DEBUG_PUSH
5010 xmlGenericError(xmlGenericErrorContext,
5011 "HPP: entering CONTENT\n");
5012#endif
5013 break;
5014 case XML_PARSER_PI:
5015 xmlGenericError(xmlGenericErrorContext,
5016 "HPP: internal error, state == PI\n");
5017 ctxt->instate = XML_PARSER_CONTENT;
5018 ctxt->checkIndex = 0;
5019#ifdef DEBUG_PUSH
5020 xmlGenericError(xmlGenericErrorContext,
5021 "HPP: entering CONTENT\n");
5022#endif
5023 break;
5024 case XML_PARSER_ENTITY_DECL:
5025 xmlGenericError(xmlGenericErrorContext,
5026 "HPP: internal error, state == ENTITY_DECL\n");
5027 ctxt->instate = XML_PARSER_CONTENT;
5028 ctxt->checkIndex = 0;
5029#ifdef DEBUG_PUSH
5030 xmlGenericError(xmlGenericErrorContext,
5031 "HPP: entering CONTENT\n");
5032#endif
5033 break;
5034 case XML_PARSER_ENTITY_VALUE:
5035 xmlGenericError(xmlGenericErrorContext,
5036 "HPP: internal error, state == ENTITY_VALUE\n");
5037 ctxt->instate = XML_PARSER_CONTENT;
5038 ctxt->checkIndex = 0;
5039#ifdef DEBUG_PUSH
5040 xmlGenericError(xmlGenericErrorContext,
5041 "HPP: entering DTD\n");
5042#endif
5043 break;
5044 case XML_PARSER_ATTRIBUTE_VALUE:
5045 xmlGenericError(xmlGenericErrorContext,
5046 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5047 ctxt->instate = XML_PARSER_START_TAG;
5048 ctxt->checkIndex = 0;
5049#ifdef DEBUG_PUSH
5050 xmlGenericError(xmlGenericErrorContext,
5051 "HPP: entering START_TAG\n");
5052#endif
5053 break;
5054 case XML_PARSER_SYSTEM_LITERAL:
5055 xmlGenericError(xmlGenericErrorContext,
5056 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5057 ctxt->instate = XML_PARSER_CONTENT;
5058 ctxt->checkIndex = 0;
5059#ifdef DEBUG_PUSH
5060 xmlGenericError(xmlGenericErrorContext,
5061 "HPP: entering CONTENT\n");
5062#endif
5063 break;
5064 case XML_PARSER_IGNORE:
5065 xmlGenericError(xmlGenericErrorContext,
5066 "HPP: internal error, state == XML_PARSER_IGNORE\n");
5067 ctxt->instate = XML_PARSER_CONTENT;
5068 ctxt->checkIndex = 0;
5069#ifdef DEBUG_PUSH
5070 xmlGenericError(xmlGenericErrorContext,
5071 "HPP: entering CONTENT\n");
5072#endif
5073 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005074 case XML_PARSER_PUBLIC_LITERAL:
5075 xmlGenericError(xmlGenericErrorContext,
5076 "HPP: internal error, state == XML_PARSER_LITERAL\n");
5077 ctxt->instate = XML_PARSER_CONTENT;
5078 ctxt->checkIndex = 0;
5079#ifdef DEBUG_PUSH
5080 xmlGenericError(xmlGenericErrorContext,
5081 "HPP: entering CONTENT\n");
5082#endif
5083 break;
5084
Owen Taylor3473f882001-02-23 17:55:21 +00005085 }
5086 }
5087done:
5088 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005089 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005090 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5091 /*
5092 * SAX: end of the document processing.
5093 */
5094 ctxt->instate = XML_PARSER_EOF;
5095 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5096 ctxt->sax->endDocument(ctxt->userData);
5097 }
5098 }
5099 if ((ctxt->myDoc != NULL) &&
5100 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5101 (ctxt->instate == XML_PARSER_EPILOG))) {
5102 xmlDtdPtr dtd;
5103 dtd = xmlGetIntSubset(ctxt->myDoc);
5104 if (dtd == NULL)
5105 ctxt->myDoc->intSubset =
5106 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
5107 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5108 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5109 }
5110#ifdef DEBUG_PUSH
5111 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5112#endif
5113 return(ret);
5114}
5115
5116/**
Owen Taylor3473f882001-02-23 17:55:21 +00005117 * htmlParseChunk:
5118 * @ctxt: an XML parser context
5119 * @chunk: an char array
5120 * @size: the size in byte of the chunk
5121 * @terminate: last chunk indicator
5122 *
5123 * Parse a Chunk of memory
5124 *
5125 * Returns zero if no error, the xmlParserErrors otherwise.
5126 */
5127int
5128htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5129 int terminate) {
5130 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5131 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5132 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5133 int cur = ctxt->input->cur - ctxt->input->base;
5134
5135 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5136 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5137 ctxt->input->cur = ctxt->input->base + cur;
5138#ifdef DEBUG_PUSH
5139 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5140#endif
5141
5142 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5143 htmlParseTryOrFinish(ctxt, terminate);
5144 } else if (ctxt->instate != XML_PARSER_EOF) {
5145 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
5146 htmlParseTryOrFinish(ctxt, terminate);
5147 }
5148 if (terminate) {
5149 if ((ctxt->instate != XML_PARSER_EOF) &&
5150 (ctxt->instate != XML_PARSER_EPILOG) &&
5151 (ctxt->instate != XML_PARSER_MISC)) {
5152 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005153 ctxt->wellFormed = 0;
5154 }
5155 if (ctxt->instate != XML_PARSER_EOF) {
5156 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5157 ctxt->sax->endDocument(ctxt->userData);
5158 }
5159 ctxt->instate = XML_PARSER_EOF;
5160 }
5161 return((xmlParserErrors) ctxt->errNo);
5162}
5163
5164/************************************************************************
5165 * *
5166 * User entry points *
5167 * *
5168 ************************************************************************/
5169
5170/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005171 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005172 * @sax: a SAX handler
5173 * @user_data: The user data returned on SAX callbacks
5174 * @chunk: a pointer to an array of chars
5175 * @size: number of chars in the array
5176 * @filename: an optional file name or URI
5177 * @enc: an optional encoding
5178 *
5179 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005180 * The value of @filename is used for fetching external entities
5181 * and error/warning reports.
5182 *
5183 * Returns the new parser context or NULL
5184 */
5185htmlParserCtxtPtr
5186htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5187 const char *chunk, int size, const char *filename,
5188 xmlCharEncoding enc) {
5189 htmlParserCtxtPtr ctxt;
5190 htmlParserInputPtr inputStream;
5191 xmlParserInputBufferPtr buf;
5192
Daniel Veillardd0463562001-10-13 09:15:48 +00005193 xmlInitParser();
5194
Owen Taylor3473f882001-02-23 17:55:21 +00005195 buf = xmlAllocParserInputBuffer(enc);
5196 if (buf == NULL) return(NULL);
5197
5198 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5199 if (ctxt == NULL) {
5200 xmlFree(buf);
5201 return(NULL);
5202 }
5203 memset(ctxt, 0, sizeof(htmlParserCtxt));
5204 htmlInitParserCtxt(ctxt);
5205 if (sax != NULL) {
5206 if (ctxt->sax != &htmlDefaultSAXHandler)
5207 xmlFree(ctxt->sax);
5208 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5209 if (ctxt->sax == NULL) {
5210 xmlFree(buf);
5211 xmlFree(ctxt);
5212 return(NULL);
5213 }
5214 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5215 if (user_data != NULL)
5216 ctxt->userData = user_data;
5217 }
5218 if (filename == NULL) {
5219 ctxt->directory = NULL;
5220 } else {
5221 ctxt->directory = xmlParserGetDirectory(filename);
5222 }
5223
5224 inputStream = htmlNewInputStream(ctxt);
5225 if (inputStream == NULL) {
5226 xmlFreeParserCtxt(ctxt);
5227 return(NULL);
5228 }
5229
5230 if (filename == NULL)
5231 inputStream->filename = NULL;
5232 else
5233 inputStream->filename = xmlMemStrdup(filename);
5234 inputStream->buf = buf;
5235 inputStream->base = inputStream->buf->buffer->content;
5236 inputStream->cur = inputStream->buf->buffer->content;
5237
5238 inputPush(ctxt, inputStream);
5239
5240 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5241 (ctxt->input->buf != NULL)) {
5242 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5243#ifdef DEBUG_PUSH
5244 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5245#endif
5246 }
5247
5248 return(ctxt);
5249}
5250
5251/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005252 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005253 * @cur: a pointer to an array of xmlChar
5254 * @encoding: a free form C string describing the HTML document encoding, or NULL
5255 * @sax: the SAX handler block
5256 * @userData: if using SAX, this pointer will be provided on callbacks.
5257 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005258 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5259 * to handle parse events. If sax is NULL, fallback to the default DOM
5260 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005261 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005262 * Returns the resulting document tree unless SAX is NULL or the document is
5263 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005264 */
5265
5266htmlDocPtr
5267htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5268 htmlDocPtr ret;
5269 htmlParserCtxtPtr ctxt;
5270
Daniel Veillardd0463562001-10-13 09:15:48 +00005271 xmlInitParser();
5272
Owen Taylor3473f882001-02-23 17:55:21 +00005273 if (cur == NULL) return(NULL);
5274
5275
5276 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5277 if (ctxt == NULL) return(NULL);
5278 if (sax != NULL) {
5279 ctxt->sax = sax;
5280 ctxt->userData = userData;
5281 }
5282
5283 htmlParseDocument(ctxt);
5284 ret = ctxt->myDoc;
5285 if (sax != NULL) {
5286 ctxt->sax = NULL;
5287 ctxt->userData = NULL;
5288 }
5289 htmlFreeParserCtxt(ctxt);
5290
5291 return(ret);
5292}
5293
5294/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005295 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005296 * @cur: a pointer to an array of xmlChar
5297 * @encoding: a free form C string describing the HTML document encoding, or NULL
5298 *
5299 * parse an HTML in-memory document and build a tree.
5300 *
5301 * Returns the resulting document tree
5302 */
5303
5304htmlDocPtr
5305htmlParseDoc(xmlChar *cur, const char *encoding) {
5306 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5307}
5308
5309
5310/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005311 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005312 * @filename: the filename
5313 * @encoding: a free form C string describing the HTML document encoding, or NULL
5314 *
5315 * Create a parser context for a file content.
5316 * Automatic support for ZLIB/Compress compressed document is provided
5317 * by default if found at compile-time.
5318 *
5319 * Returns the new parser context or NULL
5320 */
5321htmlParserCtxtPtr
5322htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5323{
5324 htmlParserCtxtPtr ctxt;
5325 htmlParserInputPtr inputStream;
5326 xmlParserInputBufferPtr buf;
5327 /* htmlCharEncoding enc; */
5328 xmlChar *content, *content_line = (xmlChar *) "charset=";
5329
5330 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
5331 if (buf == NULL) return(NULL);
5332
5333 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5334 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005335 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005336 return(NULL);
5337 }
5338 memset(ctxt, 0, sizeof(htmlParserCtxt));
5339 htmlInitParserCtxt(ctxt);
5340 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
5341 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005342 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005343 xmlFree(ctxt);
5344 return(NULL);
5345 }
5346 memset(inputStream, 0, sizeof(htmlParserInput));
5347
Daniel Veillarda646cfd2002-09-17 21:50:03 +00005348 inputStream->filename = (char *)
5349 xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005350 inputStream->line = 1;
5351 inputStream->col = 1;
5352 inputStream->buf = buf;
5353 inputStream->directory = NULL;
5354
5355 inputStream->base = inputStream->buf->buffer->content;
5356 inputStream->cur = inputStream->buf->buffer->content;
5357 inputStream->free = NULL;
5358
5359 inputPush(ctxt, inputStream);
5360
5361 /* set encoding */
5362 if (encoding) {
5363 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
5364 if (content) {
5365 strcpy ((char *)content, (char *)content_line);
5366 strcat ((char *)content, (char *)encoding);
5367 htmlCheckEncoding (ctxt, content);
5368 xmlFree (content);
5369 }
5370 }
5371
5372 return(ctxt);
5373}
5374
5375/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005376 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005377 * @filename: the filename
5378 * @encoding: a free form C string describing the HTML document encoding, or NULL
5379 * @sax: the SAX handler block
5380 * @userData: if using SAX, this pointer will be provided on callbacks.
5381 *
5382 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5383 * compressed document is provided by default if found at compile-time.
5384 * It use the given SAX function block to handle the parsing callback.
5385 * If sax is NULL, fallback to the default DOM tree building routines.
5386 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005387 * Returns the resulting document tree unless SAX is NULL or the document is
5388 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005389 */
5390
5391htmlDocPtr
5392htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5393 void *userData) {
5394 htmlDocPtr ret;
5395 htmlParserCtxtPtr ctxt;
5396 htmlSAXHandlerPtr oldsax = NULL;
5397
Daniel Veillardd0463562001-10-13 09:15:48 +00005398 xmlInitParser();
5399
Owen Taylor3473f882001-02-23 17:55:21 +00005400 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5401 if (ctxt == NULL) return(NULL);
5402 if (sax != NULL) {
5403 oldsax = ctxt->sax;
5404 ctxt->sax = sax;
5405 ctxt->userData = userData;
5406 }
5407
5408 htmlParseDocument(ctxt);
5409
5410 ret = ctxt->myDoc;
5411 if (sax != NULL) {
5412 ctxt->sax = oldsax;
5413 ctxt->userData = NULL;
5414 }
5415 htmlFreeParserCtxt(ctxt);
5416
5417 return(ret);
5418}
5419
5420/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005421 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005422 * @filename: the filename
5423 * @encoding: a free form C string describing the HTML document encoding, or NULL
5424 *
5425 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5426 * compressed document is provided by default if found at compile-time.
5427 *
5428 * Returns the resulting document tree
5429 */
5430
5431htmlDocPtr
5432htmlParseFile(const char *filename, const char *encoding) {
5433 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5434}
5435
5436/**
5437 * htmlHandleOmittedElem:
5438 * @val: int 0 or 1
5439 *
5440 * Set and return the previous value for handling HTML omitted tags.
5441 *
5442 * Returns the last value for 0 for no handling, 1 for auto insertion.
5443 */
5444
5445int
5446htmlHandleOmittedElem(int val) {
5447 int old = htmlOmittedDefaultValue;
5448
5449 htmlOmittedDefaultValue = val;
5450 return(old);
5451}
5452
Daniel Veillard930dfb62003-02-05 10:17:38 +00005453/**
5454 * htmlElementAllowedHere:
5455 * @parent: HTML parent element
5456 * @elt: HTML element
5457 *
5458 * Checks whether an HTML element may be a direct child of a parent element.
5459 * Note - doesn't check for deprecated elements
5460 *
5461 * Returns 1 if allowed; 0 otherwise.
5462 */
5463int
5464htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5465 const char** p ;
5466
5467 if ( ! elt || ! parent || ! parent->subelts )
5468 return 0 ;
5469
5470 for ( p = parent->subelts; *p; ++p )
5471 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5472 return 1 ;
5473
5474 return 0 ;
5475}
5476/**
5477 * htmlElementStatusHere:
5478 * @parent: HTML parent element
5479 * @elt: HTML element
5480 *
5481 * Checks whether an HTML element may be a direct child of a parent element.
5482 * and if so whether it is valid or deprecated.
5483 *
5484 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5485 */
5486htmlStatus
5487htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5488 if ( ! parent || ! elt )
5489 return HTML_INVALID ;
5490 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5491 return HTML_INVALID ;
5492
5493 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5494}
5495/**
5496 * htmlAttrAllowed
5497 * @elt: HTML element
5498 * @attr: HTML attribute
5499 * @legacy: whether to allow deprecated attributes
5500 *
5501 * Checks whether an attribute is valid for an element
5502 * Has full knowledge of Required and Deprecated attributes
5503 *
5504 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5505 */
5506htmlStatus
5507htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5508 const char** p ;
5509
5510 if ( !elt || ! attr )
5511 return HTML_INVALID ;
5512
5513 if ( elt->attrs_req )
5514 for ( p = elt->attrs_req; *p; ++p)
5515 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5516 return HTML_REQUIRED ;
5517
5518 if ( elt->attrs_opt )
5519 for ( p = elt->attrs_opt; *p; ++p)
5520 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5521 return HTML_VALID ;
5522
5523 if ( legacy && elt->attrs_depr )
5524 for ( p = elt->attrs_depr; *p; ++p)
5525 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5526 return HTML_DEPRECATED ;
5527
5528 return HTML_INVALID ;
5529}
5530/**
5531 * htmlNodeStatus
5532 * @node - an htmlNodePtr in a tree
5533 * @legacy - whether to allow deprecated elements (YES is faster here
5534 * for Element nodes)
5535 *
5536 * Checks whether the tree node is valid. Experimental (the author
5537 * only uses the HTML enhancements in a SAX parser)
5538 *
5539 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5540 * legacy allowed) or htmlElementStatusHere (otherwise).
5541 * for Attribute nodes, a return from htmlAttrAllowed
5542 * for other nodes, HTML_NA (no checks performed)
5543 */
5544htmlStatus
5545htmlNodeStatus(const htmlNodePtr node, int legacy) {
5546 if ( ! node )
5547 return HTML_INVALID ;
5548
5549 switch ( node->type ) {
5550 case XML_ELEMENT_NODE:
5551 return legacy
5552 ? ( htmlElementAllowedHere (
5553 htmlTagLookup(node->parent->name) , node->name
5554 ) ? HTML_VALID : HTML_INVALID )
5555 : htmlElementStatusHere(
5556 htmlTagLookup(node->parent->name) ,
5557 htmlTagLookup(node->name) )
5558 ;
5559 case XML_ATTRIBUTE_NODE:
5560 return htmlAttrAllowed(
5561 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5562 default: return HTML_NA ;
5563 }
5564}
Owen Taylor3473f882001-02-23 17:55:21 +00005565#endif /* LIBXML_HTML_ENABLED */