blob: 5e9b2c4c945934dcc860190f6469be8b017a9790 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
67 * xmlErrMemory:
68 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76 if (ctxt != NULL) {
77 ctxt->errNo = XML_ERR_NO_MEMORY;
78 ctxt->instate = XML_PARSER_EOF;
79 ctxt->disableSAX = 1;
80 }
81 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000082 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000083 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
84 NULL, NULL, 0, 0,
85 "Memory allocation failed : %s\n", extra);
86 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000087 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000088 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
89 NULL, NULL, 0, 0, "Memory allocation failed\n");
90}
91
92/**
93 * htmlParseErr:
94 * @ctxt: an HTML parser context
95 * @error: the error number
96 * @msg: the error message
97 * @str1: string infor
98 * @str2: string infor
99 *
100 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
101 */
102static void
103htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
104 const char *msg, const xmlChar *str1, const xmlChar *str2)
105{
106 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000107 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000108 XML_ERR_ERROR, NULL, 0,
109 (const char *) str1, (const char *) str2,
110 NULL, 0, 0,
111 msg, str1, str2);
112 ctxt->wellFormed = 0;
113}
114
115/**
116 * htmlParseErrInt:
117 * @ctxt: an HTML parser context
118 * @error: the error number
119 * @msg: the error message
120 * @val: integer info
121 *
122 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
123 */
124static void
125htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
126 const char *msg, int val)
127{
128 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000129 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000130 XML_ERR_ERROR, NULL, 0, NULL, NULL,
131 NULL, val, 0, msg, val);
132 ctxt->wellFormed = 0;
133}
134
135/************************************************************************
136 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000137 * Parser stacks related functions and macros *
138 * *
139 ************************************************************************/
140
Daniel Veillard1c732d22002-11-30 11:22:59 +0000141/**
142 * htmlnamePush:
143 * @ctxt: an HTML parser context
144 * @value: the element name
145 *
146 * Pushes a new element name on top of the name stack
147 *
148 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000149 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000150static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000151htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000152{
153 if (ctxt->nameNr >= ctxt->nameMax) {
154 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000155 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000156 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000157 ctxt->nameMax *
158 sizeof(ctxt->nameTab[0]));
159 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000160 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000161 return (0);
162 }
163 }
164 ctxt->nameTab[ctxt->nameNr] = value;
165 ctxt->name = value;
166 return (ctxt->nameNr++);
167}
168/**
169 * htmlnamePop:
170 * @ctxt: an HTML parser context
171 *
172 * Pops the top element name from the name stack
173 *
174 * Returns the name just removed
175 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000176static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000177htmlnamePop(htmlParserCtxtPtr ctxt)
178{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000179 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000180
Daniel Veillard1c732d22002-11-30 11:22:59 +0000181 if (ctxt->nameNr <= 0)
182 return (0);
183 ctxt->nameNr--;
184 if (ctxt->nameNr < 0)
185 return (0);
186 if (ctxt->nameNr > 0)
187 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
188 else
189 ctxt->name = NULL;
190 ret = ctxt->nameTab[ctxt->nameNr];
191 ctxt->nameTab[ctxt->nameNr] = 0;
192 return (ret);
193}
Owen Taylor3473f882001-02-23 17:55:21 +0000194
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   UPPER   returns the current xmlChar converted to uppercase.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY_BUF(l,b,i,v) append char v, l xmlChars long, to buffer b at index i
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Parenthesised so the whole comma expression behaves as one operand. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

/*
 * SHRINK and GROW expand to a bare `if` statement; use them only as a
 * full statement and never directly in front of an `else`.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/* Advance by one character of l xmlChars, keeping line/col up to date. */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &(l))

/* Wrapped in do/while(0) so it is a single statement, safe before `else`. */
#define COPY_BUF(l,b,i,v) do { \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
276
277/**
278 * htmlCurrentChar:
279 * @ctxt: the HTML parser context
280 * @len: pointer to the length of the char read
281 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000282 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000283 * bytes in the input buffer. Implement the end of line normalization:
284 * 2.11 End-of-Line Handling
285 * If the encoding is unspecified, in the case we find an ISO-Latin-1
286 * char, then the encoding converter is plugged in automatically.
287 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000288 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000289 */
290
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000291static int
Owen Taylor3473f882001-02-23 17:55:21 +0000292htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
293 if (ctxt->instate == XML_PARSER_EOF)
294 return(0);
295
296 if (ctxt->token != 0) {
297 *len = 0;
298 return(ctxt->token);
299 }
300 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
301 /*
302 * We are supposed to handle UTF8, check it's valid
303 * From rfc2044: encoding of the Unicode values on UTF-8:
304 *
305 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
306 * 0000 0000-0000 007F 0xxxxxxx
307 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
308 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
309 *
310 * Check for the 0x110000 limit too
311 */
312 const unsigned char *cur = ctxt->input->cur;
313 unsigned char c;
314 unsigned int val;
315
316 c = *cur;
317 if (c & 0x80) {
318 if (cur[1] == 0)
319 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
320 if ((cur[1] & 0xc0) != 0x80)
321 goto encoding_error;
322 if ((c & 0xe0) == 0xe0) {
323
324 if (cur[2] == 0)
325 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
326 if ((cur[2] & 0xc0) != 0x80)
327 goto encoding_error;
328 if ((c & 0xf0) == 0xf0) {
329 if (cur[3] == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 if (((c & 0xf8) != 0xf0) ||
332 ((cur[3] & 0xc0) != 0x80))
333 goto encoding_error;
334 /* 4-byte code */
335 *len = 4;
336 val = (cur[0] & 0x7) << 18;
337 val |= (cur[1] & 0x3f) << 12;
338 val |= (cur[2] & 0x3f) << 6;
339 val |= cur[3] & 0x3f;
340 } else {
341 /* 3-byte code */
342 *len = 3;
343 val = (cur[0] & 0xf) << 12;
344 val |= (cur[1] & 0x3f) << 6;
345 val |= cur[2] & 0x3f;
346 }
347 } else {
348 /* 2-byte code */
349 *len = 2;
350 val = (cur[0] & 0x1f) << 6;
351 val |= cur[1] & 0x3f;
352 }
353 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000354 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
355 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000356 }
357 return(val);
358 } else {
359 /* 1-byte code */
360 *len = 1;
361 return((int) *ctxt->input->cur);
362 }
363 }
364 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000365 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000366 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000367 * XML constructs only use < 128 chars
368 */
369 *len = 1;
370 if ((int) *ctxt->input->cur < 0x80)
371 return((int) *ctxt->input->cur);
372
373 /*
374 * Humm this is bad, do an automatic flow conversion
375 */
376 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
377 ctxt->charset = XML_CHAR_ENCODING_UTF8;
378 return(xmlCurrentChar(ctxt, len));
379
380encoding_error:
381 /*
382 * If we detect an UTF8 error that probably mean that the
383 * input encoding didn't get properly advertized in the
384 * declaration header. Report the error and switch the encoding
385 * to ISO-Latin-1 (if you don't like this policy, just declare the
386 * encoding !)
387 */
Daniel Veillardf403d292003-10-05 13:51:35 +0000388 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
389 "Input is not proper UTF-8, indicate encoding !\n",
390 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000391 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000392 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
393 ctxt->input->cur[0], ctxt->input->cur[1],
394 ctxt->input->cur[2], ctxt->input->cur[3]);
395 }
396
397 ctxt->charset = XML_CHAR_ENCODING_8859_1;
398 *len = 1;
399 return((int) *ctxt->input->cur);
400}
401
402/**
Owen Taylor3473f882001-02-23 17:55:21 +0000403 * htmlSkipBlankChars:
404 * @ctxt: the HTML parser context
405 *
406 * skip all blanks character found at that point in the input streams.
407 *
408 * Returns the number of space chars skipped
409 */
410
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000411static int
Owen Taylor3473f882001-02-23 17:55:21 +0000412htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
413 int res = 0;
414
William M. Brack76e95df2003-10-18 16:20:14 +0000415 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000416 if ((*ctxt->input->cur == 0) &&
417 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
418 xmlPopInput(ctxt);
419 } else {
420 if (*(ctxt->input->cur) == '\n') {
421 ctxt->input->line++; ctxt->input->col = 1;
422 } else ctxt->input->col++;
423 ctxt->input->cur++;
424 ctxt->nbChars++;
425 if (*ctxt->input->cur == 0)
426 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
427 }
428 res++;
429 }
430 return(res);
431}
432
433
434
435/************************************************************************
436 * *
437 * The list of HTML elements and their properties *
438 * *
439 ************************************************************************/
440
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000453
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of string literals.
 * Groups that are built from other groups MUST separate them with commas:
 * adjacent string literals are concatenated by the compiler, which would
 * silently merge the last name of one group with the first of the next.
 * PCDATA and MODIFIER expand to nothing and therefore take no comma.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


/* All lists below are NULL terminated. */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
        "archive", "alt", "name", "height", "width", "align",
        "hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
/* NULL terminator added: every list here is scanned up to its NULL entry. */
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
/* Comma after ATTRS is required, otherwise "onkeyup" and "summary" merge. */
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;

/* Cast applied to the lists above when they are stored in the element table. */
#define DECL (const char**)
586
Daniel Veillard22090732001-07-16 00:06:07 +0000587static const htmlElemDesc
588html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000589{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
590 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
591},
592{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
593 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
594},
595{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
596 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
597},
598{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
599 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
600},
601{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
602 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
603},
604{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
605 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
606},
607{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
608 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
609},
610{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
611 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
612},
613{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
614 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
615},
616{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
617 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
618},
619{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
620 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
621},
622{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
623 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
624},
625{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
626 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
627},
628{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
629 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
630},
631{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
632 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
633},
634{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
635 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
636},
637{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
638 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
639},
640{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
641 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
642},
643{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
644 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
645},
646{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
647 EMPTY , NULL , DECL col_attrs , NULL, NULL
648},
649{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
650 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
651},
652{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
653 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
654},
655{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
656 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
657},
658{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
659 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
660},
661{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
662 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
663},
664{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
665 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
666},
667{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
668 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
669},
670{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
671 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
672},
673{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
674 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
675},
676{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
677 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
678},
679{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
680 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
681},
682{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
683 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
684},
685{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
686 EMPTY, NULL, NULL, DECL frame_attrs, NULL
687},
688{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
689 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
690},
691{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
692 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
693},
694{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
695 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
696},
697{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
698 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
699},
700{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
701 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
702},
703{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
704 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
705},
706{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
707 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
708},
709{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
710 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
711},
712{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
713 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
714},
715{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
716 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
717},
718{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
719 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
720},
721{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
722 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
723},
724{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
725 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
726},
727{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
728 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
729},
730{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
731 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
732},
733{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
734 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
735},
736{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
737 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
738},
739{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
740 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
741},
742{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
743 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
744},
745{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
746 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
747},
748{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
749 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
750},
751{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
752 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
753},
754{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
755 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
756},
757{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
758 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
759},
760{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
761 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
762},
763{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
764 DECL html_flow, "div", DECL html_attrs, NULL, NULL
765},
766{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
767 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
768},
769{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
770 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
771},
772{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
773 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
774},
775{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
776 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
777},
778{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
779 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
780},
781{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
782 EMPTY, NULL, DECL param_attrs, NULL, name_attr
783},
784{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
785 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
786},
787{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
788 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
789},
790{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
791 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
792},
793{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
794 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
795},
796{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
797 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
798},
799{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
800 DECL select_content, NULL, DECL select_attrs, NULL, NULL
801},
802{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
803 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
804},
805{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
806 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
807},
808{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
809 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
810},
811{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
812 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
813},
814{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
815 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
816},
817{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
818 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
819},
820{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
821 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
822},
823{ "table", 0, 0, 0, 0, 0, 0, 0, "",
824 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
825},
826{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
827 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
828},
829{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
830 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
831},
832{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
833 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
834},
835{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
836 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
837},
838{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
839 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
840},
841{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
842 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
843},
844{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
845 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
846},
847{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
848 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
849},
850{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852},
853{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
854 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
855},
856{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
857 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
858},
859{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
860 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
861}
Owen Taylor3473f882001-02-23 17:55:21 +0000862};
863
/*
 * Start tags that imply the end of currently open elements.
 *
 * The table is a sequence of NULL-terminated groups, itself terminated
 * by an extra NULL.  In each group the first string is the new start
 * tag; the strings that follow are the open elements which that start
 * tag closes (see htmlCheckAutoClose()).
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
923
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly and inside which a p element gets implied
 * (see htmlCheckParagraph()).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
937
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The list has no terminator: users iterate over it with
 * sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]).
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
963
/*
 * Priorities used by the HTML parser when it meets broken pages.
 * Each element gets a priority; an extra end tag is only allowed to
 * close open elements whose priority is lower than or equal to its own
 * (see htmlAutoCloseOnClose()).
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* priority attached to that element */
} elementPriority;

/*
 * The table is terminated by an entry with a NULL name whose priority
 * is the default one, used for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000991
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000992static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000993static int htmlStartCloseIndexinitialized = 0;
994
995/************************************************************************
996 * *
997 * functions to handle HTML specific data *
998 * *
999 ************************************************************************/
1000
1001/**
1002 * htmlInitAutoClose:
1003 *
1004 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1005 * This is not reentrant. Call xmlInitParser() once before processing in
1006 * case of use in multithreaded programs.
1007 */
1008void
1009htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001010 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001011
1012 if (htmlStartCloseIndexinitialized) return;
1013
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001014 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1015 indx = 0;
1016 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1017 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001018 while (htmlStartClose[i] != NULL) i++;
1019 i++;
1020 }
1021 htmlStartCloseIndexinitialized = 1;
1022}
1023
1024/**
1025 * htmlTagLookup:
1026 * @tag: The tag name in lowercase
1027 *
1028 * Lookup the HTML tag in the ElementTable
1029 *
1030 * Returns the related htmlElemDescPtr or NULL if not found.
1031 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001032const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001033htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001034 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001035
1036 for (i = 0; i < (sizeof(html40ElementTable) /
1037 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001038 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001039 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001040 }
1041 return(NULL);
1042}
1043
1044/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001045 * htmlGetEndPriority:
1046 * @name: The name of the element to look up the priority for.
1047 *
1048 * Return value: The "endtag" priority.
1049 **/
1050static int
1051htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001052 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001053
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001054 while ((htmlEndPriority[i].name != NULL) &&
1055 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1056 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001057
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001058 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001059}
1060
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001061
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001062/**
Owen Taylor3473f882001-02-23 17:55:21 +00001063 * htmlCheckAutoClose:
1064 * @newtag: The new tag name
1065 * @oldtag: The old tag name
1066 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001067 * Checks whether the new tag is one of the registered valid tags for
1068 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001069 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1070 *
1071 * Returns 0 if no, 1 if yes.
1072 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001073static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001074htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1075{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001076 int i, indx;
1077 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001078
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001079 if (htmlStartCloseIndexinitialized == 0)
1080 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001081
1082 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001083 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001084 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 if (closed == NULL)
1086 return (0);
1087 if (xmlStrEqual(BAD_CAST * closed, newtag))
1088 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001089 }
1090
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001091 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001092 i++;
1093 while (htmlStartClose[i] != NULL) {
1094 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 return (1);
1096 }
1097 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001098 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001099 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001100}
1101
1102/**
1103 * htmlAutoCloseOnClose:
1104 * @ctxt: an HTML parser context
1105 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001106 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001107 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001108 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001109 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001110static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001111htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1112{
1113 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001114 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001115
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001117
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001119
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001120 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1121 break;
1122 /*
1123 * A missplaced endtag can only close elements with lower
1124 * or equal priority, so if we find an element with higher
1125 * priority before we find an element with
1126 * matching name, we just ignore this endtag
1127 */
1128 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1129 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001130 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001131 if (i < 0)
1132 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001133
1134 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001135 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001136 if ((info != NULL) && (info->endTag == 3)) {
1137 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1138 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001139 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001140 }
1141 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1142 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001143 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001144 }
1145}
1146
1147/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001148 * htmlAutoCloseOnEnd:
1149 * @ctxt: an HTML parser context
1150 *
1151 * Close all remaining tags at the end of the stream
1152 */
1153static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001154htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1155{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001156 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001157
William M. Brack899e64a2003-09-26 18:03:42 +00001158 if (ctxt->nameNr == 0)
1159 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001160 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001161 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1162 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001163 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001164 }
1165}
1166
1167/**
Owen Taylor3473f882001-02-23 17:55:21 +00001168 * htmlAutoClose:
1169 * @ctxt: an HTML parser context
1170 * @newtag: The new tag name or NULL
1171 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001172 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001173 * The list is kept in htmlStartClose array. This function is
1174 * called when a new tag has been detected and generates the
1175 * appropriates closes if possible/needed.
1176 * If newtag is NULL this mean we are at the end of the resource
1177 * and we should check
1178 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001179static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001180htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1181{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001182 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001183 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001184 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1185 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001186 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001187 }
1188 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001189 htmlAutoCloseOnEnd(ctxt);
1190 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001191 }
1192 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1194 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1195 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001196 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1197 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001198 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001199 }
Owen Taylor3473f882001-02-23 17:55:21 +00001200}
1201
1202/**
1203 * htmlAutoCloseTag:
1204 * @doc: the HTML document
1205 * @name: The tag name
1206 * @elem: the HTML element
1207 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001208 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001209 * The list is kept in htmlStartClose array. This function checks
1210 * if the element or one of it's children would autoclose the
1211 * given tag.
1212 *
1213 * Returns 1 if autoclose, 0 otherwise
1214 */
1215int
1216htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1217 htmlNodePtr child;
1218
1219 if (elem == NULL) return(1);
1220 if (xmlStrEqual(name, elem->name)) return(0);
1221 if (htmlCheckAutoClose(elem->name, name)) return(1);
1222 child = elem->children;
1223 while (child != NULL) {
1224 if (htmlAutoCloseTag(doc, name, child)) return(1);
1225 child = child->next;
1226 }
1227 return(0);
1228}
1229
1230/**
1231 * htmlIsAutoClosed:
1232 * @doc: the HTML document
1233 * @elem: the HTML element
1234 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001235 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001236 * The list is kept in htmlStartClose array. This function checks
1237 * if a tag is autoclosed by one of it's child
1238 *
1239 * Returns 1 if autoclosed, 0 otherwise
1240 */
1241int
1242htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1243 htmlNodePtr child;
1244
1245 if (elem == NULL) return(1);
1246 child = elem->children;
1247 while (child != NULL) {
1248 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1249 child = child->next;
1250 }
1251 return(0);
1252}
1253
1254/**
1255 * htmlCheckImplied:
1256 * @ctxt: an HTML parser context
1257 * @newtag: The new tag name
1258 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001259 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001260 * called when a new tag has been detected and generates the
1261 * appropriates implicit tags if missing
1262 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001263static void
Owen Taylor3473f882001-02-23 17:55:21 +00001264htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1265 if (!htmlOmittedDefaultValue)
1266 return;
1267 if (xmlStrEqual(newtag, BAD_CAST"html"))
1268 return;
1269 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001270 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001271 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1272 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1273 }
1274 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1275 return;
1276 if ((ctxt->nameNr <= 1) &&
1277 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1278 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1279 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1280 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1281 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1282 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1283 /*
1284 * dropped OBJECT ... i you put it first BODY will be
1285 * assumed !
1286 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001287 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001288 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1289 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1290 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1291 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1292 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1293 int i;
1294 for (i = 0;i < ctxt->nameNr;i++) {
1295 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1296 return;
1297 }
1298 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1299 return;
1300 }
1301 }
1302
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001303 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1306 }
1307}
1308
1309/**
1310 * htmlCheckParagraph
1311 * @ctxt: an HTML parser context
1312 *
1313 * Check whether a p element need to be implied before inserting
1314 * characters in the current element.
1315 *
1316 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1317 * in case of error.
1318 */
1319
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001320static int
Owen Taylor3473f882001-02-23 17:55:21 +00001321htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1322 const xmlChar *tag;
1323 int i;
1324
1325 if (ctxt == NULL)
1326 return(-1);
1327 tag = ctxt->name;
1328 if (tag == NULL) {
1329 htmlAutoClose(ctxt, BAD_CAST"p");
1330 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001331 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001332 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1333 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1334 return(1);
1335 }
1336 if (!htmlOmittedDefaultValue)
1337 return(0);
1338 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1339 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001340 htmlAutoClose(ctxt, BAD_CAST"p");
1341 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001342 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001343 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1344 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1345 return(1);
1346 }
1347 }
1348 return(0);
1349}
1350
1351/**
1352 * htmlIsScriptAttribute:
1353 * @name: an attribute name
1354 *
1355 * Check if an attribute is of content type Script
1356 *
1357 * Returns 1 is the attribute is a script 0 otherwise
1358 */
1359int
1360htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001361 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001362
1363 if (name == NULL)
1364 return(0);
1365 /*
1366 * all script attributes start with 'on'
1367 */
1368 if ((name[0] != 'o') || (name[1] != 'n'))
1369 return(0);
1370 for (i = 0;
1371 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1372 i++) {
1373 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1374 return(1);
1375 }
1376 return(0);
1377}
1378
1379/************************************************************************
1380 * *
1381 * The list of HTML predefined entities *
1382 * *
1383 ************************************************************************/
1384
1385
Daniel Veillard22090732001-07-16 00:06:07 +00001386static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001387/*
1388 * the 4 absolute ones, plus apostrophe.
1389 */
1390{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1391{ 38, "amp", "ampersand, U+0026 ISOnum" },
1392{ 39, "apos", "single quote" },
1393{ 60, "lt", "less-than sign, U+003C ISOnum" },
1394{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1395
1396/*
1397 * A bunch still in the 128-255 range
1398 * Replacing them depend really on the charset used.
1399 */
1400{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1401{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1402{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1403{ 163, "pound","pound sign, U+00A3 ISOnum" },
1404{ 164, "curren","currency sign, U+00A4 ISOnum" },
1405{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1406{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1407{ 167, "sect", "section sign, U+00A7 ISOnum" },
1408{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1409{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1410{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1411{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1412{ 172, "not", "not sign, U+00AC ISOnum" },
1413{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1414{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1415{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1416{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1417{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1418{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1419{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1420{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1421{ 181, "micro","micro sign, U+00B5 ISOnum" },
1422{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1423{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1424{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1425{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1426{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1427{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1428{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1429{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1430{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1431{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1432{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1433{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1434{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1435{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1436{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1437{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1438{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1439{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1440{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1441{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1442{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1443{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1444{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1445{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1446{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1447{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1448{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1449{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1450{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1451{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1452{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1453{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1454{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1455{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1456{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1457{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1458{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1459{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1460{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1461{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1462{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1463{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1464{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1465{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1466{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1467{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1468{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1469{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1470{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1471{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1472{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1473{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1474{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1475{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1476{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1477{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1478{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1479{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1480{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1481{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1482{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1483{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1484{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1485{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1486{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1487{ 247, "divide","division sign, U+00F7 ISOnum" },
1488{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1489{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1490{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1491{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1492{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1493{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1494{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1495{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1496
1497{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1498{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1499{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1500{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1501{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1502
1503/*
1504 * Anything below should really be kept as entities references
1505 */
1506{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1507
1508{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1509{ 732, "tilde","small tilde, U+02DC ISOdia" },
1510
1511{ 913, "Alpha","greek capital letter alpha, U+0391" },
1512{ 914, "Beta", "greek capital letter beta, U+0392" },
1513{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1514{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1515{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1516{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1517{ 919, "Eta", "greek capital letter eta, U+0397" },
1518{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1519{ 921, "Iota", "greek capital letter iota, U+0399" },
1520{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001521{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001522{ 924, "Mu", "greek capital letter mu, U+039C" },
1523{ 925, "Nu", "greek capital letter nu, U+039D" },
1524{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1525{ 927, "Omicron","greek capital letter omicron, U+039F" },
1526{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1527{ 929, "Rho", "greek capital letter rho, U+03A1" },
1528{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1529{ 932, "Tau", "greek capital letter tau, U+03A4" },
1530{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1531{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1532{ 935, "Chi", "greek capital letter chi, U+03A7" },
1533{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1534{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1535
1536{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1537{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1538{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1539{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1540{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1541{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1542{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1543{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1544{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1545{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1546{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1547{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1548{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1549{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1550{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1551{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1552{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1553{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1554{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1555{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1556{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1557{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1558{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1559{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1560{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1561{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1562{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1563{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1564
1565{ 8194, "ensp", "en space, U+2002 ISOpub" },
1566{ 8195, "emsp", "em space, U+2003 ISOpub" },
1567{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1568{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1569{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1570{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1571{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1572{ 8211, "ndash","en dash, U+2013 ISOpub" },
1573{ 8212, "mdash","em dash, U+2014 ISOpub" },
1574{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1575{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1576{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1577{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1578{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1579{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1580{ 8224, "dagger","dagger, U+2020 ISOpub" },
1581{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1582
1583{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1584{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1585
1586{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1587
1588{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1589{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1590
1591{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1592{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1593
1594{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1595{ 8260, "frasl","fraction slash, U+2044 NEW" },
1596
1597{ 8364, "euro", "euro sign, U+20AC NEW" },
1598
1599{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1600{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1601{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1602{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1603{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1604{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1605{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1606{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1607{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1608{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1609{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1610{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1611{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1612{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1613{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1614{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1615
1616{ 8704, "forall","for all, U+2200 ISOtech" },
1617{ 8706, "part", "partial differential, U+2202 ISOtech" },
1618{ 8707, "exist","there exists, U+2203 ISOtech" },
1619{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1620{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1621{ 8712, "isin", "element of, U+2208 ISOtech" },
1622{ 8713, "notin","not an element of, U+2209 ISOtech" },
1623{ 8715, "ni", "contains as member, U+220B ISOtech" },
1624{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001625{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001626{ 8722, "minus","minus sign, U+2212 ISOtech" },
1627{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1628{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1629{ 8733, "prop", "proportional to, U+221D ISOtech" },
1630{ 8734, "infin","infinity, U+221E ISOtech" },
1631{ 8736, "ang", "angle, U+2220 ISOamso" },
1632{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1633{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1634{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1635{ 8746, "cup", "union = cup, U+222A ISOtech" },
1636{ 8747, "int", "integral, U+222B ISOtech" },
1637{ 8756, "there4","therefore, U+2234 ISOtech" },
1638{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1639{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1640{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1641{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1642{ 8801, "equiv","identical to, U+2261 ISOtech" },
1643{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1644{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1645{ 8834, "sub", "subset of, U+2282 ISOtech" },
1646{ 8835, "sup", "superset of, U+2283 ISOtech" },
1647{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1648{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1649{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1650{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1651{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1652{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1653{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1654{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1655{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1656{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1657{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1658{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1659{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1660{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1661
1662{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1663{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1664{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1665{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1666
1667};
1668
1669/************************************************************************
1670 * *
1671 * Commodity functions to handle entities *
1672 * *
1673 ************************************************************************/
1674
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that new size.
 * The macro expects a variable named ctxt to be visible at the point of
 * use, and it must be expanded inside a function returning a pointer:
 * when the reallocation fails the error is reported, the previous
 * allocation is released (the reallocation result is kept in a temporary
 * so the old block is not lost) and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *buffer##_tmp;                                              \
    buffer##_size *= 2;                                                 \
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,                       \
                                   buffer##_size * sizeof(xmlChar));    \
    if (buffer##_tmp == NULL) {                                         \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = buffer##_tmp;                                              \
}
1686
1687/**
1688 * htmlEntityLookup:
1689 * @name: the entity name
1690 *
1691 * Lookup the given entity in EntitiesTable
1692 *
1693 * TODO: the linear scan is really ugly, an hash table is really needed.
1694 *
1695 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1696 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001697const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001698htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001699 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001700
1701 for (i = 0;i < (sizeof(html40EntitiesTable)/
1702 sizeof(html40EntitiesTable[0]));i++) {
1703 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001704 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001705 }
1706 }
1707 return(NULL);
1708}
1709
1710/**
1711 * htmlEntityValueLookup:
1712 * @value: the entity's unicode value
1713 *
1714 * Lookup the given entity in EntitiesTable
1715 *
1716 * TODO: the linear scan is really ugly, an hash table is really needed.
1717 *
1718 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1719 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001720const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001721htmlEntityValueLookup(unsigned int value) {
1722 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001723
1724 for (i = 0;i < (sizeof(html40EntitiesTable)/
1725 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001726 if (html40EntitiesTable[i].value >= value) {
1727 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001728 break;
William M. Brack78637da2003-07-31 14:47:38 +00001729 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001730 }
Owen Taylor3473f882001-02-23 17:55:21 +00001731 }
1732 return(NULL);
1733}
1734
1735/**
1736 * UTF8ToHtml:
1737 * @out: a pointer to an array of bytes to store the result
1738 * @outlen: the length of @out
1739 * @in: a pointer to an array of UTF-8 chars
1740 * @inlen: the length of @in
1741 *
1742 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1743 * plus HTML entities block of chars out.
1744 *
1745 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1746 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001747 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001748 * The value of @outlen after return is the number of octets consumed.
1749 */
1750int
1751UTF8ToHtml(unsigned char* out, int *outlen,
1752 const unsigned char* in, int *inlen) {
1753 const unsigned char* processed = in;
1754 const unsigned char* outend;
1755 const unsigned char* outstart = out;
1756 const unsigned char* instart = in;
1757 const unsigned char* inend;
1758 unsigned int c, d;
1759 int trailing;
1760
1761 if (in == NULL) {
1762 /*
1763 * initialization nothing to do
1764 */
1765 *outlen = 0;
1766 *inlen = 0;
1767 return(0);
1768 }
1769 inend = in + (*inlen);
1770 outend = out + (*outlen);
1771 while (in < inend) {
1772 d = *in++;
1773 if (d < 0x80) { c= d; trailing= 0; }
1774 else if (d < 0xC0) {
1775 /* trailing byte in leading position */
1776 *outlen = out - outstart;
1777 *inlen = processed - instart;
1778 return(-2);
1779 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1780 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1781 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1782 else {
1783 /* no chance for this in Ascii */
1784 *outlen = out - outstart;
1785 *inlen = processed - instart;
1786 return(-2);
1787 }
1788
1789 if (inend - in < trailing) {
1790 break;
1791 }
1792
1793 for ( ; trailing; trailing--) {
1794 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1795 break;
1796 c <<= 6;
1797 c |= d & 0x3F;
1798 }
1799
1800 /* assertion: c is a single UTF-4 value */
1801 if (c < 0x80) {
1802 if (out + 1 >= outend)
1803 break;
1804 *out++ = c;
1805 } else {
1806 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001807 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001808
1809 /*
1810 * Try to lookup a predefined HTML entity for it
1811 */
1812
1813 ent = htmlEntityValueLookup(c);
1814 if (ent == NULL) {
1815 /* no chance for this in Ascii */
1816 *outlen = out - outstart;
1817 *inlen = processed - instart;
1818 return(-2);
1819 }
1820 len = strlen(ent->name);
1821 if (out + 2 + len >= outend)
1822 break;
1823 *out++ = '&';
1824 memcpy(out, ent->name, len);
1825 out += len;
1826 *out++ = ';';
1827 }
1828 processed = in;
1829 }
1830 *outlen = out - outstart;
1831 *inlen = processed - instart;
1832 return(0);
1833}
1834
1835/**
1836 * htmlEncodeEntities:
1837 * @out: a pointer to an array of bytes to store the result
1838 * @outlen: the length of @out
1839 * @in: a pointer to an array of UTF-8 chars
1840 * @inlen: the length of @in
1841 * @quoteChar: the quote character to escape (' or ") or zero.
1842 *
1843 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1844 * plus HTML entities block of chars out.
1845 *
1846 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1847 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001848 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001849 * The value of @outlen after return is the number of octets consumed.
1850 */
1851int
1852htmlEncodeEntities(unsigned char* out, int *outlen,
1853 const unsigned char* in, int *inlen, int quoteChar) {
1854 const unsigned char* processed = in;
1855 const unsigned char* outend = out + (*outlen);
1856 const unsigned char* outstart = out;
1857 const unsigned char* instart = in;
1858 const unsigned char* inend = in + (*inlen);
1859 unsigned int c, d;
1860 int trailing;
1861
1862 while (in < inend) {
1863 d = *in++;
1864 if (d < 0x80) { c= d; trailing= 0; }
1865 else if (d < 0xC0) {
1866 /* trailing byte in leading position */
1867 *outlen = out - outstart;
1868 *inlen = processed - instart;
1869 return(-2);
1870 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1871 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1872 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1873 else {
1874 /* no chance for this in Ascii */
1875 *outlen = out - outstart;
1876 *inlen = processed - instart;
1877 return(-2);
1878 }
1879
1880 if (inend - in < trailing)
1881 break;
1882
1883 while (trailing--) {
1884 if (((d= *in++) & 0xC0) != 0x80) {
1885 *outlen = out - outstart;
1886 *inlen = processed - instart;
1887 return(-2);
1888 }
1889 c <<= 6;
1890 c |= d & 0x3F;
1891 }
1892
1893 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001894 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1895 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001896 if (out >= outend)
1897 break;
1898 *out++ = c;
1899 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001900 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001901 const char *cp;
1902 char nbuf[16];
1903 int len;
1904
1905 /*
1906 * Try to lookup a predefined HTML entity for it
1907 */
1908 ent = htmlEntityValueLookup(c);
1909 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001910 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001911 cp = nbuf;
1912 }
1913 else
1914 cp = ent->name;
1915 len = strlen(cp);
1916 if (out + 2 + len > outend)
1917 break;
1918 *out++ = '&';
1919 memcpy(out, cp, len);
1920 out += len;
1921 *out++ = ';';
1922 }
1923 processed = in;
1924 }
1925 *outlen = out - outstart;
1926 *inlen = processed - instart;
1927 return(0);
1928}
1929
Owen Taylor3473f882001-02-23 17:55:21 +00001930/************************************************************************
1931 * *
1932 * Commodity functions to handle streams *
1933 * *
1934 ************************************************************************/
1935
1936/**
Owen Taylor3473f882001-02-23 17:55:21 +00001937 * htmlNewInputStream:
1938 * @ctxt: an HTML parser context
1939 *
1940 * Create a new input stream structure
1941 * Returns the new input stream or NULL
1942 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001943static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001944htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1945 htmlParserInputPtr input;
1946
1947 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1948 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001949 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001950 return(NULL);
1951 }
1952 memset(input, 0, sizeof(htmlParserInput));
1953 input->filename = NULL;
1954 input->directory = NULL;
1955 input->base = NULL;
1956 input->cur = NULL;
1957 input->buf = NULL;
1958 input->line = 1;
1959 input->col = 1;
1960 input->buf = NULL;
1961 input->free = NULL;
1962 input->version = NULL;
1963 input->consumed = 0;
1964 input->length = 0;
1965 return(input);
1966}
1967
1968
1969/************************************************************************
1970 * *
1971 * Commodity functions, cleanup needed ? *
1972 * *
1973 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but I don't want to risk any
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001988
1989/**
1990 * areBlanks:
1991 * @ctxt: an HTML parser context
1992 * @str: a xmlChar *
1993 * @len: the size of @str
1994 *
1995 * Is this a sequence of blank chars that one can ignore ?
1996 *
1997 * Returns 1 if ignorable 0 otherwise.
1998 */
1999
2000static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002001 unsigned int i;
2002 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002003 xmlNodePtr lastChild;
2004
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002005 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002006 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002007
2008 if (CUR == 0) return(1);
2009 if (CUR != '<') return(0);
2010 if (ctxt->name == NULL)
2011 return(1);
2012 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2013 return(1);
2014 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2015 return(1);
2016 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2017 return(1);
2018 if (ctxt->node == NULL) return(0);
2019 lastChild = xmlGetLastChild(ctxt->node);
2020 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002021 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2022 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002023 /* keep ws in constructs like ...<b> </b>...
2024 for all tags "b" allowing PCDATA */
2025 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2026 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2027 return(0);
2028 }
2029 }
Owen Taylor3473f882001-02-23 17:55:21 +00002030 } else if (xmlNodeIsText(lastChild)) {
2031 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002032 } else {
2033 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2034 for all tags "p" allowing PCDATA */
2035 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2036 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2037 return(0);
2038 }
2039 }
Owen Taylor3473f882001-02-23 17:55:21 +00002040 }
2041 return(1);
2042}
2043
2044/**
Owen Taylor3473f882001-02-23 17:55:21 +00002045 * htmlNewDocNoDtD:
2046 * @URI: URI for the dtd, or NULL
2047 * @ExternalID: the external ID of the DTD, or NULL
2048 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002049 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2050 * are NULL
2051 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002052 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002053 */
2054htmlDocPtr
2055htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2056 xmlDocPtr cur;
2057
2058 /*
2059 * Allocate a new document and fill the fields.
2060 */
2061 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2062 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002063 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002064 return(NULL);
2065 }
2066 memset(cur, 0, sizeof(xmlDoc));
2067
2068 cur->type = XML_HTML_DOCUMENT_NODE;
2069 cur->version = NULL;
2070 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002071 cur->doc = cur;
2072 cur->name = NULL;
2073 cur->children = NULL;
2074 cur->extSubset = NULL;
2075 cur->oldNs = NULL;
2076 cur->encoding = NULL;
2077 cur->standalone = 1;
2078 cur->compression = 0;
2079 cur->ids = NULL;
2080 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002081 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002082 if ((ExternalID != NULL) ||
2083 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002084 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002085 return(cur);
2086}
2087
2088/**
2089 * htmlNewDoc:
2090 * @URI: URI for the dtd, or NULL
2091 * @ExternalID: the external ID of the DTD, or NULL
2092 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002093 * Creates a new HTML document
2094 *
Owen Taylor3473f882001-02-23 17:55:21 +00002095 * Returns a new document
2096 */
2097htmlDocPtr
2098htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2099 if ((URI == NULL) && (ExternalID == NULL))
2100 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002101 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2102 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002103
2104 return(htmlNewDocNoDtD(URI, ExternalID));
2105}
2106
2107
2108/************************************************************************
2109 * *
2110 * The parser itself *
2111 * Relates to http://www.w3.org/TR/html40 *
2112 * *
2113 ************************************************************************/
2114
2115/************************************************************************
2116 * *
2117 * The parser itself *
2118 * *
2119 ************************************************************************/
2120
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002121static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002122
Owen Taylor3473f882001-02-23 17:55:21 +00002123/**
2124 * htmlParseHTMLName:
2125 * @ctxt: an HTML parser context
2126 *
2127 * parse an HTML tag or attribute name, note that we convert it to lowercase
2128 * since HTML names are not case-sensitive.
2129 *
2130 * Returns the Tag Name parsed or NULL
2131 */
2132
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002133static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002134htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002135 int i = 0;
2136 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2137
William M. Brack76e95df2003-10-18 16:20:14 +00002138 if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002139 (CUR != ':')) return(NULL);
2140
2141 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brack76e95df2003-10-18 16:20:14 +00002142 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002143 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2144 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2145 else loc[i] = CUR;
2146 i++;
2147
2148 NEXT;
2149 }
2150
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002151 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002152}
2153
2154/**
2155 * htmlParseName:
2156 * @ctxt: an HTML parser context
2157 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002158 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002159 *
2160 * Returns the Name parsed or NULL
2161 */
2162
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002163static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002164htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002165 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002166 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002167 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002168
2169 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002170
2171 /*
2172 * Accelerator for simple ASCII names
2173 */
2174 in = ctxt->input->cur;
2175 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2176 ((*in >= 0x41) && (*in <= 0x5A)) ||
2177 (*in == '_') || (*in == ':')) {
2178 in++;
2179 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2180 ((*in >= 0x41) && (*in <= 0x5A)) ||
2181 ((*in >= 0x30) && (*in <= 0x39)) ||
2182 (*in == '_') || (*in == '-') ||
2183 (*in == ':') || (*in == '.'))
2184 in++;
2185 if ((*in > 0) && (*in < 0x80)) {
2186 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002187 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002188 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002189 ctxt->nbChars += count;
2190 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002191 return(ret);
2192 }
2193 }
2194 return(htmlParseNameComplex(ctxt));
2195}
2196
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002197static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002198htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002199 int len = 0, l;
2200 int c;
2201 int count = 0;
2202
2203 /*
2204 * Handler for more complex cases
2205 */
2206 GROW;
2207 c = CUR_CHAR(l);
2208 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2209 (!IS_LETTER(c) && (c != '_') &&
2210 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002211 return(NULL);
2212 }
2213
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002214 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2215 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2216 (c == '.') || (c == '-') ||
2217 (c == '_') || (c == ':') ||
2218 (IS_COMBINING(c)) ||
2219 (IS_EXTENDER(c)))) {
2220 if (count++ > 100) {
2221 count = 0;
2222 GROW;
2223 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002224 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002225 NEXTL(l);
2226 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002227 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002228 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002229}
2230
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002231
Owen Taylor3473f882001-02-23 17:55:21 +00002232/**
2233 * htmlParseHTMLAttribute:
2234 * @ctxt: an HTML parser context
2235 * @stop: a char stop value
2236 *
2237 * parse an HTML attribute value till the stop (quote), if
2238 * stop is 0 then it stops at the first space
2239 *
2240 * Returns the attribute parsed or NULL
2241 */
2242
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002243static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002244htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2245 xmlChar *buffer = NULL;
2246 int buffer_size = 0;
2247 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002248 const xmlChar *name = NULL;
2249 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002250 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002251
2252 /*
2253 * allocate a translation buffer.
2254 */
2255 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002256 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002257 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002258 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002259 return(NULL);
2260 }
2261 out = buffer;
2262
2263 /*
2264 * Ok loop until we reach one of the ending chars
2265 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002266 while ((CUR != 0) && (CUR != stop)) {
2267 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002268 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002269 if (CUR == '&') {
2270 if (NXT(1) == '#') {
2271 unsigned int c;
2272 int bits;
2273
2274 c = htmlParseCharRef(ctxt);
2275 if (c < 0x80)
2276 { *out++ = c; bits= -6; }
2277 else if (c < 0x800)
2278 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2279 else if (c < 0x10000)
2280 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2281 else
2282 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2283
2284 for ( ; bits >= 0; bits-= 6) {
2285 *out++ = ((c >> bits) & 0x3F) | 0x80;
2286 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002287
2288 if (out - buffer > buffer_size - 100) {
2289 int indx = out - buffer;
2290
2291 growBuffer(buffer);
2292 out = &buffer[indx];
2293 }
Owen Taylor3473f882001-02-23 17:55:21 +00002294 } else {
2295 ent = htmlParseEntityRef(ctxt, &name);
2296 if (name == NULL) {
2297 *out++ = '&';
2298 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002299 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002300
2301 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002302 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002303 }
2304 } else if (ent == NULL) {
2305 *out++ = '&';
2306 cur = name;
2307 while (*cur != 0) {
2308 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002309 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002310
2311 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002312 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002313 }
2314 *out++ = *cur++;
2315 }
Owen Taylor3473f882001-02-23 17:55:21 +00002316 } else {
2317 unsigned int c;
2318 int bits;
2319
2320 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002321 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002322
2323 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002324 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002325 }
2326 c = (xmlChar)ent->value;
2327 if (c < 0x80)
2328 { *out++ = c; bits= -6; }
2329 else if (c < 0x800)
2330 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2331 else if (c < 0x10000)
2332 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2333 else
2334 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2335
2336 for ( ; bits >= 0; bits-= 6) {
2337 *out++ = ((c >> bits) & 0x3F) | 0x80;
2338 }
Owen Taylor3473f882001-02-23 17:55:21 +00002339 }
2340 }
2341 } else {
2342 unsigned int c;
2343 int bits, l;
2344
2345 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002346 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002347
2348 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002349 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002350 }
2351 c = CUR_CHAR(l);
2352 if (c < 0x80)
2353 { *out++ = c; bits= -6; }
2354 else if (c < 0x800)
2355 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2356 else if (c < 0x10000)
2357 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2358 else
2359 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2360
2361 for ( ; bits >= 0; bits-= 6) {
2362 *out++ = ((c >> bits) & 0x3F) | 0x80;
2363 }
2364 NEXT;
2365 }
2366 }
2367 *out++ = 0;
2368 return(buffer);
2369}
2370
2371/**
Owen Taylor3473f882001-02-23 17:55:21 +00002372 * htmlParseEntityRef:
2373 * @ctxt: an HTML parser context
2374 * @str: location to store the entity name
2375 *
2376 * parse an HTML ENTITY references
2377 *
2378 * [68] EntityRef ::= '&' Name ';'
2379 *
2380 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2381 * if non-NULL *str will have to be freed by the caller.
2382 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002383const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002384htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2385 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002386 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002387 *str = NULL;
2388
2389 if (CUR == '&') {
2390 NEXT;
2391 name = htmlParseName(ctxt);
2392 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002393 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2394 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002395 } else {
2396 GROW;
2397 if (CUR == ';') {
2398 *str = name;
2399
2400 /*
2401 * Lookup the entity in the table.
2402 */
2403 ent = htmlEntityLookup(name);
2404 if (ent != NULL) /* OK that's ugly !!! */
2405 NEXT;
2406 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002407 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2408 "htmlParseEntityRef: expecting ';'\n",
2409 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002410 *str = name;
2411 }
2412 }
2413 }
2414 return(ent);
2415}
2416
2417/**
2418 * htmlParseAttValue:
2419 * @ctxt: an HTML parser context
2420 *
2421 * parse a value for an attribute
2422 * Note: the parser won't do substitution of entities here, this
2423 * will be handled later in xmlStringGetNodeList, unless it was
2424 * asked for ctxt->replaceEntities != 0
2425 *
2426 * Returns the AttValue parsed or NULL.
2427 */
2428
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002429static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002430htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2431 xmlChar *ret = NULL;
2432
2433 if (CUR == '"') {
2434 NEXT;
2435 ret = htmlParseHTMLAttribute(ctxt, '"');
2436 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002437 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2438 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002439 } else
2440 NEXT;
2441 } else if (CUR == '\'') {
2442 NEXT;
2443 ret = htmlParseHTMLAttribute(ctxt, '\'');
2444 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002445 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2446 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002447 } else
2448 NEXT;
2449 } else {
2450 /*
2451 * That's an HTMLism, the attribute value may not be quoted
2452 */
2453 ret = htmlParseHTMLAttribute(ctxt, 0);
2454 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002455 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2456 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002457 }
2458 }
2459 return(ret);
2460}
2461
2462/**
2463 * htmlParseSystemLiteral:
2464 * @ctxt: an HTML parser context
2465 *
2466 * parse an HTML Literal
2467 *
2468 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2469 *
2470 * Returns the SystemLiteral parsed or NULL
2471 */
2472
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002473static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002474htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2475 const xmlChar *q;
2476 xmlChar *ret = NULL;
2477
2478 if (CUR == '"') {
2479 NEXT;
2480 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002481 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002482 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002483 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002484 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2485 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002486 } else {
2487 ret = xmlStrndup(q, CUR_PTR - q);
2488 NEXT;
2489 }
2490 } else if (CUR == '\'') {
2491 NEXT;
2492 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002493 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002494 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002495 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002496 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2497 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002498 } else {
2499 ret = xmlStrndup(q, CUR_PTR - q);
2500 NEXT;
2501 }
2502 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002503 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2504 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002505 }
2506
2507 return(ret);
2508}
2509
2510/**
2511 * htmlParsePubidLiteral:
2512 * @ctxt: an HTML parser context
2513 *
2514 * parse an HTML public literal
2515 *
2516 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2517 *
2518 * Returns the PubidLiteral parsed or NULL.
2519 */
2520
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002521static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002522htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2523 const xmlChar *q;
2524 xmlChar *ret = NULL;
2525 /*
2526 * Name ::= (Letter | '_') (NameChar)*
2527 */
2528 if (CUR == '"') {
2529 NEXT;
2530 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002531 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002532 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002533 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2534 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002535 } else {
2536 ret = xmlStrndup(q, CUR_PTR - q);
2537 NEXT;
2538 }
2539 } else if (CUR == '\'') {
2540 NEXT;
2541 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002542 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002543 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002544 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002545 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2546 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002547 } else {
2548 ret = xmlStrndup(q, CUR_PTR - q);
2549 NEXT;
2550 }
2551 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002552 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2553 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002554 }
2555
2556 return(ret);
2557}
2558
2559/**
2560 * htmlParseScript:
2561 * @ctxt: an HTML parser context
2562 *
2563 * parse the content of an HTML SCRIPT or STYLE element
2564 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2565 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2566 * http://www.w3.org/TR/html4/types.html#type-script
2567 * http://www.w3.org/TR/html4/types.html#h-6.15
2568 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2569 *
2570 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2571 * element and the value of intrinsic event attributes. User agents must
2572 * not evaluate script data as HTML markup but instead must pass it on as
2573 * data to a script engine.
2574 * NOTES:
2575 * - The content is passed like CDATA
2576 * - the attributes for style and scripting "onXXX" are also described
2577 * as CDATA but SGML allows entities references in attributes so their
2578 * processing is identical as other attributes
2579 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002580static void
Owen Taylor3473f882001-02-23 17:55:21 +00002581htmlParseScript(htmlParserCtxtPtr ctxt) {
2582 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2583 int nbchar = 0;
2584 xmlChar cur;
2585
2586 SHRINK;
2587 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002588 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002589 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2590 (NXT(3) == '-')) {
2591 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2592 if (ctxt->sax->cdataBlock!= NULL) {
2593 /*
2594 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2595 */
2596 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002597 } else if (ctxt->sax->characters != NULL) {
2598 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002599 }
2600 }
2601 nbchar = 0;
2602 htmlParseComment(ctxt);
2603 cur = CUR;
2604 continue;
2605 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002606 /*
2607 * One should break here, the specification is clear:
2608 * Authors should therefore escape "</" within the content.
2609 * Escape mechanisms are specific to each scripting or
2610 * style sheet language.
2611 */
2612 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2613 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2614 break; /* while */
2615 }
2616 buf[nbchar++] = cur;
2617 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2618 if (ctxt->sax->cdataBlock!= NULL) {
2619 /*
2620 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2621 */
2622 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002623 } else if (ctxt->sax->characters != NULL) {
2624 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002625 }
2626 nbchar = 0;
2627 }
2628 NEXT;
2629 cur = CUR;
2630 }
William M. Brack76e95df2003-10-18 16:20:14 +00002631 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002632 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2633 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002634 NEXT;
2635 }
2636
2637 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2638 if (ctxt->sax->cdataBlock!= NULL) {
2639 /*
2640 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2641 */
2642 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002643 } else if (ctxt->sax->characters != NULL) {
2644 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002645 }
2646 }
2647}
2648
2649
2650/**
2651 * htmlParseCharData:
2652 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002653 *
2654 * parse a CharData section.
2655 * if we are within a CDATA section ']]>' marks an end of section.
2656 *
2657 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2658 */
2659
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002660static void
2661htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002662 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2663 int nbchar = 0;
2664 int cur, l;
2665
2666 SHRINK;
2667 cur = CUR_CHAR(l);
2668 while (((cur != '<') || (ctxt->token == '<')) &&
2669 ((cur != '&') || (ctxt->token == '&')) &&
2670 (IS_CHAR(cur))) {
2671 COPY_BUF(l,buf,nbchar,cur);
2672 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2673 /*
2674 * Ok the segment is to be consumed as chars.
2675 */
2676 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2677 if (areBlanks(ctxt, buf, nbchar)) {
2678 if (ctxt->sax->ignorableWhitespace != NULL)
2679 ctxt->sax->ignorableWhitespace(ctxt->userData,
2680 buf, nbchar);
2681 } else {
2682 htmlCheckParagraph(ctxt);
2683 if (ctxt->sax->characters != NULL)
2684 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2685 }
2686 }
2687 nbchar = 0;
2688 }
2689 NEXTL(l);
2690 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002691 if (cur == 0) {
2692 SHRINK;
2693 GROW;
2694 cur = CUR_CHAR(l);
2695 }
Owen Taylor3473f882001-02-23 17:55:21 +00002696 }
2697 if (nbchar != 0) {
2698 /*
2699 * Ok the segment is to be consumed as chars.
2700 */
2701 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2702 if (areBlanks(ctxt, buf, nbchar)) {
2703 if (ctxt->sax->ignorableWhitespace != NULL)
2704 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2705 } else {
2706 htmlCheckParagraph(ctxt);
2707 if (ctxt->sax->characters != NULL)
2708 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2709 }
2710 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002711 } else {
2712 /*
2713 * Loop detection
2714 */
2715 if (cur == 0)
2716 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002717 }
2718}
2719
2720/**
2721 * htmlParseExternalID:
2722 * @ctxt: an HTML parser context
2723 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002724 *
2725 * Parse an External ID or a Public ID
2726 *
Owen Taylor3473f882001-02-23 17:55:21 +00002727 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2728 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2729 *
2730 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2731 *
2732 * Returns the function returns SystemLiteral and in the second
2733 * case publicID receives PubidLiteral, is strict is off
2734 * it is possible to return NULL and have publicID set.
2735 */
2736
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002737static xmlChar *
2738htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002739 xmlChar *URI = NULL;
2740
2741 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2742 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2743 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2744 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002745 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002746 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2747 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002748 }
2749 SKIP_BLANKS;
2750 URI = htmlParseSystemLiteral(ctxt);
2751 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002752 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2753 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002754 }
2755 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2756 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2757 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2758 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002759 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002760 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2761 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002762 }
2763 SKIP_BLANKS;
2764 *publicID = htmlParsePubidLiteral(ctxt);
2765 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002766 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2767 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2768 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002769 }
2770 SKIP_BLANKS;
2771 if ((CUR == '"') || (CUR == '\'')) {
2772 URI = htmlParseSystemLiteral(ctxt);
2773 }
2774 }
2775 return(URI);
2776}
2777
2778/**
2779 * htmlParseComment:
2780 * @ctxt: an HTML parser context
2781 *
2782 * Parse an XML (SGML) comment <!-- .... -->
2783 *
2784 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2785 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002786static void
Owen Taylor3473f882001-02-23 17:55:21 +00002787htmlParseComment(htmlParserCtxtPtr ctxt) {
2788 xmlChar *buf = NULL;
2789 int len;
2790 int size = HTML_PARSER_BUFFER_SIZE;
2791 int q, ql;
2792 int r, rl;
2793 int cur, l;
2794 xmlParserInputState state;
2795
2796 /*
2797 * Check that there is a comment right here.
2798 */
2799 if ((RAW != '<') || (NXT(1) != '!') ||
2800 (NXT(2) != '-') || (NXT(3) != '-')) return;
2801
2802 state = ctxt->instate;
2803 ctxt->instate = XML_PARSER_COMMENT;
2804 SHRINK;
2805 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002806 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002807 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002808 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002809 ctxt->instate = state;
2810 return;
2811 }
2812 q = CUR_CHAR(ql);
2813 NEXTL(ql);
2814 r = CUR_CHAR(rl);
2815 NEXTL(rl);
2816 cur = CUR_CHAR(l);
2817 len = 0;
2818 while (IS_CHAR(cur) &&
2819 ((cur != '>') ||
2820 (r != '-') || (q != '-'))) {
2821 if (len + 5 >= size) {
2822 size *= 2;
2823 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2824 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002825 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002826 ctxt->instate = state;
2827 return;
2828 }
2829 }
2830 COPY_BUF(ql,buf,len,q);
2831 q = r;
2832 ql = rl;
2833 r = cur;
2834 rl = l;
2835 NEXTL(l);
2836 cur = CUR_CHAR(l);
2837 if (cur == 0) {
2838 SHRINK;
2839 GROW;
2840 cur = CUR_CHAR(l);
2841 }
2842 }
2843 buf[len] = 0;
2844 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002845 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2846 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002847 xmlFree(buf);
2848 } else {
2849 NEXT;
2850 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2851 (!ctxt->disableSAX))
2852 ctxt->sax->comment(ctxt->userData, buf);
2853 xmlFree(buf);
2854 }
2855 ctxt->instate = state;
2856}
2857
2858/**
2859 * htmlParseCharRef:
2860 * @ctxt: an HTML parser context
2861 *
2862 * parse Reference declarations
2863 *
2864 * [66] CharRef ::= '&#' [0-9]+ ';' |
2865 * '&#x' [0-9a-fA-F]+ ';'
2866 *
2867 * Returns the value parsed (as an int)
2868 */
2869int
2870htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2871 int val = 0;
2872
2873 if ((CUR == '&') && (NXT(1) == '#') &&
2874 (NXT(2) == 'x')) {
2875 SKIP(3);
2876 while (CUR != ';') {
2877 if ((CUR >= '0') && (CUR <= '9'))
2878 val = val * 16 + (CUR - '0');
2879 else if ((CUR >= 'a') && (CUR <= 'f'))
2880 val = val * 16 + (CUR - 'a') + 10;
2881 else if ((CUR >= 'A') && (CUR <= 'F'))
2882 val = val * 16 + (CUR - 'A') + 10;
2883 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002884 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2885 "htmlParseCharRef: invalid hexadecimal value\n",
2886 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002887 return(0);
2888 }
2889 NEXT;
2890 }
2891 if (CUR == ';')
2892 NEXT;
2893 } else if ((CUR == '&') && (NXT(1) == '#')) {
2894 SKIP(2);
2895 while (CUR != ';') {
2896 if ((CUR >= '0') && (CUR <= '9'))
2897 val = val * 10 + (CUR - '0');
2898 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002899 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2900 "htmlParseCharRef: invalid decimal value\n",
2901 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002902 return(0);
2903 }
2904 NEXT;
2905 }
2906 if (CUR == ';')
2907 NEXT;
2908 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002909 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2910 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002911 }
2912 /*
2913 * Check the value IS_CHAR ...
2914 */
2915 if (IS_CHAR(val)) {
2916 return(val);
2917 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002918 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2919 "htmlParseCharRef: invalid xmlChar value %d\n",
2920 val);
Owen Taylor3473f882001-02-23 17:55:21 +00002921 }
2922 return(0);
2923}
2924
2925
2926/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002927 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002928 * @ctxt: an HTML parser context
2929 *
2930 * parse a DOCTYPE declaration
2931 *
2932 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2933 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2934 */
2935
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002936static void
Owen Taylor3473f882001-02-23 17:55:21 +00002937htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002938 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00002939 xmlChar *ExternalID = NULL;
2940 xmlChar *URI = NULL;
2941
2942 /*
2943 * We know that '<!DOCTYPE' has been detected.
2944 */
2945 SKIP(9);
2946
2947 SKIP_BLANKS;
2948
2949 /*
2950 * Parse the DOCTYPE name.
2951 */
2952 name = htmlParseName(ctxt);
2953 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002954 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2955 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
2956 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002957 }
2958 /*
2959 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2960 */
2961
2962 SKIP_BLANKS;
2963
2964 /*
2965 * Check for SystemID and ExternalID
2966 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002967 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002968 SKIP_BLANKS;
2969
2970 /*
2971 * We should be at the end of the DOCTYPE declaration.
2972 */
2973 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002974 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
2975 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002976 /* We shouldn't try to resynchronize ... */
2977 }
2978 NEXT;
2979
2980 /*
2981 * Create or update the document accordingly to the DOCTYPE
2982 */
2983 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2984 (!ctxt->disableSAX))
2985 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2986
2987 /*
2988 * Cleanup, since we don't use all those identifiers
2989 */
2990 if (URI != NULL) xmlFree(URI);
2991 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002992}
2993
2994/**
2995 * htmlParseAttribute:
2996 * @ctxt: an HTML parser context
2997 * @value: a xmlChar ** used to store the value of the attribute
2998 *
2999 * parse an attribute
3000 *
3001 * [41] Attribute ::= Name Eq AttValue
3002 *
3003 * [25] Eq ::= S? '=' S?
3004 *
3005 * With namespace:
3006 *
3007 * [NS 11] Attribute ::= QName Eq AttValue
3008 *
3009 * Also the case QName == xmlns:??? is handled independently as a namespace
3010 * definition.
3011 *
3012 * Returns the attribute name, and the value in *value.
3013 */
3014
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003015static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003016htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003017 const xmlChar *name;
3018 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003019
3020 *value = NULL;
3021 name = htmlParseHTMLName(ctxt);
3022 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003023 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3024 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003025 return(NULL);
3026 }
3027
3028 /*
3029 * read the value
3030 */
3031 SKIP_BLANKS;
3032 if (CUR == '=') {
3033 NEXT;
3034 SKIP_BLANKS;
3035 val = htmlParseAttValue(ctxt);
3036 /******
3037 } else {
3038 * TODO : some attribute must have values, some may not
3039 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3040 ctxt->sax->warning(ctxt->userData,
3041 "No value for attribute %s\n", name); */
3042 }
3043
3044 *value = val;
3045 return(name);
3046}
3047
3048/**
3049 * htmlCheckEncoding:
3050 * @ctxt: an HTML parser context
3051 * @attvalue: the attribute value
3052 *
3053 * Checks an http-equiv attribute from a Meta tag to detect
3054 * the encoding
3055 * If a new encoding is detected the parser is switched to decode
3056 * it and pass UTF8
3057 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003058static void
Owen Taylor3473f882001-02-23 17:55:21 +00003059htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3060 const xmlChar *encoding;
3061
3062 if ((ctxt == NULL) || (attvalue == NULL))
3063 return;
3064
3065 /* do not change encoding */
3066 if (ctxt->input->encoding != NULL)
3067 return;
3068
3069 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3070 if (encoding != NULL) {
3071 encoding += 8;
3072 } else {
3073 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3074 if (encoding != NULL)
3075 encoding += 9;
3076 }
3077 if (encoding != NULL) {
3078 xmlCharEncoding enc;
3079 xmlCharEncodingHandlerPtr handler;
3080
3081 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3082
3083 if (ctxt->input->encoding != NULL)
3084 xmlFree((xmlChar *) ctxt->input->encoding);
3085 ctxt->input->encoding = xmlStrdup(encoding);
3086
3087 enc = xmlParseCharEncoding((const char *) encoding);
3088 /*
3089 * registered set of known encodings
3090 */
3091 if (enc != XML_CHAR_ENCODING_ERROR) {
3092 xmlSwitchEncoding(ctxt, enc);
3093 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3094 } else {
3095 /*
3096 * fallback for unknown encodings
3097 */
3098 handler = xmlFindCharEncodingHandler((const char *) encoding);
3099 if (handler != NULL) {
3100 xmlSwitchToEncoding(ctxt, handler);
3101 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3102 } else {
3103 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3104 }
3105 }
3106
3107 if ((ctxt->input->buf != NULL) &&
3108 (ctxt->input->buf->encoder != NULL) &&
3109 (ctxt->input->buf->raw != NULL) &&
3110 (ctxt->input->buf->buffer != NULL)) {
3111 int nbchars;
3112 int processed;
3113
3114 /*
3115 * convert as much as possible to the parser reading buffer.
3116 */
3117 processed = ctxt->input->cur - ctxt->input->base;
3118 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3119 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3120 ctxt->input->buf->buffer,
3121 ctxt->input->buf->raw);
3122 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003123 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3124 "htmlCheckEncoding: encoder error\n",
3125 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003126 }
3127 ctxt->input->base =
3128 ctxt->input->cur = ctxt->input->buf->buffer->content;
3129 }
3130 }
3131}
3132
3133/**
3134 * htmlCheckMeta:
3135 * @ctxt: an HTML parser context
3136 * @atts: the attributes values
3137 *
3138 * Checks an attributes from a Meta tag
3139 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003140static void
Owen Taylor3473f882001-02-23 17:55:21 +00003141htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3142 int i;
3143 const xmlChar *att, *value;
3144 int http = 0;
3145 const xmlChar *content = NULL;
3146
3147 if ((ctxt == NULL) || (atts == NULL))
3148 return;
3149
3150 i = 0;
3151 att = atts[i++];
3152 while (att != NULL) {
3153 value = atts[i++];
3154 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3155 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3156 http = 1;
3157 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3158 content = value;
3159 att = atts[i++];
3160 }
3161 if ((http) && (content != NULL))
3162 htmlCheckEncoding(ctxt, content);
3163
3164}
3165
3166/**
3167 * htmlParseStartTag:
3168 * @ctxt: an HTML parser context
3169 *
3170 * parse a start of tag either for rule element or
3171 * EmptyElement. In both case we don't parse the tag closing chars.
3172 *
3173 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3174 *
3175 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3176 *
3177 * With namespace:
3178 *
3179 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3180 *
3181 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3182 *
3183 */
3184
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003185static void
Owen Taylor3473f882001-02-23 17:55:21 +00003186htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003187 const xmlChar *name;
3188 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003189 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003190 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003191 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003192 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003193 int meta = 0;
3194 int i;
3195
3196 if (CUR != '<') return;
3197 NEXT;
3198
3199 GROW;
3200 name = htmlParseHTMLName(ctxt);
3201 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003202 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3203 "htmlParseStartTag: invalid element name\n",
3204 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003205 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003206 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003207 NEXT;
3208 return;
3209 }
3210 if (xmlStrEqual(name, BAD_CAST"meta"))
3211 meta = 1;
3212
3213 /*
3214 * Check for auto-closure of HTML elements.
3215 */
3216 htmlAutoClose(ctxt, name);
3217
3218 /*
3219 * Check for implied HTML elements.
3220 */
3221 htmlCheckImplied(ctxt, name);
3222
3223 /*
3224 * Avoid html at any level > 0, head at any level != 1
3225 * or any attempt to recurse body
3226 */
3227 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003228 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3229 "htmlParseStartTag: misplaced <html> tag\n",
3230 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003231 return;
3232 }
3233 if ((ctxt->nameNr != 1) &&
3234 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003235 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3236 "htmlParseStartTag: misplaced <head> tag\n",
3237 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003238 return;
3239 }
3240 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003241 int indx;
3242 for (indx = 0;indx < ctxt->nameNr;indx++) {
3243 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003244 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3245 "htmlParseStartTag: misplaced <body> tag\n",
3246 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003247 return;
3248 }
3249 }
3250 }
3251
3252 /*
3253 * Now parse the attributes, it ends up with the ending
3254 *
3255 * (S Attribute)* S?
3256 */
3257 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003258 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003259 (CUR != '>') &&
3260 ((CUR != '/') || (NXT(1) != '>'))) {
3261 long cons = ctxt->nbChars;
3262
3263 GROW;
3264 attname = htmlParseAttribute(ctxt, &attvalue);
3265 if (attname != NULL) {
3266
3267 /*
3268 * Well formedness requires at most one declaration of an attribute
3269 */
3270 for (i = 0; i < nbatts;i += 2) {
3271 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003272 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3273 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003274 if (attvalue != NULL)
3275 xmlFree(attvalue);
3276 goto failed;
3277 }
3278 }
3279
3280 /*
3281 * Add the pair to atts
3282 */
3283 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003284 maxatts = 22; /* allow for 10 attrs by default */
3285 atts = (const xmlChar **)
3286 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003287 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003288 htmlErrMemory(ctxt, NULL);
3289 if (attvalue != NULL)
3290 xmlFree(attvalue);
3291 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003292 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003293 ctxt->atts = atts;
3294 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003295 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003296 const xmlChar **n;
3297
Owen Taylor3473f882001-02-23 17:55:21 +00003298 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003299 n = (const xmlChar **) xmlRealloc((void *) atts,
3300 maxatts * sizeof(const xmlChar *));
3301 if (n == NULL) {
3302 htmlErrMemory(ctxt, NULL);
3303 if (attvalue != NULL)
3304 xmlFree(attvalue);
3305 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003306 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003307 atts = n;
3308 ctxt->atts = atts;
3309 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003310 }
3311 atts[nbatts++] = attname;
3312 atts[nbatts++] = attvalue;
3313 atts[nbatts] = NULL;
3314 atts[nbatts + 1] = NULL;
3315 }
3316 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003317 if (attvalue != NULL)
3318 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003319 /* Dump the bogus attribute string up to the next blank or
3320 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003321 while ((IS_CHAR_CH(CUR)) &&
3322 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003323 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003324 NEXT;
3325 }
3326
3327failed:
3328 SKIP_BLANKS;
3329 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003330 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3331 "htmlParseStartTag: problem parsing attributes\n",
3332 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003333 break;
3334 }
3335 }
3336
3337 /*
3338 * Handle specific association to the META tag
3339 */
3340 if (meta)
3341 htmlCheckMeta(ctxt, atts);
3342
3343 /*
3344 * SAX: Start of Element !
3345 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003346 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003347 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3348 if (nbatts != 0)
3349 ctxt->sax->startElement(ctxt->userData, name, atts);
3350 else
3351 ctxt->sax->startElement(ctxt->userData, name, NULL);
3352 }
Owen Taylor3473f882001-02-23 17:55:21 +00003353
3354 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003355 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003356 if (atts[i] != NULL)
3357 xmlFree((xmlChar *) atts[i]);
3358 }
Owen Taylor3473f882001-02-23 17:55:21 +00003359 }
Owen Taylor3473f882001-02-23 17:55:21 +00003360}
3361
3362/**
3363 * htmlParseEndTag:
3364 * @ctxt: an HTML parser context
3365 *
3366 * parse an end of tag
3367 *
3368 * [42] ETag ::= '</' Name S? '>'
3369 *
3370 * With namespace
3371 *
3372 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003373 *
3374 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003375 */
3376
Daniel Veillardf420ac52001-07-04 16:04:09 +00003377static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003378htmlParseEndTag(htmlParserCtxtPtr ctxt)
3379{
3380 const xmlChar *name;
3381 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003382 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003383
3384 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003385 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3386 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003387 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003388 }
3389 SKIP(2);
3390
3391 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003392 if (name == NULL)
3393 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003394
3395 /*
3396 * We should definitely be at the ending "S? '>'" part
3397 */
3398 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003399 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003400 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3401 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003402 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003403 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003404
3405 /*
3406 * If the name read is not one of the element in the parsing stack
3407 * then return, it's just an error.
3408 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003409 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3410 if (xmlStrEqual(name, ctxt->nameTab[i]))
3411 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003412 }
3413 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003414 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3415 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003416 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003417 }
3418
3419
3420 /*
3421 * Check for auto-closure of HTML elements.
3422 */
3423
3424 htmlAutoCloseOnClose(ctxt, name);
3425
3426 /*
3427 * Well formedness constraints, opening and closing must match.
3428 * With the exception that the autoclose may have popped stuff out
3429 * of the stack.
3430 */
3431 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003432 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003433 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3434 "Opening and ending tag mismatch: %s and %s\n",
3435 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003436 }
3437 }
3438
3439 /*
3440 * SAX: End of Tag
3441 */
3442 oldname = ctxt->name;
3443 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003444 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3445 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003446 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003447 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003448 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003449 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003450 }
3451
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003452 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003453}
3454
3455
3456/**
3457 * htmlParseReference:
3458 * @ctxt: an HTML parser context
3459 *
3460 * parse and handle entity references in content,
3461 * this will end-up in a call to character() since this is either a
3462 * CharRef, or a predefined entity.
3463 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003464static void
Owen Taylor3473f882001-02-23 17:55:21 +00003465htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003466 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003467 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003468 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003469 if (CUR != '&') return;
3470
3471 if (NXT(1) == '#') {
3472 unsigned int c;
3473 int bits, i = 0;
3474
3475 c = htmlParseCharRef(ctxt);
3476 if (c == 0)
3477 return;
3478
3479 if (c < 0x80) { out[i++]= c; bits= -6; }
3480 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3481 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3482 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3483
3484 for ( ; bits >= 0; bits-= 6) {
3485 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3486 }
3487 out[i] = 0;
3488
3489 htmlCheckParagraph(ctxt);
3490 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3491 ctxt->sax->characters(ctxt->userData, out, i);
3492 } else {
3493 ent = htmlParseEntityRef(ctxt, &name);
3494 if (name == NULL) {
3495 htmlCheckParagraph(ctxt);
3496 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3497 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3498 return;
3499 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003500 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003501 htmlCheckParagraph(ctxt);
3502 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3503 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3504 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3505 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3506 }
3507 } else {
3508 unsigned int c;
3509 int bits, i = 0;
3510
3511 c = ent->value;
3512 if (c < 0x80)
3513 { out[i++]= c; bits= -6; }
3514 else if (c < 0x800)
3515 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3516 else if (c < 0x10000)
3517 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3518 else
3519 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3520
3521 for ( ; bits >= 0; bits-= 6) {
3522 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3523 }
3524 out[i] = 0;
3525
3526 htmlCheckParagraph(ctxt);
3527 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3528 ctxt->sax->characters(ctxt->userData, out, i);
3529 }
Owen Taylor3473f882001-02-23 17:55:21 +00003530 }
3531}
3532
3533/**
3534 * htmlParseContent:
3535 * @ctxt: an HTML parser context
3536 * @name: the node name
3537 *
3538 * Parse a content: comment, sub-element, reference or text.
3539 *
3540 */
3541
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003542static void
Owen Taylor3473f882001-02-23 17:55:21 +00003543htmlParseContent(htmlParserCtxtPtr ctxt) {
3544 xmlChar *currentNode;
3545 int depth;
3546
3547 currentNode = xmlStrdup(ctxt->name);
3548 depth = ctxt->nameNr;
3549 while (1) {
3550 long cons = ctxt->nbChars;
3551
3552 GROW;
3553 /*
3554 * Our tag or one of it's parent or children is ending.
3555 */
3556 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003557 if (htmlParseEndTag(ctxt) &&
3558 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3559 if (currentNode != NULL)
3560 xmlFree(currentNode);
3561 return;
3562 }
3563 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003564 }
3565
3566 /*
3567 * Has this node been popped out during parsing of
3568 * the next element
3569 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003570 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3571 (!xmlStrEqual(currentNode, ctxt->name)))
3572 {
Owen Taylor3473f882001-02-23 17:55:21 +00003573 if (currentNode != NULL) xmlFree(currentNode);
3574 return;
3575 }
3576
Daniel Veillardf9533d12001-03-03 10:04:57 +00003577 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3578 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003579 /*
3580 * Handle SCRIPT/STYLE separately
3581 */
3582 htmlParseScript(ctxt);
3583 } else {
3584 /*
3585 * Sometimes DOCTYPE arrives in the middle of the document
3586 */
3587 if ((CUR == '<') && (NXT(1) == '!') &&
3588 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3589 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3590 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3591 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003592 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3593 "Misplaced DOCTYPE declaration\n",
3594 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003595 htmlParseDocTypeDecl(ctxt);
3596 }
3597
3598 /*
3599 * First case : a comment
3600 */
3601 if ((CUR == '<') && (NXT(1) == '!') &&
3602 (NXT(2) == '-') && (NXT(3) == '-')) {
3603 htmlParseComment(ctxt);
3604 }
3605
3606 /*
3607 * Second case : a sub-element.
3608 */
3609 else if (CUR == '<') {
3610 htmlParseElement(ctxt);
3611 }
3612
3613 /*
3614 * Third case : a reference. If if has not been resolved,
3615 * parsing returns it's Name, create the node
3616 */
3617 else if (CUR == '&') {
3618 htmlParseReference(ctxt);
3619 }
3620
3621 /*
3622 * Fourth : end of the resource
3623 */
3624 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003625 htmlAutoCloseOnEnd(ctxt);
3626 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003627 }
3628
3629 /*
3630 * Last case, text. Note that References are handled directly.
3631 */
3632 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003633 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003634 }
3635
3636 if (cons == ctxt->nbChars) {
3637 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003638 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3639 "detected an error in element content\n",
3640 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003641 }
3642 break;
3643 }
3644 }
3645 GROW;
3646 }
3647 if (currentNode != NULL) xmlFree(currentNode);
3648}
3649
3650/**
3651 * htmlParseElement:
3652 * @ctxt: an HTML parser context
3653 *
3654 * parse an HTML element, this is highly recursive
3655 *
3656 * [39] element ::= EmptyElemTag | STag content ETag
3657 *
3658 * [41] Attribute ::= Name Eq AttValue
3659 */
3660
3661void
3662htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003663 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003664 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003665 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003666 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003667 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003668 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003669 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003670
3671 /* Capture start position */
3672 if (ctxt->record_info) {
3673 node_info.begin_pos = ctxt->input->consumed +
3674 (CUR_PTR - ctxt->input->base);
3675 node_info.begin_line = ctxt->input->line;
3676 }
3677
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003678 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003679 htmlParseStartTag(ctxt);
3680 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003681 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3682 (name == NULL)) {
3683 if (CUR == '>')
3684 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003685 return;
3686 }
Owen Taylor3473f882001-02-23 17:55:21 +00003687
3688 /*
3689 * Lookup the info for that element.
3690 */
3691 info = htmlTagLookup(name);
3692 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003693 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3694 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003695 }
3696
3697 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003698 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003699 */
3700 if ((CUR == '/') && (NXT(1) == '>')) {
3701 SKIP(2);
3702 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3703 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003704 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003705 return;
3706 }
3707
3708 if (CUR == '>') {
3709 NEXT;
3710 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003711 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3712 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003713
3714 /*
3715 * end of parsing of this node.
3716 */
3717 if (xmlStrEqual(name, ctxt->name)) {
3718 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003719 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003720 }
3721
3722 /*
3723 * Capture end position and add node
3724 */
3725 if ( currentNode != NULL && ctxt->record_info ) {
3726 node_info.end_pos = ctxt->input->consumed +
3727 (CUR_PTR - ctxt->input->base);
3728 node_info.end_line = ctxt->input->line;
3729 node_info.node = ctxt->node;
3730 xmlParserAddNodeInfo(ctxt, &node_info);
3731 }
3732 return;
3733 }
3734
3735 /*
3736 * Check for an Empty Element from DTD definition
3737 */
3738 if ((info != NULL) && (info->empty)) {
3739 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3740 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003741 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003742 return;
3743 }
3744
3745 /*
3746 * Parse the content of the element:
3747 */
3748 currentNode = xmlStrdup(ctxt->name);
3749 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003750 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003751 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003752 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003753 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003754 if (ctxt->nameNr < depth) break;
3755 }
3756
Owen Taylor3473f882001-02-23 17:55:21 +00003757 /*
3758 * Capture end position and add node
3759 */
3760 if ( currentNode != NULL && ctxt->record_info ) {
3761 node_info.end_pos = ctxt->input->consumed +
3762 (CUR_PTR - ctxt->input->base);
3763 node_info.end_line = ctxt->input->line;
3764 node_info.node = ctxt->node;
3765 xmlParserAddNodeInfo(ctxt, &node_info);
3766 }
William M. Brack76e95df2003-10-18 16:20:14 +00003767 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003768 htmlAutoCloseOnEnd(ctxt);
3769 }
3770
Owen Taylor3473f882001-02-23 17:55:21 +00003771 if (currentNode != NULL)
3772 xmlFree(currentNode);
3773}
3774
3775/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003776 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003777 * @ctxt: an HTML parser context
3778 *
3779 * parse an HTML document (and build a tree if using the standard SAX
3780 * interface).
3781 *
3782 * Returns 0, -1 in case of error. the parser context is augmented
3783 * as a result of the parsing.
3784 */
3785
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003786int
Owen Taylor3473f882001-02-23 17:55:21 +00003787htmlParseDocument(htmlParserCtxtPtr ctxt) {
3788 xmlDtdPtr dtd;
3789
Daniel Veillardd0463562001-10-13 09:15:48 +00003790 xmlInitParser();
3791
Owen Taylor3473f882001-02-23 17:55:21 +00003792 htmlDefaultSAXHandlerInit();
3793 ctxt->html = 1;
3794
3795 GROW;
3796 /*
3797 * SAX: beginning of the document processing.
3798 */
3799 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3800 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3801
3802 /*
3803 * Wipe out everything which is before the first '<'
3804 */
3805 SKIP_BLANKS;
3806 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003807 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3808 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003809 }
3810
3811 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3812 ctxt->sax->startDocument(ctxt->userData);
3813
3814
3815 /*
3816 * Parse possible comments before any content
3817 */
3818 while ((CUR == '<') && (NXT(1) == '!') &&
3819 (NXT(2) == '-') && (NXT(3) == '-')) {
3820 htmlParseComment(ctxt);
3821 SKIP_BLANKS;
3822 }
3823
3824
3825 /*
3826 * Then possibly doc type declaration(s) and more Misc
3827 * (doctypedecl Misc*)?
3828 */
3829 if ((CUR == '<') && (NXT(1) == '!') &&
3830 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3831 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3832 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3833 (UPP(8) == 'E')) {
3834 htmlParseDocTypeDecl(ctxt);
3835 }
3836 SKIP_BLANKS;
3837
3838 /*
3839 * Parse possible comments before any content
3840 */
3841 while ((CUR == '<') && (NXT(1) == '!') &&
3842 (NXT(2) == '-') && (NXT(3) == '-')) {
3843 htmlParseComment(ctxt);
3844 SKIP_BLANKS;
3845 }
3846
3847 /*
3848 * Time to start parsing the tree itself
3849 */
3850 htmlParseContent(ctxt);
3851
3852 /*
3853 * autoclose
3854 */
3855 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003856 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003857
3858
3859 /*
3860 * SAX: end of the document processing.
3861 */
3862 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3863 ctxt->sax->endDocument(ctxt->userData);
3864
3865 if (ctxt->myDoc != NULL) {
3866 dtd = xmlGetIntSubset(ctxt->myDoc);
3867 if (dtd == NULL)
3868 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00003869 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00003870 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3871 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3872 }
3873 if (! ctxt->wellFormed) return(-1);
3874 return(0);
3875}
3876
3877
3878/************************************************************************
3879 * *
3880 * Parser contexts handling *
3881 * *
3882 ************************************************************************/
3883
3884/**
3885 * xmlInitParserCtxt:
3886 * @ctxt: an HTML parser context
3887 *
3888 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00003889 *
3890 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00003891 */
3892
Daniel Veillardf403d292003-10-05 13:51:35 +00003893static int
Owen Taylor3473f882001-02-23 17:55:21 +00003894htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3895{
3896 htmlSAXHandler *sax;
3897
Daniel Veillardf403d292003-10-05 13:51:35 +00003898 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003899 memset(ctxt, 0, sizeof(htmlParserCtxt));
3900
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003901 ctxt->dict = xmlDictCreate();
3902 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003903 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3904 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003905 }
Owen Taylor3473f882001-02-23 17:55:21 +00003906 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3907 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003908 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3909 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003910 }
3911 else
3912 memset(sax, 0, sizeof(htmlSAXHandler));
3913
3914 /* Allocate the Input stack */
3915 ctxt->inputTab = (htmlParserInputPtr *)
3916 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3917 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003918 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003919 ctxt->inputNr = 0;
3920 ctxt->inputMax = 0;
3921 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003922 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003923 }
3924 ctxt->inputNr = 0;
3925 ctxt->inputMax = 5;
3926 ctxt->input = NULL;
3927 ctxt->version = NULL;
3928 ctxt->encoding = NULL;
3929 ctxt->standalone = -1;
3930 ctxt->instate = XML_PARSER_START;
3931
3932 /* Allocate the Node stack */
3933 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3934 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003935 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003936 ctxt->nodeNr = 0;
3937 ctxt->nodeMax = 0;
3938 ctxt->node = NULL;
3939 ctxt->inputNr = 0;
3940 ctxt->inputMax = 0;
3941 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003942 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003943 }
3944 ctxt->nodeNr = 0;
3945 ctxt->nodeMax = 10;
3946 ctxt->node = NULL;
3947
3948 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003949 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003950 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003951 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003952 ctxt->nameNr = 0;
3953 ctxt->nameMax = 10;
3954 ctxt->name = NULL;
3955 ctxt->nodeNr = 0;
3956 ctxt->nodeMax = 0;
3957 ctxt->node = NULL;
3958 ctxt->inputNr = 0;
3959 ctxt->inputMax = 0;
3960 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003961 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003962 }
3963 ctxt->nameNr = 0;
3964 ctxt->nameMax = 10;
3965 ctxt->name = NULL;
3966
Daniel Veillard092643b2003-09-25 14:29:29 +00003967 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00003968 else {
3969 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00003970 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00003971 }
3972 ctxt->userData = ctxt;
3973 ctxt->myDoc = NULL;
3974 ctxt->wellFormed = 1;
3975 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003976 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003977 ctxt->html = 1;
3978 ctxt->record_info = 0;
3979 ctxt->validate = 0;
3980 ctxt->nbChars = 0;
3981 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003982 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003983 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00003984 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003985}
3986
3987/**
3988 * htmlFreeParserCtxt:
3989 * @ctxt: an HTML parser context
3990 *
3991 * Free all the memory used by a parser context. However the parsed
3992 * document in ctxt->myDoc is not freed.
3993 */
3994
3995void
3996htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3997{
3998 xmlFreeParserCtxt(ctxt);
3999}
4000
4001/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004002 * htmlNewParserCtxt:
4003 *
4004 * Allocate and initialize a new parser context.
4005 *
4006 * Returns the xmlParserCtxtPtr or NULL
4007 */
4008
4009static htmlParserCtxtPtr
4010htmlNewParserCtxt(void)
4011{
4012 xmlParserCtxtPtr ctxt;
4013
4014 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4015 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004016 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004017 return(NULL);
4018 }
4019 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004020 if (htmlInitParserCtxt(ctxt) < 0) {
4021 htmlFreeParserCtxt(ctxt);
4022 return(NULL);
4023 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004024 return(ctxt);
4025}
4026
4027/**
4028 * htmlCreateMemoryParserCtxt:
4029 * @buffer: a pointer to a char array
4030 * @size: the size of the array
4031 *
4032 * Create a parser context for an HTML in-memory document.
4033 *
4034 * Returns the new parser context or NULL
4035 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004036htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004037htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4038 xmlParserCtxtPtr ctxt;
4039 xmlParserInputPtr input;
4040 xmlParserInputBufferPtr buf;
4041
4042 if (buffer == NULL)
4043 return(NULL);
4044 if (size <= 0)
4045 return(NULL);
4046
4047 ctxt = htmlNewParserCtxt();
4048 if (ctxt == NULL)
4049 return(NULL);
4050
4051 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4052 if (buf == NULL) return(NULL);
4053
4054 input = xmlNewInputStream(ctxt);
4055 if (input == NULL) {
4056 xmlFreeParserCtxt(ctxt);
4057 return(NULL);
4058 }
4059
4060 input->filename = NULL;
4061 input->buf = buf;
4062 input->base = input->buf->buffer->content;
4063 input->cur = input->buf->buffer->content;
4064 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4065
4066 inputPush(ctxt, input);
4067 return(ctxt);
4068}
4069
4070/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004071 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004072 * @cur: a pointer to an array of xmlChar
4073 * @encoding: a free form C string describing the HTML document encoding, or NULL
4074 *
4075 * Create a parser context for an HTML document.
4076 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004077 * TODO: check the need to add encoding handling there
4078 *
Owen Taylor3473f882001-02-23 17:55:21 +00004079 * Returns the new parser context or NULL
4080 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004081static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004082htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004083 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004084 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004085
Daniel Veillard1d995272002-07-22 16:43:32 +00004086 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004087 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004088 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004089 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4090
4091 if (encoding != NULL) {
4092 xmlCharEncoding enc;
4093 xmlCharEncodingHandlerPtr handler;
4094
4095 if (ctxt->input->encoding != NULL)
4096 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004097 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004098
4099 enc = xmlParseCharEncoding(encoding);
4100 /*
4101 * registered set of known encodings
4102 */
4103 if (enc != XML_CHAR_ENCODING_ERROR) {
4104 xmlSwitchEncoding(ctxt, enc);
4105 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004106 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4107 "Unsupported encoding %s\n",
4108 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004109 }
4110 } else {
4111 /*
4112 * fallback for unknown encodings
4113 */
4114 handler = xmlFindCharEncodingHandler((const char *) encoding);
4115 if (handler != NULL) {
4116 xmlSwitchToEncoding(ctxt, handler);
4117 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004118 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4119 "Unsupported encoding %s\n",
4120 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004121 }
4122 }
4123 }
4124 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004125}
4126
Daniel Veillard73b013f2003-09-30 12:36:01 +00004127#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004128/************************************************************************
4129 * *
4130 * Progressive parsing interfaces *
4131 * *
4132 ************************************************************************/
4133
4134/**
4135 * htmlParseLookupSequence:
4136 * @ctxt: an HTML parser context
4137 * @first: the first char to lookup
4138 * @next: the next char to lookup or zero
4139 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004140 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004141 *
4142 * Try to find if a sequence (first, next, third) or just (first next) or
4143 * (first) is available in the input stream.
4144 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4145 * to avoid rescanning sequences of bytes, it DOES change the state of the
4146 * parser, do not use liberally.
4147 * This is basically similar to xmlParseLookupSequence()
4148 *
4149 * Returns the index to the current parsing point if the full sequence
4150 * is available, -1 otherwise.
4151 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004152static int
Owen Taylor3473f882001-02-23 17:55:21 +00004153htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004154 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004155 int base, len;
4156 htmlParserInputPtr in;
4157 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004158 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004159
4160 in = ctxt->input;
4161 if (in == NULL) return(-1);
4162 base = in->cur - in->base;
4163 if (base < 0) return(-1);
4164 if (ctxt->checkIndex > base)
4165 base = ctxt->checkIndex;
4166 if (in->buf == NULL) {
4167 buf = in->base;
4168 len = in->length;
4169 } else {
4170 buf = in->buf->buffer->content;
4171 len = in->buf->buffer->use;
4172 }
4173 /* take into account the sequence length */
4174 if (third) len -= 2;
4175 else if (next) len --;
4176 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004177 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004178 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4179 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4180 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004181 /* do not increment past <! - some people use <!--> */
4182 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004183 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004184 }
4185 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004186 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004187 return(-1);
4188 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4189 (buf[base + 2] == '>')) {
4190 incomment = 0;
4191 base += 2;
4192 }
4193 continue;
4194 }
Owen Taylor3473f882001-02-23 17:55:21 +00004195 if (buf[base] == first) {
4196 if (third != 0) {
4197 if ((buf[base + 1] != next) ||
4198 (buf[base + 2] != third)) continue;
4199 } else if (next != 0) {
4200 if (buf[base + 1] != next) continue;
4201 }
4202 ctxt->checkIndex = 0;
4203#ifdef DEBUG_PUSH
4204 if (next == 0)
4205 xmlGenericError(xmlGenericErrorContext,
4206 "HPP: lookup '%c' found at %d\n",
4207 first, base);
4208 else if (third == 0)
4209 xmlGenericError(xmlGenericErrorContext,
4210 "HPP: lookup '%c%c' found at %d\n",
4211 first, next, base);
4212 else
4213 xmlGenericError(xmlGenericErrorContext,
4214 "HPP: lookup '%c%c%c' found at %d\n",
4215 first, next, third, base);
4216#endif
4217 return(base - (in->cur - in->base));
4218 }
4219 }
4220 ctxt->checkIndex = base;
4221#ifdef DEBUG_PUSH
4222 if (next == 0)
4223 xmlGenericError(xmlGenericErrorContext,
4224 "HPP: lookup '%c' failed\n", first);
4225 else if (third == 0)
4226 xmlGenericError(xmlGenericErrorContext,
4227 "HPP: lookup '%c%c' failed\n", first, next);
4228 else
4229 xmlGenericError(xmlGenericErrorContext,
4230 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4231#endif
4232 return(-1);
4233}
4234
4235/**
4236 * htmlParseTryOrFinish:
4237 * @ctxt: an HTML parser context
4238 * @terminate: last chunk indicator
4239 *
4240 * Try to progress on parsing
4241 *
4242 * Returns zero if no parsing was possible
4243 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004244static int
Owen Taylor3473f882001-02-23 17:55:21 +00004245htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4246 int ret = 0;
4247 htmlParserInputPtr in;
4248 int avail = 0;
4249 xmlChar cur, next;
4250
4251#ifdef DEBUG_PUSH
4252 switch (ctxt->instate) {
4253 case XML_PARSER_EOF:
4254 xmlGenericError(xmlGenericErrorContext,
4255 "HPP: try EOF\n"); break;
4256 case XML_PARSER_START:
4257 xmlGenericError(xmlGenericErrorContext,
4258 "HPP: try START\n"); break;
4259 case XML_PARSER_MISC:
4260 xmlGenericError(xmlGenericErrorContext,
4261 "HPP: try MISC\n");break;
4262 case XML_PARSER_COMMENT:
4263 xmlGenericError(xmlGenericErrorContext,
4264 "HPP: try COMMENT\n");break;
4265 case XML_PARSER_PROLOG:
4266 xmlGenericError(xmlGenericErrorContext,
4267 "HPP: try PROLOG\n");break;
4268 case XML_PARSER_START_TAG:
4269 xmlGenericError(xmlGenericErrorContext,
4270 "HPP: try START_TAG\n");break;
4271 case XML_PARSER_CONTENT:
4272 xmlGenericError(xmlGenericErrorContext,
4273 "HPP: try CONTENT\n");break;
4274 case XML_PARSER_CDATA_SECTION:
4275 xmlGenericError(xmlGenericErrorContext,
4276 "HPP: try CDATA_SECTION\n");break;
4277 case XML_PARSER_END_TAG:
4278 xmlGenericError(xmlGenericErrorContext,
4279 "HPP: try END_TAG\n");break;
4280 case XML_PARSER_ENTITY_DECL:
4281 xmlGenericError(xmlGenericErrorContext,
4282 "HPP: try ENTITY_DECL\n");break;
4283 case XML_PARSER_ENTITY_VALUE:
4284 xmlGenericError(xmlGenericErrorContext,
4285 "HPP: try ENTITY_VALUE\n");break;
4286 case XML_PARSER_ATTRIBUTE_VALUE:
4287 xmlGenericError(xmlGenericErrorContext,
4288 "HPP: try ATTRIBUTE_VALUE\n");break;
4289 case XML_PARSER_DTD:
4290 xmlGenericError(xmlGenericErrorContext,
4291 "HPP: try DTD\n");break;
4292 case XML_PARSER_EPILOG:
4293 xmlGenericError(xmlGenericErrorContext,
4294 "HPP: try EPILOG\n");break;
4295 case XML_PARSER_PI:
4296 xmlGenericError(xmlGenericErrorContext,
4297 "HPP: try PI\n");break;
4298 case XML_PARSER_SYSTEM_LITERAL:
4299 xmlGenericError(xmlGenericErrorContext,
4300 "HPP: try SYSTEM_LITERAL\n");break;
4301 }
4302#endif
4303
4304 while (1) {
4305
4306 in = ctxt->input;
4307 if (in == NULL) break;
4308 if (in->buf == NULL)
4309 avail = in->length - (in->cur - in->base);
4310 else
4311 avail = in->buf->buffer->use - (in->cur - in->base);
4312 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004313 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004314 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4315 /*
4316 * SAX: end of the document processing.
4317 */
4318 ctxt->instate = XML_PARSER_EOF;
4319 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4320 ctxt->sax->endDocument(ctxt->userData);
4321 }
4322 }
4323 if (avail < 1)
4324 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004325 cur = in->cur[0];
4326 if (cur == 0) {
4327 SKIP(1);
4328 continue;
4329 }
4330
Owen Taylor3473f882001-02-23 17:55:21 +00004331 switch (ctxt->instate) {
4332 case XML_PARSER_EOF:
4333 /*
4334 * Document parsing is done !
4335 */
4336 goto done;
4337 case XML_PARSER_START:
4338 /*
4339 * Very first chars read from the document flow.
4340 */
4341 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004342 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004343 SKIP_BLANKS;
4344 if (in->buf == NULL)
4345 avail = in->length - (in->cur - in->base);
4346 else
4347 avail = in->buf->buffer->use - (in->cur - in->base);
4348 }
4349 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4350 ctxt->sax->setDocumentLocator(ctxt->userData,
4351 &xmlDefaultSAXLocator);
4352 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4353 (!ctxt->disableSAX))
4354 ctxt->sax->startDocument(ctxt->userData);
4355
4356 cur = in->cur[0];
4357 next = in->cur[1];
4358 if ((cur == '<') && (next == '!') &&
4359 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4360 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4361 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4362 (UPP(8) == 'E')) {
4363 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004364 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004365 goto done;
4366#ifdef DEBUG_PUSH
4367 xmlGenericError(xmlGenericErrorContext,
4368 "HPP: Parsing internal subset\n");
4369#endif
4370 htmlParseDocTypeDecl(ctxt);
4371 ctxt->instate = XML_PARSER_PROLOG;
4372#ifdef DEBUG_PUSH
4373 xmlGenericError(xmlGenericErrorContext,
4374 "HPP: entering PROLOG\n");
4375#endif
4376 } else {
4377 ctxt->instate = XML_PARSER_MISC;
4378 }
4379#ifdef DEBUG_PUSH
4380 xmlGenericError(xmlGenericErrorContext,
4381 "HPP: entering MISC\n");
4382#endif
4383 break;
4384 case XML_PARSER_MISC:
4385 SKIP_BLANKS;
4386 if (in->buf == NULL)
4387 avail = in->length - (in->cur - in->base);
4388 else
4389 avail = in->buf->buffer->use - (in->cur - in->base);
4390 if (avail < 2)
4391 goto done;
4392 cur = in->cur[0];
4393 next = in->cur[1];
4394 if ((cur == '<') && (next == '!') &&
4395 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4396 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004397 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004398 goto done;
4399#ifdef DEBUG_PUSH
4400 xmlGenericError(xmlGenericErrorContext,
4401 "HPP: Parsing Comment\n");
4402#endif
4403 htmlParseComment(ctxt);
4404 ctxt->instate = XML_PARSER_MISC;
4405 } else if ((cur == '<') && (next == '!') &&
4406 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4407 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4408 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4409 (UPP(8) == 'E')) {
4410 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004411 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004412 goto done;
4413#ifdef DEBUG_PUSH
4414 xmlGenericError(xmlGenericErrorContext,
4415 "HPP: Parsing internal subset\n");
4416#endif
4417 htmlParseDocTypeDecl(ctxt);
4418 ctxt->instate = XML_PARSER_PROLOG;
4419#ifdef DEBUG_PUSH
4420 xmlGenericError(xmlGenericErrorContext,
4421 "HPP: entering PROLOG\n");
4422#endif
4423 } else if ((cur == '<') && (next == '!') &&
4424 (avail < 9)) {
4425 goto done;
4426 } else {
4427 ctxt->instate = XML_PARSER_START_TAG;
4428#ifdef DEBUG_PUSH
4429 xmlGenericError(xmlGenericErrorContext,
4430 "HPP: entering START_TAG\n");
4431#endif
4432 }
4433 break;
4434 case XML_PARSER_PROLOG:
4435 SKIP_BLANKS;
4436 if (in->buf == NULL)
4437 avail = in->length - (in->cur - in->base);
4438 else
4439 avail = in->buf->buffer->use - (in->cur - in->base);
4440 if (avail < 2)
4441 goto done;
4442 cur = in->cur[0];
4443 next = in->cur[1];
4444 if ((cur == '<') && (next == '!') &&
4445 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4446 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004447 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004448 goto done;
4449#ifdef DEBUG_PUSH
4450 xmlGenericError(xmlGenericErrorContext,
4451 "HPP: Parsing Comment\n");
4452#endif
4453 htmlParseComment(ctxt);
4454 ctxt->instate = XML_PARSER_PROLOG;
4455 } else if ((cur == '<') && (next == '!') &&
4456 (avail < 4)) {
4457 goto done;
4458 } else {
4459 ctxt->instate = XML_PARSER_START_TAG;
4460#ifdef DEBUG_PUSH
4461 xmlGenericError(xmlGenericErrorContext,
4462 "HPP: entering START_TAG\n");
4463#endif
4464 }
4465 break;
4466 case XML_PARSER_EPILOG:
4467 if (in->buf == NULL)
4468 avail = in->length - (in->cur - in->base);
4469 else
4470 avail = in->buf->buffer->use - (in->cur - in->base);
4471 if (avail < 1)
4472 goto done;
4473 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004474 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004475 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004476 goto done;
4477 }
4478 if (avail < 2)
4479 goto done;
4480 next = in->cur[1];
4481 if ((cur == '<') && (next == '!') &&
4482 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4483 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004484 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004485 goto done;
4486#ifdef DEBUG_PUSH
4487 xmlGenericError(xmlGenericErrorContext,
4488 "HPP: Parsing Comment\n");
4489#endif
4490 htmlParseComment(ctxt);
4491 ctxt->instate = XML_PARSER_EPILOG;
4492 } else if ((cur == '<') && (next == '!') &&
4493 (avail < 4)) {
4494 goto done;
4495 } else {
4496 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004497 ctxt->wellFormed = 0;
4498 ctxt->instate = XML_PARSER_EOF;
4499#ifdef DEBUG_PUSH
4500 xmlGenericError(xmlGenericErrorContext,
4501 "HPP: entering EOF\n");
4502#endif
4503 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4504 ctxt->sax->endDocument(ctxt->userData);
4505 goto done;
4506 }
4507 break;
4508 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004509 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004510 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004511 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004512
4513 if (avail < 2)
4514 goto done;
4515 cur = in->cur[0];
4516 if (cur != '<') {
4517 ctxt->instate = XML_PARSER_CONTENT;
4518#ifdef DEBUG_PUSH
4519 xmlGenericError(xmlGenericErrorContext,
4520 "HPP: entering CONTENT\n");
4521#endif
4522 break;
4523 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004524 if (in->cur[1] == '/') {
4525 ctxt->instate = XML_PARSER_END_TAG;
4526 ctxt->checkIndex = 0;
4527#ifdef DEBUG_PUSH
4528 xmlGenericError(xmlGenericErrorContext,
4529 "HPP: entering END_TAG\n");
4530#endif
4531 break;
4532 }
Owen Taylor3473f882001-02-23 17:55:21 +00004533 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004534 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004535 goto done;
4536
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004537 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004538 htmlParseStartTag(ctxt);
4539 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004540 if (((depth == ctxt->nameNr) &&
4541 (xmlStrEqual(oldname, ctxt->name))) ||
4542 (name == NULL)) {
4543 if (CUR == '>')
4544 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004545 break;
4546 }
Owen Taylor3473f882001-02-23 17:55:21 +00004547
4548 /*
4549 * Lookup the info for that element.
4550 */
4551 info = htmlTagLookup(name);
4552 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004553 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4554 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004555 }
4556
4557 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004558 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004559 */
4560 if ((CUR == '/') && (NXT(1) == '>')) {
4561 SKIP(2);
4562 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4563 ctxt->sax->endElement(ctxt->userData, name);
4564 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004565 ctxt->instate = XML_PARSER_CONTENT;
4566#ifdef DEBUG_PUSH
4567 xmlGenericError(xmlGenericErrorContext,
4568 "HPP: entering CONTENT\n");
4569#endif
4570 break;
4571 }
4572
4573 if (CUR == '>') {
4574 NEXT;
4575 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004576 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4577 "Couldn't find end of Start Tag %s\n",
4578 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004579
4580 /*
4581 * end of parsing of this node.
4582 */
4583 if (xmlStrEqual(name, ctxt->name)) {
4584 nodePop(ctxt);
4585 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004586 }
4587
4588 ctxt->instate = XML_PARSER_CONTENT;
4589#ifdef DEBUG_PUSH
4590 xmlGenericError(xmlGenericErrorContext,
4591 "HPP: entering CONTENT\n");
4592#endif
4593 break;
4594 }
4595
4596 /*
4597 * Check for an Empty Element from DTD definition
4598 */
4599 if ((info != NULL) && (info->empty)) {
4600 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4601 ctxt->sax->endElement(ctxt->userData, name);
4602 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004603 }
4604 ctxt->instate = XML_PARSER_CONTENT;
4605#ifdef DEBUG_PUSH
4606 xmlGenericError(xmlGenericErrorContext,
4607 "HPP: entering CONTENT\n");
4608#endif
4609 break;
4610 }
4611 case XML_PARSER_CONTENT: {
4612 long cons;
4613 /*
4614 * Handle preparsed entities and charRef
4615 */
4616 if (ctxt->token != 0) {
4617 xmlChar chr[2] = { 0 , 0 } ;
4618
4619 chr[0] = (xmlChar) ctxt->token;
4620 htmlCheckParagraph(ctxt);
4621 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4622 ctxt->sax->characters(ctxt->userData, chr, 1);
4623 ctxt->token = 0;
4624 ctxt->checkIndex = 0;
4625 }
4626 if ((avail == 1) && (terminate)) {
4627 cur = in->cur[0];
4628 if ((cur != '<') && (cur != '&')) {
4629 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004630 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004631 if (ctxt->sax->ignorableWhitespace != NULL)
4632 ctxt->sax->ignorableWhitespace(
4633 ctxt->userData, &cur, 1);
4634 } else {
4635 htmlCheckParagraph(ctxt);
4636 if (ctxt->sax->characters != NULL)
4637 ctxt->sax->characters(
4638 ctxt->userData, &cur, 1);
4639 }
4640 }
4641 ctxt->token = 0;
4642 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004643 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004644 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004645 }
Owen Taylor3473f882001-02-23 17:55:21 +00004646 }
4647 if (avail < 2)
4648 goto done;
4649 cur = in->cur[0];
4650 next = in->cur[1];
4651 cons = ctxt->nbChars;
4652 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4653 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4654 /*
4655 * Handle SCRIPT/STYLE separately
4656 */
4657 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004658 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004659 goto done;
4660 htmlParseScript(ctxt);
4661 if ((cur == '<') && (next == '/')) {
4662 ctxt->instate = XML_PARSER_END_TAG;
4663 ctxt->checkIndex = 0;
4664#ifdef DEBUG_PUSH
4665 xmlGenericError(xmlGenericErrorContext,
4666 "HPP: entering END_TAG\n");
4667#endif
4668 break;
4669 }
4670 } else {
4671 /*
4672 * Sometimes DOCTYPE arrives in the middle of the document
4673 */
4674 if ((cur == '<') && (next == '!') &&
4675 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4676 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4677 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4678 (UPP(8) == 'E')) {
4679 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004680 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004681 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004682 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4683 "Misplaced DOCTYPE declaration\n",
4684 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004685 htmlParseDocTypeDecl(ctxt);
4686 } else if ((cur == '<') && (next == '!') &&
4687 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4688 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004689 (htmlParseLookupSequence(
4690 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004691 goto done;
4692#ifdef DEBUG_PUSH
4693 xmlGenericError(xmlGenericErrorContext,
4694 "HPP: Parsing Comment\n");
4695#endif
4696 htmlParseComment(ctxt);
4697 ctxt->instate = XML_PARSER_CONTENT;
4698 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4699 goto done;
4700 } else if ((cur == '<') && (next == '/')) {
4701 ctxt->instate = XML_PARSER_END_TAG;
4702 ctxt->checkIndex = 0;
4703#ifdef DEBUG_PUSH
4704 xmlGenericError(xmlGenericErrorContext,
4705 "HPP: entering END_TAG\n");
4706#endif
4707 break;
4708 } else if (cur == '<') {
4709 ctxt->instate = XML_PARSER_START_TAG;
4710 ctxt->checkIndex = 0;
4711#ifdef DEBUG_PUSH
4712 xmlGenericError(xmlGenericErrorContext,
4713 "HPP: entering START_TAG\n");
4714#endif
4715 break;
4716 } else if (cur == '&') {
4717 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004718 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004719 goto done;
4720#ifdef DEBUG_PUSH
4721 xmlGenericError(xmlGenericErrorContext,
4722 "HPP: Parsing Reference\n");
4723#endif
4724 /* TODO: check generation of subtrees if noent !!! */
4725 htmlParseReference(ctxt);
4726 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004727 /*
4728 * check that the text sequence is complete
4729 * before handing out the data to the parser
4730 * to avoid problems with erroneous end of
4731 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004732 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004733 if ((!terminate) &&
4734 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4735 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004736 ctxt->checkIndex = 0;
4737#ifdef DEBUG_PUSH
4738 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: Parsing char data\n");
4740#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004741 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004742 }
4743 }
4744 if (cons == ctxt->nbChars) {
4745 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004746 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4747 "detected an error in element content\n",
4748 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004749 }
4750 NEXT;
4751 break;
4752 }
4753
4754 break;
4755 }
4756 case XML_PARSER_END_TAG:
4757 if (avail < 2)
4758 goto done;
4759 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004760 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004761 goto done;
4762 htmlParseEndTag(ctxt);
4763 if (ctxt->nameNr == 0) {
4764 ctxt->instate = XML_PARSER_EPILOG;
4765 } else {
4766 ctxt->instate = XML_PARSER_CONTENT;
4767 }
4768 ctxt->checkIndex = 0;
4769#ifdef DEBUG_PUSH
4770 xmlGenericError(xmlGenericErrorContext,
4771 "HPP: entering CONTENT\n");
4772#endif
4773 break;
4774 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00004775 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4776 "HPP: internal error, state == CDATA\n",
4777 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004778 ctxt->instate = XML_PARSER_CONTENT;
4779 ctxt->checkIndex = 0;
4780#ifdef DEBUG_PUSH
4781 xmlGenericError(xmlGenericErrorContext,
4782 "HPP: entering CONTENT\n");
4783#endif
4784 break;
4785 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00004786 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4787 "HPP: internal error, state == DTD\n",
4788 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004789 ctxt->instate = XML_PARSER_CONTENT;
4790 ctxt->checkIndex = 0;
4791#ifdef DEBUG_PUSH
4792 xmlGenericError(xmlGenericErrorContext,
4793 "HPP: entering CONTENT\n");
4794#endif
4795 break;
4796 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00004797 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4798 "HPP: internal error, state == COMMENT\n",
4799 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004800 ctxt->instate = XML_PARSER_CONTENT;
4801 ctxt->checkIndex = 0;
4802#ifdef DEBUG_PUSH
4803 xmlGenericError(xmlGenericErrorContext,
4804 "HPP: entering CONTENT\n");
4805#endif
4806 break;
4807 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00004808 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4809 "HPP: internal error, state == PI\n",
4810 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004811 ctxt->instate = XML_PARSER_CONTENT;
4812 ctxt->checkIndex = 0;
4813#ifdef DEBUG_PUSH
4814 xmlGenericError(xmlGenericErrorContext,
4815 "HPP: entering CONTENT\n");
4816#endif
4817 break;
4818 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004819 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4820 "HPP: internal error, state == ENTITY_DECL\n",
4821 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004822 ctxt->instate = XML_PARSER_CONTENT;
4823 ctxt->checkIndex = 0;
4824#ifdef DEBUG_PUSH
4825 xmlGenericError(xmlGenericErrorContext,
4826 "HPP: entering CONTENT\n");
4827#endif
4828 break;
4829 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004830 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4831 "HPP: internal error, state == ENTITY_VALUE\n",
4832 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004833 ctxt->instate = XML_PARSER_CONTENT;
4834 ctxt->checkIndex = 0;
4835#ifdef DEBUG_PUSH
4836 xmlGenericError(xmlGenericErrorContext,
4837 "HPP: entering DTD\n");
4838#endif
4839 break;
4840 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004841 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4843 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004844 ctxt->instate = XML_PARSER_START_TAG;
4845 ctxt->checkIndex = 0;
4846#ifdef DEBUG_PUSH
4847 xmlGenericError(xmlGenericErrorContext,
4848 "HPP: entering START_TAG\n");
4849#endif
4850 break;
4851 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004852 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4853 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4854 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004855 ctxt->instate = XML_PARSER_CONTENT;
4856 ctxt->checkIndex = 0;
4857#ifdef DEBUG_PUSH
4858 xmlGenericError(xmlGenericErrorContext,
4859 "HPP: entering CONTENT\n");
4860#endif
4861 break;
4862 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004863 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4864 "HPP: internal error, state == XML_PARSER_IGNORE\n",
4865 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004866 ctxt->instate = XML_PARSER_CONTENT;
4867 ctxt->checkIndex = 0;
4868#ifdef DEBUG_PUSH
4869 xmlGenericError(xmlGenericErrorContext,
4870 "HPP: entering CONTENT\n");
4871#endif
4872 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004873 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004874 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4875 "HPP: internal error, state == XML_PARSER_LITERAL\n",
4876 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004877 ctxt->instate = XML_PARSER_CONTENT;
4878 ctxt->checkIndex = 0;
4879#ifdef DEBUG_PUSH
4880 xmlGenericError(xmlGenericErrorContext,
4881 "HPP: entering CONTENT\n");
4882#endif
4883 break;
4884
Owen Taylor3473f882001-02-23 17:55:21 +00004885 }
4886 }
4887done:
4888 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004889 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004890 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4891 /*
4892 * SAX: end of the document processing.
4893 */
4894 ctxt->instate = XML_PARSER_EOF;
4895 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4896 ctxt->sax->endDocument(ctxt->userData);
4897 }
4898 }
4899 if ((ctxt->myDoc != NULL) &&
4900 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4901 (ctxt->instate == XML_PARSER_EPILOG))) {
4902 xmlDtdPtr dtd;
4903 dtd = xmlGetIntSubset(ctxt->myDoc);
4904 if (dtd == NULL)
4905 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004906 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004907 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4908 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4909 }
4910#ifdef DEBUG_PUSH
4911 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4912#endif
4913 return(ret);
4914}
4915
4916/**
Owen Taylor3473f882001-02-23 17:55:21 +00004917 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00004918 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004919 * @chunk: an char array
4920 * @size: the size in byte of the chunk
4921 * @terminate: last chunk indicator
4922 *
4923 * Parse a Chunk of memory
4924 *
4925 * Returns zero if no error, the xmlParserErrors otherwise.
4926 */
4927int
4928htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4929 int terminate) {
4930 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4931 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4932 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4933 int cur = ctxt->input->cur - ctxt->input->base;
4934
4935 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4936 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4937 ctxt->input->cur = ctxt->input->base + cur;
4938#ifdef DEBUG_PUSH
4939 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4940#endif
4941
Daniel Veillard14f752c2003-08-09 11:44:50 +00004942#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00004943 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4944 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004945#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004946 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004947 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
4948 xmlParserInputBufferPtr in = ctxt->input->buf;
4949 if ((in->encoder != NULL) && (in->buffer != NULL) &&
4950 (in->raw != NULL)) {
4951 int nbchars;
4952
4953 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
4954 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004955 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
4956 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004957 return(XML_ERR_INVALID_ENCODING);
4958 }
4959 }
4960 }
Owen Taylor3473f882001-02-23 17:55:21 +00004961 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00004962 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00004963 if (terminate) {
4964 if ((ctxt->instate != XML_PARSER_EOF) &&
4965 (ctxt->instate != XML_PARSER_EPILOG) &&
4966 (ctxt->instate != XML_PARSER_MISC)) {
4967 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004968 ctxt->wellFormed = 0;
4969 }
4970 if (ctxt->instate != XML_PARSER_EOF) {
4971 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4972 ctxt->sax->endDocument(ctxt->userData);
4973 }
4974 ctxt->instate = XML_PARSER_EOF;
4975 }
4976 return((xmlParserErrors) ctxt->errNo);
4977}
Daniel Veillard73b013f2003-09-30 12:36:01 +00004978#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00004979
4980/************************************************************************
4981 * *
4982 * User entry points *
4983 * *
4984 ************************************************************************/
4985
4986/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004987 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004988 * @sax: a SAX handler
4989 * @user_data: The user data returned on SAX callbacks
4990 * @chunk: a pointer to an array of chars
4991 * @size: number of chars in the array
4992 * @filename: an optional file name or URI
4993 * @enc: an optional encoding
4994 *
4995 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00004996 * The value of @filename is used for fetching external entities
4997 * and error/warning reports.
4998 *
4999 * Returns the new parser context or NULL
5000 */
5001htmlParserCtxtPtr
5002htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5003 const char *chunk, int size, const char *filename,
5004 xmlCharEncoding enc) {
5005 htmlParserCtxtPtr ctxt;
5006 htmlParserInputPtr inputStream;
5007 xmlParserInputBufferPtr buf;
5008
Daniel Veillardd0463562001-10-13 09:15:48 +00005009 xmlInitParser();
5010
Owen Taylor3473f882001-02-23 17:55:21 +00005011 buf = xmlAllocParserInputBuffer(enc);
5012 if (buf == NULL) return(NULL);
5013
Daniel Veillardf403d292003-10-05 13:51:35 +00005014 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005015 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005016 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005017 return(NULL);
5018 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005019 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5020 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005021 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005022 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005023 xmlFree(ctxt->sax);
5024 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5025 if (ctxt->sax == NULL) {
5026 xmlFree(buf);
5027 xmlFree(ctxt);
5028 return(NULL);
5029 }
5030 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5031 if (user_data != NULL)
5032 ctxt->userData = user_data;
5033 }
5034 if (filename == NULL) {
5035 ctxt->directory = NULL;
5036 } else {
5037 ctxt->directory = xmlParserGetDirectory(filename);
5038 }
5039
5040 inputStream = htmlNewInputStream(ctxt);
5041 if (inputStream == NULL) {
5042 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005043 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005044 return(NULL);
5045 }
5046
5047 if (filename == NULL)
5048 inputStream->filename = NULL;
5049 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005050 inputStream->filename = (char *)
5051 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005052 inputStream->buf = buf;
5053 inputStream->base = inputStream->buf->buffer->content;
5054 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005055 inputStream->end =
5056 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005057
5058 inputPush(ctxt, inputStream);
5059
5060 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5061 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005062 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5063 int cur = ctxt->input->cur - ctxt->input->base;
5064
Owen Taylor3473f882001-02-23 17:55:21 +00005065 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005066
5067 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5068 ctxt->input->cur = ctxt->input->base + cur;
5069 ctxt->input->end =
5070 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005071#ifdef DEBUG_PUSH
5072 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5073#endif
5074 }
5075
5076 return(ctxt);
5077}
5078
5079/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005080 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005081 * @cur: a pointer to an array of xmlChar
5082 * @encoding: a free form C string describing the HTML document encoding, or NULL
5083 * @sax: the SAX handler block
5084 * @userData: if using SAX, this pointer will be provided on callbacks.
5085 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005086 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5087 * to handle parse events. If sax is NULL, fallback to the default DOM
5088 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005089 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005090 * Returns the resulting document tree unless SAX is NULL or the document is
5091 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005092 */
5093
5094htmlDocPtr
5095htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5096 htmlDocPtr ret;
5097 htmlParserCtxtPtr ctxt;
5098
Daniel Veillardd0463562001-10-13 09:15:48 +00005099 xmlInitParser();
5100
Owen Taylor3473f882001-02-23 17:55:21 +00005101 if (cur == NULL) return(NULL);
5102
5103
5104 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5105 if (ctxt == NULL) return(NULL);
5106 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005107 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005108 ctxt->sax = sax;
5109 ctxt->userData = userData;
5110 }
5111
5112 htmlParseDocument(ctxt);
5113 ret = ctxt->myDoc;
5114 if (sax != NULL) {
5115 ctxt->sax = NULL;
5116 ctxt->userData = NULL;
5117 }
5118 htmlFreeParserCtxt(ctxt);
5119
5120 return(ret);
5121}
5122
5123/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005124 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005125 * @cur: a pointer to an array of xmlChar
5126 * @encoding: a free form C string describing the HTML document encoding, or NULL
5127 *
5128 * parse an HTML in-memory document and build a tree.
5129 *
5130 * Returns the resulting document tree
5131 */
5132
5133htmlDocPtr
5134htmlParseDoc(xmlChar *cur, const char *encoding) {
5135 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5136}
5137
5138
5139/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005140 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005141 * @filename: the filename
5142 * @encoding: a free form C string describing the HTML document encoding, or NULL
5143 *
5144 * Create a parser context for a file content.
5145 * Automatic support for ZLIB/Compress compressed document is provided
5146 * by default if found at compile-time.
5147 *
5148 * Returns the new parser context or NULL
5149 */
5150htmlParserCtxtPtr
5151htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5152{
5153 htmlParserCtxtPtr ctxt;
5154 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005155 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005156 /* htmlCharEncoding enc; */
5157 xmlChar *content, *content_line = (xmlChar *) "charset=";
5158
Daniel Veillardf403d292003-10-05 13:51:35 +00005159 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005160 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005161 return(NULL);
5162 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005163 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5164 if (canonicFilename == NULL) {
5165 if (xmlDefaultSAXHandler.error != NULL) {
5166 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5167 }
Daniel Veillard104caa32003-05-13 22:54:05 +00005168 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005169 return(NULL);
5170 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005171
5172 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5173 xmlFree(canonicFilename);
5174 if (inputStream == NULL) {
5175 xmlFreeParserCtxt(ctxt);
5176 return(NULL);
5177 }
Owen Taylor3473f882001-02-23 17:55:21 +00005178
5179 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005180
Owen Taylor3473f882001-02-23 17:55:21 +00005181 /* set encoding */
5182 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005183 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005184 if (content) {
5185 strcpy ((char *)content, (char *)content_line);
5186 strcat ((char *)content, (char *)encoding);
5187 htmlCheckEncoding (ctxt, content);
5188 xmlFree (content);
5189 }
5190 }
5191
5192 return(ctxt);
5193}
5194
5195/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005196 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005197 * @filename: the filename
5198 * @encoding: a free form C string describing the HTML document encoding, or NULL
5199 * @sax: the SAX handler block
5200 * @userData: if using SAX, this pointer will be provided on callbacks.
5201 *
5202 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5203 * compressed document is provided by default if found at compile-time.
5204 * It use the given SAX function block to handle the parsing callback.
5205 * If sax is NULL, fallback to the default DOM tree building routines.
5206 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005207 * Returns the resulting document tree unless SAX is NULL or the document is
5208 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005209 */
5210
5211htmlDocPtr
5212htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5213 void *userData) {
5214 htmlDocPtr ret;
5215 htmlParserCtxtPtr ctxt;
5216 htmlSAXHandlerPtr oldsax = NULL;
5217
Daniel Veillardd0463562001-10-13 09:15:48 +00005218 xmlInitParser();
5219
Owen Taylor3473f882001-02-23 17:55:21 +00005220 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5221 if (ctxt == NULL) return(NULL);
5222 if (sax != NULL) {
5223 oldsax = ctxt->sax;
5224 ctxt->sax = sax;
5225 ctxt->userData = userData;
5226 }
5227
5228 htmlParseDocument(ctxt);
5229
5230 ret = ctxt->myDoc;
5231 if (sax != NULL) {
5232 ctxt->sax = oldsax;
5233 ctxt->userData = NULL;
5234 }
5235 htmlFreeParserCtxt(ctxt);
5236
5237 return(ret);
5238}
5239
5240/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005241 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005242 * @filename: the filename
5243 * @encoding: a free form C string describing the HTML document encoding, or NULL
5244 *
5245 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5246 * compressed document is provided by default if found at compile-time.
5247 *
5248 * Returns the resulting document tree
5249 */
5250
5251htmlDocPtr
5252htmlParseFile(const char *filename, const char *encoding) {
5253 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5254}
5255
5256/**
5257 * htmlHandleOmittedElem:
5258 * @val: int 0 or 1
5259 *
5260 * Set and return the previous value for handling HTML omitted tags.
5261 *
5262 * Returns the last value for 0 for no handling, 1 for auto insertion.
5263 */
5264
5265int
5266htmlHandleOmittedElem(int val) {
5267 int old = htmlOmittedDefaultValue;
5268
5269 htmlOmittedDefaultValue = val;
5270 return(old);
5271}
5272
Daniel Veillard930dfb62003-02-05 10:17:38 +00005273/**
5274 * htmlElementAllowedHere:
5275 * @parent: HTML parent element
5276 * @elt: HTML element
5277 *
5278 * Checks whether an HTML element may be a direct child of a parent element.
5279 * Note - doesn't check for deprecated elements
5280 *
5281 * Returns 1 if allowed; 0 otherwise.
5282 */
5283int
5284htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5285 const char** p ;
5286
5287 if ( ! elt || ! parent || ! parent->subelts )
5288 return 0 ;
5289
5290 for ( p = parent->subelts; *p; ++p )
5291 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5292 return 1 ;
5293
5294 return 0 ;
5295}
5296/**
5297 * htmlElementStatusHere:
5298 * @parent: HTML parent element
5299 * @elt: HTML element
5300 *
5301 * Checks whether an HTML element may be a direct child of a parent element.
5302 * and if so whether it is valid or deprecated.
5303 *
5304 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5305 */
5306htmlStatus
5307htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5308 if ( ! parent || ! elt )
5309 return HTML_INVALID ;
5310 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5311 return HTML_INVALID ;
5312
5313 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5314}
5315/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005316 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005317 * @elt: HTML element
5318 * @attr: HTML attribute
5319 * @legacy: whether to allow deprecated attributes
5320 *
5321 * Checks whether an attribute is valid for an element
5322 * Has full knowledge of Required and Deprecated attributes
5323 *
5324 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5325 */
5326htmlStatus
5327htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5328 const char** p ;
5329
5330 if ( !elt || ! attr )
5331 return HTML_INVALID ;
5332
5333 if ( elt->attrs_req )
5334 for ( p = elt->attrs_req; *p; ++p)
5335 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5336 return HTML_REQUIRED ;
5337
5338 if ( elt->attrs_opt )
5339 for ( p = elt->attrs_opt; *p; ++p)
5340 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5341 return HTML_VALID ;
5342
5343 if ( legacy && elt->attrs_depr )
5344 for ( p = elt->attrs_depr; *p; ++p)
5345 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5346 return HTML_DEPRECATED ;
5347
5348 return HTML_INVALID ;
5349}
5350/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005351 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005352 * @node: an htmlNodePtr in a tree
5353 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005354 * for Element nodes)
5355 *
5356 * Checks whether the tree node is valid. Experimental (the author
5357 * only uses the HTML enhancements in a SAX parser)
5358 *
5359 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5360 * legacy allowed) or htmlElementStatusHere (otherwise).
5361 * for Attribute nodes, a return from htmlAttrAllowed
5362 * for other nodes, HTML_NA (no checks performed)
5363 */
5364htmlStatus
5365htmlNodeStatus(const htmlNodePtr node, int legacy) {
5366 if ( ! node )
5367 return HTML_INVALID ;
5368
5369 switch ( node->type ) {
5370 case XML_ELEMENT_NODE:
5371 return legacy
5372 ? ( htmlElementAllowedHere (
5373 htmlTagLookup(node->parent->name) , node->name
5374 ) ? HTML_VALID : HTML_INVALID )
5375 : htmlElementStatusHere(
5376 htmlTagLookup(node->parent->name) ,
5377 htmlTagLookup(node->name) )
5378 ;
5379 case XML_ATTRIBUTE_NODE:
5380 return htmlAttrAllowed(
5381 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5382 default: return HTML_NA ;
5383 }
5384}
Daniel Veillard9475a352003-09-26 12:47:50 +00005385/************************************************************************
5386 * *
5387 * New set (2.6.0) of simpler and more flexible APIs *
5388 * *
5389 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is used.  @str is evaluated more than once, so it must not have side
 * effects.  The body is wrapped in do { } while (0) so that the macro
 * behaves as a single statement and must be followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
5401
5402/**
5403 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005404 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005405 *
5406 * Reset a parser context
5407 */
5408void
5409htmlCtxtReset(htmlParserCtxtPtr ctxt)
5410{
5411 xmlParserInputPtr input;
5412 xmlDictPtr dict = ctxt->dict;
5413
5414 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5415 xmlFreeInputStream(input);
5416 }
5417 ctxt->inputNr = 0;
5418 ctxt->input = NULL;
5419
5420 ctxt->spaceNr = 0;
5421 ctxt->spaceTab[0] = -1;
5422 ctxt->space = &ctxt->spaceTab[0];
5423
5424
5425 ctxt->nodeNr = 0;
5426 ctxt->node = NULL;
5427
5428 ctxt->nameNr = 0;
5429 ctxt->name = NULL;
5430
5431 DICT_FREE(ctxt->version);
5432 ctxt->version = NULL;
5433 DICT_FREE(ctxt->encoding);
5434 ctxt->encoding = NULL;
5435 DICT_FREE(ctxt->directory);
5436 ctxt->directory = NULL;
5437 DICT_FREE(ctxt->extSubURI);
5438 ctxt->extSubURI = NULL;
5439 DICT_FREE(ctxt->extSubSystem);
5440 ctxt->extSubSystem = NULL;
5441 if (ctxt->myDoc != NULL)
5442 xmlFreeDoc(ctxt->myDoc);
5443 ctxt->myDoc = NULL;
5444
5445 ctxt->standalone = -1;
5446 ctxt->hasExternalSubset = 0;
5447 ctxt->hasPErefs = 0;
5448 ctxt->html = 1;
5449 ctxt->external = 0;
5450 ctxt->instate = XML_PARSER_START;
5451 ctxt->token = 0;
5452
5453 ctxt->wellFormed = 1;
5454 ctxt->nsWellFormed = 1;
5455 ctxt->valid = 1;
5456 ctxt->vctxt.userData = ctxt;
5457 ctxt->vctxt.error = xmlParserValidityError;
5458 ctxt->vctxt.warning = xmlParserValidityWarning;
5459 ctxt->record_info = 0;
5460 ctxt->nbChars = 0;
5461 ctxt->checkIndex = 0;
5462 ctxt->inSubset = 0;
5463 ctxt->errNo = XML_ERR_OK;
5464 ctxt->depth = 0;
5465 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5466 ctxt->catalogs = NULL;
5467 xmlInitNodeInfoSeq(&ctxt->node_seq);
5468
5469 if (ctxt->attsDefault != NULL) {
5470 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5471 ctxt->attsDefault = NULL;
5472 }
5473 if (ctxt->attsSpecial != NULL) {
5474 xmlHashFree(ctxt->attsSpecial, NULL);
5475 ctxt->attsSpecial = NULL;
5476 }
5477}
5478
5479/**
5480 * htmlCtxtUseOptions:
5481 * @ctxt: an HTML parser context
5482 * @options: a combination of htmlParserOption(s)
5483 *
5484 * Applies the options to the parser context
5485 *
5486 * Returns 0 in case of success, the set of unknown or unimplemented options
5487 * in case of error.
5488 */
5489int
5490htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5491{
5492 if (options & HTML_PARSE_NOWARNING) {
5493 ctxt->sax->warning = NULL;
5494 options -= XML_PARSE_NOWARNING;
5495 }
5496 if (options & HTML_PARSE_NOERROR) {
5497 ctxt->sax->error = NULL;
5498 ctxt->sax->fatalError = NULL;
5499 options -= XML_PARSE_NOERROR;
5500 }
5501 if (options & HTML_PARSE_PEDANTIC) {
5502 ctxt->pedantic = 1;
5503 options -= XML_PARSE_PEDANTIC;
5504 } else
5505 ctxt->pedantic = 0;
5506 if (options & XML_PARSE_NOBLANKS) {
5507 ctxt->keepBlanks = 0;
5508 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5509 options -= XML_PARSE_NOBLANKS;
5510 } else
5511 ctxt->keepBlanks = 1;
5512 ctxt->dictNames = 0;
5513 return (options);
5514}
5515
5516/**
5517 * htmlDoRead:
5518 * @ctxt: an HTML parser context
5519 * @URL: the base URL to use for the document
5520 * @encoding: the document encoding, or NULL
5521 * @options: a combination of htmlParserOption(s)
5522 * @reuse: keep the context for reuse
5523 *
5524 * Common front-end for the htmlRead functions
5525 *
5526 * Returns the resulting document tree or NULL
5527 */
5528static htmlDocPtr
5529htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5530 int options, int reuse)
5531{
5532 htmlDocPtr ret;
5533
5534 htmlCtxtUseOptions(ctxt, options);
5535 ctxt->html = 1;
5536 if (encoding != NULL) {
5537 xmlCharEncodingHandlerPtr hdlr;
5538
5539 hdlr = xmlFindCharEncodingHandler(encoding);
5540 if (hdlr != NULL)
5541 xmlSwitchToEncoding(ctxt, hdlr);
5542 }
5543 if ((URL != NULL) && (ctxt->input != NULL) &&
5544 (ctxt->input->filename == NULL))
5545 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5546 htmlParseDocument(ctxt);
5547 ret = ctxt->myDoc;
5548 ctxt->myDoc = NULL;
5549 if (!reuse) {
5550 if ((ctxt->dictNames) &&
5551 (ret != NULL) &&
5552 (ret->dict == ctxt->dict))
5553 ctxt->dict = NULL;
5554 xmlFreeParserCtxt(ctxt);
5555 } else {
5556 /* Must duplicate the reference to the dictionary */
5557 if ((ctxt->dictNames) &&
5558 (ret != NULL) &&
5559 (ret->dict == ctxt->dict))
5560 xmlDictReference(ctxt->dict);
5561 }
5562 return (ret);
5563}
5564
5565/**
5566 * htmlReadDoc:
5567 * @cur: a pointer to a zero terminated string
5568 * @URL: the base URL to use for the document
5569 * @encoding: the document encoding, or NULL
5570 * @options: a combination of htmlParserOption(s)
5571 *
5572 * parse an XML in-memory document and build a tree.
5573 *
5574 * Returns the resulting document tree
5575 */
5576htmlDocPtr
5577htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5578{
5579 htmlParserCtxtPtr ctxt;
5580
5581 if (cur == NULL)
5582 return (NULL);
5583
5584 ctxt = xmlCreateDocParserCtxt(cur);
5585 if (ctxt == NULL)
5586 return (NULL);
5587 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5588}
5589
5590/**
5591 * htmlReadFile:
5592 * @filename: a file or URL
5593 * @encoding: the document encoding, or NULL
5594 * @options: a combination of htmlParserOption(s)
5595 *
5596 * parse an XML file from the filesystem or the network.
5597 *
5598 * Returns the resulting document tree
5599 */
5600htmlDocPtr
5601htmlReadFile(const char *filename, const char *encoding, int options)
5602{
5603 htmlParserCtxtPtr ctxt;
5604
5605 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5606 if (ctxt == NULL)
5607 return (NULL);
5608 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5609}
5610
5611/**
5612 * htmlReadMemory:
5613 * @buffer: a pointer to a char array
5614 * @size: the size of the array
5615 * @URL: the base URL to use for the document
5616 * @encoding: the document encoding, or NULL
5617 * @options: a combination of htmlParserOption(s)
5618 *
5619 * parse an XML in-memory document and build a tree.
5620 *
5621 * Returns the resulting document tree
5622 */
5623htmlDocPtr
5624htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5625{
5626 htmlParserCtxtPtr ctxt;
5627
5628 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5629 if (ctxt == NULL)
5630 return (NULL);
5631 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5632}
5633
5634/**
5635 * htmlReadFd:
5636 * @fd: an open file descriptor
5637 * @URL: the base URL to use for the document
5638 * @encoding: the document encoding, or NULL
5639 * @options: a combination of htmlParserOption(s)
5640 *
5641 * parse an XML from a file descriptor and build a tree.
5642 *
5643 * Returns the resulting document tree
5644 */
5645htmlDocPtr
5646htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5647{
5648 htmlParserCtxtPtr ctxt;
5649 xmlParserInputBufferPtr input;
5650 xmlParserInputPtr stream;
5651
5652 if (fd < 0)
5653 return (NULL);
5654
5655 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5656 if (input == NULL)
5657 return (NULL);
5658 ctxt = xmlNewParserCtxt();
5659 if (ctxt == NULL) {
5660 xmlFreeParserInputBuffer(input);
5661 return (NULL);
5662 }
5663 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5664 if (stream == NULL) {
5665 xmlFreeParserInputBuffer(input);
5666 xmlFreeParserCtxt(ctxt);
5667 return (NULL);
5668 }
5669 inputPush(ctxt, stream);
5670 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5671}
5672
5673/**
5674 * htmlReadIO:
5675 * @ioread: an I/O read function
5676 * @ioclose: an I/O close function
5677 * @ioctx: an I/O handler
5678 * @URL: the base URL to use for the document
5679 * @encoding: the document encoding, or NULL
5680 * @options: a combination of htmlParserOption(s)
5681 *
5682 * parse an HTML document from I/O functions and source and build a tree.
5683 *
5684 * Returns the resulting document tree
5685 */
5686htmlDocPtr
5687htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5688 void *ioctx, const char *URL, const char *encoding, int options)
5689{
5690 htmlParserCtxtPtr ctxt;
5691 xmlParserInputBufferPtr input;
5692 xmlParserInputPtr stream;
5693
5694 if (ioread == NULL)
5695 return (NULL);
5696
5697 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5698 XML_CHAR_ENCODING_NONE);
5699 if (input == NULL)
5700 return (NULL);
5701 ctxt = xmlNewParserCtxt();
5702 if (ctxt == NULL) {
5703 xmlFreeParserInputBuffer(input);
5704 return (NULL);
5705 }
5706 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5707 if (stream == NULL) {
5708 xmlFreeParserInputBuffer(input);
5709 xmlFreeParserCtxt(ctxt);
5710 return (NULL);
5711 }
5712 inputPush(ctxt, stream);
5713 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5714}
5715
5716/**
5717 * htmlCtxtReadDoc:
5718 * @ctxt: an HTML parser context
5719 * @cur: a pointer to a zero terminated string
5720 * @URL: the base URL to use for the document
5721 * @encoding: the document encoding, or NULL
5722 * @options: a combination of htmlParserOption(s)
5723 *
5724 * parse an XML in-memory document and build a tree.
5725 * This reuses the existing @ctxt parser context
5726 *
5727 * Returns the resulting document tree
5728 */
5729htmlDocPtr
5730htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5731 const char *URL, const char *encoding, int options)
5732{
5733 xmlParserInputPtr stream;
5734
5735 if (cur == NULL)
5736 return (NULL);
5737 if (ctxt == NULL)
5738 return (NULL);
5739
5740 htmlCtxtReset(ctxt);
5741
5742 stream = xmlNewStringInputStream(ctxt, cur);
5743 if (stream == NULL) {
5744 return (NULL);
5745 }
5746 inputPush(ctxt, stream);
5747 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5748}
5749
5750/**
5751 * htmlCtxtReadFile:
5752 * @ctxt: an HTML parser context
5753 * @filename: a file or URL
5754 * @encoding: the document encoding, or NULL
5755 * @options: a combination of htmlParserOption(s)
5756 *
5757 * parse an XML file from the filesystem or the network.
5758 * This reuses the existing @ctxt parser context
5759 *
5760 * Returns the resulting document tree
5761 */
5762htmlDocPtr
5763htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5764 const char *encoding, int options)
5765{
5766 xmlParserInputPtr stream;
5767
5768 if (filename == NULL)
5769 return (NULL);
5770 if (ctxt == NULL)
5771 return (NULL);
5772
5773 htmlCtxtReset(ctxt);
5774
5775 stream = xmlNewInputFromFile(ctxt, filename);
5776 if (stream == NULL) {
5777 return (NULL);
5778 }
5779 inputPush(ctxt, stream);
5780 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5781}
5782
5783/**
5784 * htmlCtxtReadMemory:
5785 * @ctxt: an HTML parser context
5786 * @buffer: a pointer to a char array
5787 * @size: the size of the array
5788 * @URL: the base URL to use for the document
5789 * @encoding: the document encoding, or NULL
5790 * @options: a combination of htmlParserOption(s)
5791 *
5792 * parse an XML in-memory document and build a tree.
5793 * This reuses the existing @ctxt parser context
5794 *
5795 * Returns the resulting document tree
5796 */
5797htmlDocPtr
5798htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5799 const char *URL, const char *encoding, int options)
5800{
5801 xmlParserInputBufferPtr input;
5802 xmlParserInputPtr stream;
5803
5804 if (ctxt == NULL)
5805 return (NULL);
5806 if (buffer == NULL)
5807 return (NULL);
5808
5809 htmlCtxtReset(ctxt);
5810
5811 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5812 if (input == NULL) {
5813 return(NULL);
5814 }
5815
5816 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5817 if (stream == NULL) {
5818 xmlFreeParserInputBuffer(input);
5819 return(NULL);
5820 }
5821
5822 inputPush(ctxt, stream);
5823 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5824}
5825
5826/**
5827 * htmlCtxtReadFd:
5828 * @ctxt: an HTML parser context
5829 * @fd: an open file descriptor
5830 * @URL: the base URL to use for the document
5831 * @encoding: the document encoding, or NULL
5832 * @options: a combination of htmlParserOption(s)
5833 *
5834 * parse an XML from a file descriptor and build a tree.
5835 * This reuses the existing @ctxt parser context
5836 *
5837 * Returns the resulting document tree
5838 */
5839htmlDocPtr
5840htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5841 const char *URL, const char *encoding, int options)
5842{
5843 xmlParserInputBufferPtr input;
5844 xmlParserInputPtr stream;
5845
5846 if (fd < 0)
5847 return (NULL);
5848 if (ctxt == NULL)
5849 return (NULL);
5850
5851 htmlCtxtReset(ctxt);
5852
5853
5854 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5855 if (input == NULL)
5856 return (NULL);
5857 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5858 if (stream == NULL) {
5859 xmlFreeParserInputBuffer(input);
5860 return (NULL);
5861 }
5862 inputPush(ctxt, stream);
5863 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5864}
5865
5866/**
5867 * htmlCtxtReadIO:
5868 * @ctxt: an HTML parser context
5869 * @ioread: an I/O read function
5870 * @ioclose: an I/O close function
5871 * @ioctx: an I/O handler
5872 * @URL: the base URL to use for the document
5873 * @encoding: the document encoding, or NULL
5874 * @options: a combination of htmlParserOption(s)
5875 *
5876 * parse an HTML document from I/O functions and source and build a tree.
5877 * This reuses the existing @ctxt parser context
5878 *
5879 * Returns the resulting document tree
5880 */
5881htmlDocPtr
5882htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5883 xmlInputCloseCallback ioclose, void *ioctx,
5884 const char *URL,
5885 const char *encoding, int options)
5886{
5887 xmlParserInputBufferPtr input;
5888 xmlParserInputPtr stream;
5889
5890 if (ioread == NULL)
5891 return (NULL);
5892 if (ctxt == NULL)
5893 return (NULL);
5894
5895 htmlCtxtReset(ctxt);
5896
5897 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5898 XML_CHAR_ENCODING_NONE);
5899 if (input == NULL)
5900 return (NULL);
5901 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5902 if (stream == NULL) {
5903 xmlFreeParserInputBuffer(input);
5904 return (NULL);
5905 }
5906 inputPush(ctxt, stream);
5907 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5908}
5909
Owen Taylor3473f882001-02-23 17:55:21 +00005910#endif /* LIBXML_HTML_ENABLED */