blob: ba12eccb43277944a62e3ae4f09534608edcf89a [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
67 * xmlErrMemory:
68 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76 if (ctxt != NULL) {
77 ctxt->errNo = XML_ERR_NO_MEMORY;
78 ctxt->instate = XML_PARSER_EOF;
79 ctxt->disableSAX = 1;
80 }
81 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000082 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000083 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
84 NULL, NULL, 0, 0,
85 "Memory allocation failed : %s\n", extra);
86 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000087 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000088 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
89 NULL, NULL, 0, 0, "Memory allocation failed\n");
90}
91
92/**
93 * htmlParseErr:
94 * @ctxt: an HTML parser context
95 * @error: the error number
96 * @msg: the error message
97 * @str1: string infor
98 * @str2: string infor
99 *
100 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
101 */
102static void
103htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
104 const char *msg, const xmlChar *str1, const xmlChar *str2)
105{
106 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000107 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000108 XML_ERR_ERROR, NULL, 0,
109 (const char *) str1, (const char *) str2,
110 NULL, 0, 0,
111 msg, str1, str2);
112 ctxt->wellFormed = 0;
113}
114
115/**
116 * htmlParseErrInt:
117 * @ctxt: an HTML parser context
118 * @error: the error number
119 * @msg: the error message
120 * @val: integer info
121 *
122 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
123 */
124static void
125htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
126 const char *msg, int val)
127{
128 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000129 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000130 XML_ERR_ERROR, NULL, 0, NULL, NULL,
131 NULL, val, 0, msg, val);
132 ctxt->wellFormed = 0;
133}
134
135/************************************************************************
136 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000137 * Parser stacks related functions and macros *
138 * *
139 ************************************************************************/
140
Daniel Veillard1c732d22002-11-30 11:22:59 +0000141/**
142 * htmlnamePush:
143 * @ctxt: an HTML parser context
144 * @value: the element name
145 *
146 * Pushes a new element name on top of the name stack
147 *
148 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000149 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000150static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000151htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000152{
153 if (ctxt->nameNr >= ctxt->nameMax) {
154 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000155 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000156 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000157 ctxt->nameMax *
158 sizeof(ctxt->nameTab[0]));
159 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000160 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000161 return (0);
162 }
163 }
164 ctxt->nameTab[ctxt->nameNr] = value;
165 ctxt->name = value;
166 return (ctxt->nameNr++);
167}
168/**
169 * htmlnamePop:
170 * @ctxt: an HTML parser context
171 *
172 * Pops the top element name from the name stack
173 *
174 * Returns the name just removed
175 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000176static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000177htmlnamePop(htmlParserCtxtPtr ctxt)
178{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000179 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000180
Daniel Veillard1c732d22002-11-30 11:22:59 +0000181 if (ctxt->nameNr <= 0)
182 return (0);
183 ctxt->nameNr--;
184 if (ctxt->nameNr < 0)
185 return (0);
186 if (ctxt->nameNr > 0)
187 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
188 else
189 ctxt->name = NULL;
190 ret = ctxt->nameTab[ctxt->nameNr];
191 ctxt->nameTab[ctxt->nameNr] = 0;
192 return (ret);
193}
Owen Taylor3473f882001-02-23 17:55:21 +0000194
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
223
/*
 * All of the macros below expect a variable named `ctxt` (pointer to the
 * parser context) to be in scope at the point of use.
 */

/* Current byte / n'th following byte, uppercased with toupper(). */
#define UPPER (toupper(*ctxt->input->cur))
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Advance by val bytes, updating the byte counter and the column. */
#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

/* n'th following byte and the raw input pointer (both usable as lvalues). */
#define NXT(val) (ctxt->input->cur[(val)])
#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* CUR reads the input byte directly; unlike RAW it ignores ctxt->token. */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending in ctxt->token, else the current byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Step over one character that is l bytes long: a newline bumps the line
 * counter and resets the column, anything else bumps the column. The
 * pending token is cleared and nbChars grows by one.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)
259
260/************
261 \
262 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
263 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
264 ************/
265
/* Decode the character at the current position; its length goes to l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Same, decoding from the string s instead of the parser input. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v, which is l bytes long, to buffer b at index i and
 * advance i. Single-byte characters are stored directly, longer ones go
 * through xmlCopyChar(). Wrapped in do/while(0) so the macro behaves as a
 * single statement, with every argument parenthesised.
 */
#define COPY_BUF(l,b,i,v)						\
    do {								\
	if ((l) == 1) (b)[(i)++] = (xmlChar) (v);			\
	else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
    } while (0)
272
273/**
274 * htmlCurrentChar:
275 * @ctxt: the HTML parser context
276 * @len: pointer to the length of the char read
277 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000278 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000279 * bytes in the input buffer. Implement the end of line normalization:
280 * 2.11 End-of-Line Handling
281 * If the encoding is unspecified, in the case we find an ISO-Latin-1
282 * char, then the encoding converter is plugged in automatically.
283 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000284 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000285 */
286
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000287static int
Owen Taylor3473f882001-02-23 17:55:21 +0000288htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
289 if (ctxt->instate == XML_PARSER_EOF)
290 return(0);
291
292 if (ctxt->token != 0) {
293 *len = 0;
294 return(ctxt->token);
295 }
296 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
297 /*
298 * We are supposed to handle UTF8, check it's valid
299 * From rfc2044: encoding of the Unicode values on UTF-8:
300 *
301 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
302 * 0000 0000-0000 007F 0xxxxxxx
303 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
304 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
305 *
306 * Check for the 0x110000 limit too
307 */
308 const unsigned char *cur = ctxt->input->cur;
309 unsigned char c;
310 unsigned int val;
311
312 c = *cur;
313 if (c & 0x80) {
314 if (cur[1] == 0)
315 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
316 if ((cur[1] & 0xc0) != 0x80)
317 goto encoding_error;
318 if ((c & 0xe0) == 0xe0) {
319
320 if (cur[2] == 0)
321 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
322 if ((cur[2] & 0xc0) != 0x80)
323 goto encoding_error;
324 if ((c & 0xf0) == 0xf0) {
325 if (cur[3] == 0)
326 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
327 if (((c & 0xf8) != 0xf0) ||
328 ((cur[3] & 0xc0) != 0x80))
329 goto encoding_error;
330 /* 4-byte code */
331 *len = 4;
332 val = (cur[0] & 0x7) << 18;
333 val |= (cur[1] & 0x3f) << 12;
334 val |= (cur[2] & 0x3f) << 6;
335 val |= cur[3] & 0x3f;
336 } else {
337 /* 3-byte code */
338 *len = 3;
339 val = (cur[0] & 0xf) << 12;
340 val |= (cur[1] & 0x3f) << 6;
341 val |= cur[2] & 0x3f;
342 }
343 } else {
344 /* 2-byte code */
345 *len = 2;
346 val = (cur[0] & 0x1f) << 6;
347 val |= cur[1] & 0x3f;
348 }
349 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000350 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
351 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000352 }
353 return(val);
354 } else {
355 /* 1-byte code */
356 *len = 1;
357 return((int) *ctxt->input->cur);
358 }
359 }
360 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000361 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000362 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000363 * XML constructs only use < 128 chars
364 */
365 *len = 1;
366 if ((int) *ctxt->input->cur < 0x80)
367 return((int) *ctxt->input->cur);
368
369 /*
370 * Humm this is bad, do an automatic flow conversion
371 */
372 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
373 ctxt->charset = XML_CHAR_ENCODING_UTF8;
374 return(xmlCurrentChar(ctxt, len));
375
376encoding_error:
377 /*
378 * If we detect an UTF8 error that probably mean that the
379 * input encoding didn't get properly advertized in the
380 * declaration header. Report the error and switch the encoding
381 * to ISO-Latin-1 (if you don't like this policy, just declare the
382 * encoding !)
383 */
Daniel Veillardf403d292003-10-05 13:51:35 +0000384 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
385 "Input is not proper UTF-8, indicate encoding !\n",
386 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000387 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000388 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
389 ctxt->input->cur[0], ctxt->input->cur[1],
390 ctxt->input->cur[2], ctxt->input->cur[3]);
391 }
392
393 ctxt->charset = XML_CHAR_ENCODING_8859_1;
394 *len = 1;
395 return((int) *ctxt->input->cur);
396}
397
398/**
Owen Taylor3473f882001-02-23 17:55:21 +0000399 * htmlSkipBlankChars:
400 * @ctxt: the HTML parser context
401 *
402 * skip all blanks character found at that point in the input streams.
403 *
404 * Returns the number of space chars skipped
405 */
406
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000407static int
Owen Taylor3473f882001-02-23 17:55:21 +0000408htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
409 int res = 0;
410
William M. Brack76e95df2003-10-18 16:20:14 +0000411 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000412 if ((*ctxt->input->cur == 0) &&
413 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
414 xmlPopInput(ctxt);
415 } else {
416 if (*(ctxt->input->cur) == '\n') {
417 ctxt->input->line++; ctxt->input->col = 1;
418 } else ctxt->input->col++;
419 ctxt->input->cur++;
420 ctxt->nbChars++;
421 if (*ctxt->input->cur == 0)
422 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
423 }
424 res++;
425 }
426 return(res);
427}
428
429
430
431/************************************************************************
432 * *
433 * The list of HTML elements and their properties *
434 * *
435 ************************************************************************/
436
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000449
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each macro below expands to a comma-separated run of string literals.
 * Composite groups must join their parts with commas: adjacent string
 * literals are concatenated by the compiler, which would silently merge
 * the last name of one group with the first name of the next.
 * PCDATA and MODIFIER expand to nothing.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


/* Every list below is terminated by a NULL entry. */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
/* NULL terminator added: this list was the only one without it */
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
/* comma after ATTRS is required, else "onkeyup" and "summary" get merged */
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;

/* Cast applied to the lists above when they are stored in the element table */
#define DECL (const char**)
582
Daniel Veillard22090732001-07-16 00:06:07 +0000583static const htmlElemDesc
584html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000585{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
586 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
587},
588{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
589 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
590},
591{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
592 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
593},
594{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
595 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
596},
597{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
598 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
599},
600{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
601 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
602},
603{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
604 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
605},
606{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
607 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
608},
609{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
610 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
611},
612{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
613 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
614},
615{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
616 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
617},
618{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
619 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
620},
621{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
622 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
623},
624{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
625 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
626},
627{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
628 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
629},
630{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
631 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632},
633{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
634 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
635},
636{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
637 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
638},
639{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
640 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
641},
642{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
643 EMPTY , NULL , DECL col_attrs , NULL, NULL
644},
645{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
646 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
647},
648{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
649 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
650},
651{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
652 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
653},
654{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
655 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
656},
657{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
658 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
659},
660{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
661 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
662},
663{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
664 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
665},
666{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
667 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
668},
669{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
670 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
671},
672{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
673 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
674},
675{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
676 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
677},
678{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
679 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
680},
681{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
682 EMPTY, NULL, NULL, DECL frame_attrs, NULL
683},
684{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
685 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
686},
687{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
688 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
689},
690{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
691 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
692},
693{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
694 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
695},
696{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
697 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
698},
699{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
700 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
701},
702{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
703 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
704},
705{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
706 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
707},
708{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
709 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
710},
711{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
712 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
713},
714{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
715 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
716},
717{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
718 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
719},
720{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
721 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
722},
723{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
724 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
725},
726{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
727 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
728},
729{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
730 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
731},
732{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
733 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
734},
735{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
736 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
737},
738{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
739 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
740},
741{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
742 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
743},
744{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
745 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
746},
747{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
748 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
749},
750{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
751 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
752},
753{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
754 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
755},
756{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
757 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
758},
759{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
760 DECL html_flow, "div", DECL html_attrs, NULL, NULL
761},
762{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
763 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
764},
765{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
766 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
767},
768{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
769 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
770},
771{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
772 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
773},
774{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
775 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
776},
777{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
778 EMPTY, NULL, DECL param_attrs, NULL, name_attr
779},
780{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
781 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
782},
783{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
784 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
785},
786{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
787 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
788},
789{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
790 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
791},
792{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
793 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
794},
795{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
796 DECL select_content, NULL, DECL select_attrs, NULL, NULL
797},
798{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
799 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
800},
801{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
802 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
803},
804{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
805 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
806},
807{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
808 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
809},
810{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
811 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
812},
813{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
814 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
815},
816{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
817 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
818},
819{ "table", 0, 0, 0, 0, 0, 0, 0, "",
820 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
821},
822{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
823 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
824},
825{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
826 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
827},
828{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
829 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
830},
831{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
832 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
833},
834{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
835 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
836},
837{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
838 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
839},
840{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
841 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
842},
843{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
844 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
845},
846{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
847 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848},
849{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
850 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851},
852{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
853 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
854},
855{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
856 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
857}
Owen Taylor3473f882001-02-23 17:55:21 +0000858};
859
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups, one group per
 * row below.  The first string of a group is the name of a start tag; the
 * strings following it name the open elements which that start tag closes
 * implicitly.  A lone NULL after the last group terminates the table.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL
};
919
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * carry CDATA content themselves; a p element is implied when character
 * data is about to be inserted directly into one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
933
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array has no NULL terminator; users iterate over it with
 * sizeof(htmlScriptAttributes) / sizeof(htmlScriptAttributes[0]).
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",          /* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};
959
/*
 * End tag priorities, used by the HTML parser to cope with broken pages.
 *
 * Each element name is given a priority; an extra or misplaced end tag
 * is only allowed to close open elements whose priority is lower than or
 * equal to its own.  Names missing from the table get the priority of
 * the terminating {NULL, ...} entry.
 */

typedef struct {
    const char *name;           /* element name, NULL in the last entry */
    int priority;               /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }            /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000987
/* set to 1 once htmlInitAutoClose() has filled htmlStartCloseIndex */
static int htmlStartCloseIndexinitialized = 0;
/* pointers to the first string of each group of htmlStartClose */
static const char **htmlStartCloseIndex[100];
990
991/************************************************************************
992 * *
993 * functions to handle HTML specific data *
994 * *
995 ************************************************************************/
996
997/**
998 * htmlInitAutoClose:
999 *
1000 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1001 * This is not reentrant. Call xmlInitParser() once before processing in
1002 * case of use in multithreaded programs.
1003 */
1004void
1005htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001006 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001007
1008 if (htmlStartCloseIndexinitialized) return;
1009
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001010 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1011 indx = 0;
1012 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1013 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001014 while (htmlStartClose[i] != NULL) i++;
1015 i++;
1016 }
1017 htmlStartCloseIndexinitialized = 1;
1018}
1019
1020/**
1021 * htmlTagLookup:
1022 * @tag: The tag name in lowercase
1023 *
1024 * Lookup the HTML tag in the ElementTable
1025 *
1026 * Returns the related htmlElemDescPtr or NULL if not found.
1027 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001028const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001029htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001030 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001031
1032 for (i = 0; i < (sizeof(html40ElementTable) /
1033 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001034 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001035 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001036 }
1037 return(NULL);
1038}
1039
1040/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001041 * htmlGetEndPriority:
1042 * @name: The name of the element to look up the priority for.
1043 *
1044 * Return value: The "endtag" priority.
1045 **/
1046static int
1047htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001048 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001049
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001050 while ((htmlEndPriority[i].name != NULL) &&
1051 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1052 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001053
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001054 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001055}
1056
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001057
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001058/**
Owen Taylor3473f882001-02-23 17:55:21 +00001059 * htmlCheckAutoClose:
1060 * @newtag: The new tag name
1061 * @oldtag: The old tag name
1062 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001063 * Checks whether the new tag is one of the registered valid tags for
1064 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001065 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1066 *
1067 * Returns 0 if no, 1 if yes.
1068 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001069static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001070htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1071{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001072 int i, indx;
1073 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001074
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001075 if (htmlStartCloseIndexinitialized == 0)
1076 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001077
1078 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001079 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001080 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001081 if (closed == NULL)
1082 return (0);
1083 if (xmlStrEqual(BAD_CAST * closed, newtag))
1084 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001085 }
1086
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001087 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001088 i++;
1089 while (htmlStartClose[i] != NULL) {
1090 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 return (1);
1092 }
1093 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001094 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001096}
1097
1098/**
1099 * htmlAutoCloseOnClose:
1100 * @ctxt: an HTML parser context
1101 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001102 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001103 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001104 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001105 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001106static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001107htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1108{
1109 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001110 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001111
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001113
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001114 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001115
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1117 break;
1118 /*
1119 * A missplaced endtag can only close elements with lower
1120 * or equal priority, so if we find an element with higher
1121 * priority before we find an element with
1122 * matching name, we just ignore this endtag
1123 */
1124 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1125 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001126 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001127 if (i < 0)
1128 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001129
1130 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001131 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001132 if ((info != NULL) && (info->endTag == 3)) {
1133 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1134 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001135 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001136 }
1137 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1138 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001139 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001140 }
1141}
1142
1143/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001144 * htmlAutoCloseOnEnd:
1145 * @ctxt: an HTML parser context
1146 *
1147 * Close all remaining tags at the end of the stream
1148 */
1149static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001150htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1151{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001152 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001153
William M. Brack899e64a2003-09-26 18:03:42 +00001154 if (ctxt->nameNr == 0)
1155 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001156 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001157 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1158 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001159 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001160 }
1161}
1162
1163/**
Owen Taylor3473f882001-02-23 17:55:21 +00001164 * htmlAutoClose:
1165 * @ctxt: an HTML parser context
1166 * @newtag: The new tag name or NULL
1167 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001168 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001169 * The list is kept in htmlStartClose array. This function is
1170 * called when a new tag has been detected and generates the
1171 * appropriates closes if possible/needed.
1172 * If newtag is NULL this mean we are at the end of the resource
1173 * and we should check
1174 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001175static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001176htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1177{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001178 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001179 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001180 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1181 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001182 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001183 }
1184 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001185 htmlAutoCloseOnEnd(ctxt);
1186 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001187 }
1188 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001189 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1190 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1191 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001192 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1193 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001194 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001195 }
Owen Taylor3473f882001-02-23 17:55:21 +00001196}
1197
1198/**
1199 * htmlAutoCloseTag:
1200 * @doc: the HTML document
1201 * @name: The tag name
1202 * @elem: the HTML element
1203 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001204 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001205 * The list is kept in htmlStartClose array. This function checks
1206 * if the element or one of it's children would autoclose the
1207 * given tag.
1208 *
1209 * Returns 1 if autoclose, 0 otherwise
1210 */
1211int
1212htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1213 htmlNodePtr child;
1214
1215 if (elem == NULL) return(1);
1216 if (xmlStrEqual(name, elem->name)) return(0);
1217 if (htmlCheckAutoClose(elem->name, name)) return(1);
1218 child = elem->children;
1219 while (child != NULL) {
1220 if (htmlAutoCloseTag(doc, name, child)) return(1);
1221 child = child->next;
1222 }
1223 return(0);
1224}
1225
1226/**
1227 * htmlIsAutoClosed:
1228 * @doc: the HTML document
1229 * @elem: the HTML element
1230 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001231 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001232 * The list is kept in htmlStartClose array. This function checks
1233 * if a tag is autoclosed by one of it's child
1234 *
1235 * Returns 1 if autoclosed, 0 otherwise
1236 */
1237int
1238htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1239 htmlNodePtr child;
1240
1241 if (elem == NULL) return(1);
1242 child = elem->children;
1243 while (child != NULL) {
1244 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1245 child = child->next;
1246 }
1247 return(0);
1248}
1249
1250/**
1251 * htmlCheckImplied:
1252 * @ctxt: an HTML parser context
1253 * @newtag: The new tag name
1254 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001255 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001256 * called when a new tag has been detected and generates the
1257 * appropriates implicit tags if missing
1258 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001259static void
Owen Taylor3473f882001-02-23 17:55:21 +00001260htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1261 if (!htmlOmittedDefaultValue)
1262 return;
1263 if (xmlStrEqual(newtag, BAD_CAST"html"))
1264 return;
1265 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001266 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001267 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1268 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1269 }
1270 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1271 return;
1272 if ((ctxt->nameNr <= 1) &&
1273 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1274 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1275 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1276 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1277 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1278 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1279 /*
1280 * dropped OBJECT ... i you put it first BODY will be
1281 * assumed !
1282 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001283 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001284 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1285 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1286 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1287 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1288 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1289 int i;
1290 for (i = 0;i < ctxt->nameNr;i++) {
1291 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1292 return;
1293 }
1294 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1295 return;
1296 }
1297 }
1298
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001299 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001300 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1301 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1302 }
1303}
1304
1305/**
1306 * htmlCheckParagraph
1307 * @ctxt: an HTML parser context
1308 *
1309 * Check whether a p element need to be implied before inserting
1310 * characters in the current element.
1311 *
1312 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1313 * in case of error.
1314 */
1315
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001316static int
Owen Taylor3473f882001-02-23 17:55:21 +00001317htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1318 const xmlChar *tag;
1319 int i;
1320
1321 if (ctxt == NULL)
1322 return(-1);
1323 tag = ctxt->name;
1324 if (tag == NULL) {
1325 htmlAutoClose(ctxt, BAD_CAST"p");
1326 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001327 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001328 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1329 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1330 return(1);
1331 }
1332 if (!htmlOmittedDefaultValue)
1333 return(0);
1334 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1335 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001336 htmlAutoClose(ctxt, BAD_CAST"p");
1337 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001338 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001339 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1340 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1341 return(1);
1342 }
1343 }
1344 return(0);
1345}
1346
1347/**
1348 * htmlIsScriptAttribute:
1349 * @name: an attribute name
1350 *
1351 * Check if an attribute is of content type Script
1352 *
1353 * Returns 1 is the attribute is a script 0 otherwise
1354 */
1355int
1356htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001357 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001358
1359 if (name == NULL)
1360 return(0);
1361 /*
1362 * all script attributes start with 'on'
1363 */
1364 if ((name[0] != 'o') || (name[1] != 'n'))
1365 return(0);
1366 for (i = 0;
1367 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1368 i++) {
1369 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1370 return(1);
1371 }
1372 return(0);
1373}
1374
1375/************************************************************************
1376 * *
1377 * The list of HTML predefined entities *
1378 * *
1379 ************************************************************************/
1380
1381
Daniel Veillard22090732001-07-16 00:06:07 +00001382static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001383/*
1384 * the 4 absolute ones, plus apostrophe.
1385 */
1386{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1387{ 38, "amp", "ampersand, U+0026 ISOnum" },
1388{ 39, "apos", "single quote" },
1389{ 60, "lt", "less-than sign, U+003C ISOnum" },
1390{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1391
1392/*
1393 * A bunch still in the 128-255 range
1394 * Replacing them depend really on the charset used.
1395 */
1396{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1397{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1398{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1399{ 163, "pound","pound sign, U+00A3 ISOnum" },
1400{ 164, "curren","currency sign, U+00A4 ISOnum" },
1401{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1402{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1403{ 167, "sect", "section sign, U+00A7 ISOnum" },
1404{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1405{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1406{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1407{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1408{ 172, "not", "not sign, U+00AC ISOnum" },
1409{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1410{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1411{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1412{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1413{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1414{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1415{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1416{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1417{ 181, "micro","micro sign, U+00B5 ISOnum" },
1418{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1419{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1420{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1421{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1422{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1423{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1424{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1425{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1426{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1427{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1428{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1429{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1430{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1431{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1432{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1433{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1434{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1435{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1436{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1437{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1438{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1439{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1440{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1441{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1442{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1443{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1444{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1445{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1446{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1447{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1448{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1449{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1450{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1451{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1452{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1453{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1454{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1455{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1456{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1457{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1458{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1459{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1460{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1461{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1462{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1463{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1464{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1465{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1466{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1467{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1468{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1469{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1470{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1471{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1472{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1473{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1474{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1475{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1476{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1477{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1478{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1479{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1480{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1481{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1482{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1483{ 247, "divide","division sign, U+00F7 ISOnum" },
1484{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1485{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1486{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1487{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1488{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1489{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1490{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1491{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1492
1493{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1494{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1495{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1496{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1497{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1498
1499/*
1500 * Anything below should really be kept as entities references
1501 */
1502{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1503
1504{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1505{ 732, "tilde","small tilde, U+02DC ISOdia" },
1506
1507{ 913, "Alpha","greek capital letter alpha, U+0391" },
1508{ 914, "Beta", "greek capital letter beta, U+0392" },
1509{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1510{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1511{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1512{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1513{ 919, "Eta", "greek capital letter eta, U+0397" },
1514{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1515{ 921, "Iota", "greek capital letter iota, U+0399" },
1516{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001517{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001518{ 924, "Mu", "greek capital letter mu, U+039C" },
1519{ 925, "Nu", "greek capital letter nu, U+039D" },
1520{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1521{ 927, "Omicron","greek capital letter omicron, U+039F" },
1522{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1523{ 929, "Rho", "greek capital letter rho, U+03A1" },
1524{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1525{ 932, "Tau", "greek capital letter tau, U+03A4" },
1526{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1527{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1528{ 935, "Chi", "greek capital letter chi, U+03A7" },
1529{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1530{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1531
1532{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1533{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1534{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1535{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1536{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1537{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1538{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1539{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1540{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1541{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1542{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1543{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1544{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1545{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1546{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1547{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1548{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1549{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1550{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1551{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1552{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1553{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1554{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1555{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1556{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1557{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1558{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1559{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1560
1561{ 8194, "ensp", "en space, U+2002 ISOpub" },
1562{ 8195, "emsp", "em space, U+2003 ISOpub" },
1563{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1564{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1565{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1566{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1567{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1568{ 8211, "ndash","en dash, U+2013 ISOpub" },
1569{ 8212, "mdash","em dash, U+2014 ISOpub" },
1570{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1571{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1572{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1573{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1574{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1575{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1576{ 8224, "dagger","dagger, U+2020 ISOpub" },
1577{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1578
1579{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1580{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1581
1582{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1583
1584{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1585{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1586
1587{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1588{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1589
1590{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1591{ 8260, "frasl","fraction slash, U+2044 NEW" },
1592
1593{ 8364, "euro", "euro sign, U+20AC NEW" },
1594
1595{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1596{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1597{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1598{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1599{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1600{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1601{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1602{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1603{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1604{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1605{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1606{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1607{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1608{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1609{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1610{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1611
1612{ 8704, "forall","for all, U+2200 ISOtech" },
1613{ 8706, "part", "partial differential, U+2202 ISOtech" },
1614{ 8707, "exist","there exists, U+2203 ISOtech" },
1615{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1616{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1617{ 8712, "isin", "element of, U+2208 ISOtech" },
1618{ 8713, "notin","not an element of, U+2209 ISOtech" },
1619{ 8715, "ni", "contains as member, U+220B ISOtech" },
1620{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001621{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001622{ 8722, "minus","minus sign, U+2212 ISOtech" },
1623{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1624{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1625{ 8733, "prop", "proportional to, U+221D ISOtech" },
1626{ 8734, "infin","infinity, U+221E ISOtech" },
1627{ 8736, "ang", "angle, U+2220 ISOamso" },
1628{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1629{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1630{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1631{ 8746, "cup", "union = cup, U+222A ISOtech" },
1632{ 8747, "int", "integral, U+222B ISOtech" },
1633{ 8756, "there4","therefore, U+2234 ISOtech" },
1634{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1635{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1636{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1637{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1638{ 8801, "equiv","identical to, U+2261 ISOtech" },
1639{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1640{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1641{ 8834, "sub", "subset of, U+2282 ISOtech" },
1642{ 8835, "sup", "superset of, U+2283 ISOtech" },
1643{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1644{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1645{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1646{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1647{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1648{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1649{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1650{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1651{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1652{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1653{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1654{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1655{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1656{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1657
1658{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1659{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1660{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1661{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1662
1663};
1664
1665/************************************************************************
1666 * *
1667 * Commodity functions to handle entities *
1668 * *
1669 ************************************************************************/
1670
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * It expects a variable named `ctxt` in the expanding scope and can only
 * be used inside a function returning a pointer: when the reallocation
 * fails the error is reported through htmlErrMemory() and the enclosing
 * function returns NULL.
 *
 * The result of xmlRealloc() goes through a temporary so the previous
 * block is still reachable on failure; it is released before returning,
 * otherwise it would be leaked since the caller never regains control.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                 buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1682
1683/**
1684 * htmlEntityLookup:
1685 * @name: the entity name
1686 *
1687 * Lookup the given entity in EntitiesTable
1688 *
1689 * TODO: the linear scan is really ugly, an hash table is really needed.
1690 *
1691 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1692 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001693const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001694htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001695 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001696
1697 for (i = 0;i < (sizeof(html40EntitiesTable)/
1698 sizeof(html40EntitiesTable[0]));i++) {
1699 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001700 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001701 }
1702 }
1703 return(NULL);
1704}
1705
1706/**
1707 * htmlEntityValueLookup:
1708 * @value: the entity's unicode value
1709 *
1710 * Lookup the given entity in EntitiesTable
1711 *
1712 * TODO: the linear scan is really ugly, an hash table is really needed.
1713 *
1714 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1715 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001716const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001717htmlEntityValueLookup(unsigned int value) {
1718 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001719
1720 for (i = 0;i < (sizeof(html40EntitiesTable)/
1721 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001722 if (html40EntitiesTable[i].value >= value) {
1723 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001724 break;
William M. Brack78637da2003-07-31 14:47:38 +00001725 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001726 }
Owen Taylor3473f882001-02-23 17:55:21 +00001727 }
1728 return(NULL);
1729}
1730
1731/**
1732 * UTF8ToHtml:
1733 * @out: a pointer to an array of bytes to store the result
1734 * @outlen: the length of @out
1735 * @in: a pointer to an array of UTF-8 chars
1736 * @inlen: the length of @in
1737 *
1738 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1739 * plus HTML entities block of chars out.
1740 *
1741 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1742 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001743 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001744 * The value of @outlen after return is the number of octets consumed.
1745 */
1746int
1747UTF8ToHtml(unsigned char* out, int *outlen,
1748 const unsigned char* in, int *inlen) {
1749 const unsigned char* processed = in;
1750 const unsigned char* outend;
1751 const unsigned char* outstart = out;
1752 const unsigned char* instart = in;
1753 const unsigned char* inend;
1754 unsigned int c, d;
1755 int trailing;
1756
1757 if (in == NULL) {
1758 /*
1759 * initialization nothing to do
1760 */
1761 *outlen = 0;
1762 *inlen = 0;
1763 return(0);
1764 }
1765 inend = in + (*inlen);
1766 outend = out + (*outlen);
1767 while (in < inend) {
1768 d = *in++;
1769 if (d < 0x80) { c= d; trailing= 0; }
1770 else if (d < 0xC0) {
1771 /* trailing byte in leading position */
1772 *outlen = out - outstart;
1773 *inlen = processed - instart;
1774 return(-2);
1775 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1776 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1777 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1778 else {
1779 /* no chance for this in Ascii */
1780 *outlen = out - outstart;
1781 *inlen = processed - instart;
1782 return(-2);
1783 }
1784
1785 if (inend - in < trailing) {
1786 break;
1787 }
1788
1789 for ( ; trailing; trailing--) {
1790 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1791 break;
1792 c <<= 6;
1793 c |= d & 0x3F;
1794 }
1795
1796 /* assertion: c is a single UTF-4 value */
1797 if (c < 0x80) {
1798 if (out + 1 >= outend)
1799 break;
1800 *out++ = c;
1801 } else {
1802 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001803 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001804
1805 /*
1806 * Try to lookup a predefined HTML entity for it
1807 */
1808
1809 ent = htmlEntityValueLookup(c);
1810 if (ent == NULL) {
1811 /* no chance for this in Ascii */
1812 *outlen = out - outstart;
1813 *inlen = processed - instart;
1814 return(-2);
1815 }
1816 len = strlen(ent->name);
1817 if (out + 2 + len >= outend)
1818 break;
1819 *out++ = '&';
1820 memcpy(out, ent->name, len);
1821 out += len;
1822 *out++ = ';';
1823 }
1824 processed = in;
1825 }
1826 *outlen = out - outstart;
1827 *inlen = processed - instart;
1828 return(0);
1829}
1830
1831/**
1832 * htmlEncodeEntities:
1833 * @out: a pointer to an array of bytes to store the result
1834 * @outlen: the length of @out
1835 * @in: a pointer to an array of UTF-8 chars
1836 * @inlen: the length of @in
1837 * @quoteChar: the quote character to escape (' or ") or zero.
1838 *
1839 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1840 * plus HTML entities block of chars out.
1841 *
1842 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1843 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001844 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001845 * The value of @outlen after return is the number of octets consumed.
1846 */
1847int
1848htmlEncodeEntities(unsigned char* out, int *outlen,
1849 const unsigned char* in, int *inlen, int quoteChar) {
1850 const unsigned char* processed = in;
1851 const unsigned char* outend = out + (*outlen);
1852 const unsigned char* outstart = out;
1853 const unsigned char* instart = in;
1854 const unsigned char* inend = in + (*inlen);
1855 unsigned int c, d;
1856 int trailing;
1857
1858 while (in < inend) {
1859 d = *in++;
1860 if (d < 0x80) { c= d; trailing= 0; }
1861 else if (d < 0xC0) {
1862 /* trailing byte in leading position */
1863 *outlen = out - outstart;
1864 *inlen = processed - instart;
1865 return(-2);
1866 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1867 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1868 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1869 else {
1870 /* no chance for this in Ascii */
1871 *outlen = out - outstart;
1872 *inlen = processed - instart;
1873 return(-2);
1874 }
1875
1876 if (inend - in < trailing)
1877 break;
1878
1879 while (trailing--) {
1880 if (((d= *in++) & 0xC0) != 0x80) {
1881 *outlen = out - outstart;
1882 *inlen = processed - instart;
1883 return(-2);
1884 }
1885 c <<= 6;
1886 c |= d & 0x3F;
1887 }
1888
1889 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001890 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1891 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001892 if (out >= outend)
1893 break;
1894 *out++ = c;
1895 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001896 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001897 const char *cp;
1898 char nbuf[16];
1899 int len;
1900
1901 /*
1902 * Try to lookup a predefined HTML entity for it
1903 */
1904 ent = htmlEntityValueLookup(c);
1905 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001906 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001907 cp = nbuf;
1908 }
1909 else
1910 cp = ent->name;
1911 len = strlen(cp);
1912 if (out + 2 + len > outend)
1913 break;
1914 *out++ = '&';
1915 memcpy(out, cp, len);
1916 out += len;
1917 *out++ = ';';
1918 }
1919 processed = in;
1920 }
1921 *outlen = out - outstart;
1922 *inlen = processed - instart;
1923 return(0);
1924}
1925
Owen Taylor3473f882001-02-23 17:55:21 +00001926/************************************************************************
1927 * *
1928 * Commodity functions to handle streams *
1929 * *
1930 ************************************************************************/
1931
1932/**
Owen Taylor3473f882001-02-23 17:55:21 +00001933 * htmlNewInputStream:
1934 * @ctxt: an HTML parser context
1935 *
1936 * Create a new input stream structure
1937 * Returns the new input stream or NULL
1938 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001939static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001940htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1941 htmlParserInputPtr input;
1942
1943 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1944 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001945 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001946 return(NULL);
1947 }
1948 memset(input, 0, sizeof(htmlParserInput));
1949 input->filename = NULL;
1950 input->directory = NULL;
1951 input->base = NULL;
1952 input->cur = NULL;
1953 input->buf = NULL;
1954 input->line = 1;
1955 input->col = 1;
1956 input->buf = NULL;
1957 input->free = NULL;
1958 input->version = NULL;
1959 input->consumed = 0;
1960 input->length = 0;
1961 return(input);
1962}
1963
1964
1965/************************************************************************
1966 * *
1967 * Commodity functions, cleanup needed ? *
1968 * *
1969 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 * Both the strings and the array of pointers are read-only.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001984
1985/**
1986 * areBlanks:
1987 * @ctxt: an HTML parser context
1988 * @str: a xmlChar *
1989 * @len: the size of @str
1990 *
1991 * Is this a sequence of blank chars that one can ignore ?
1992 *
1993 * Returns 1 if ignorable 0 otherwise.
1994 */
1995
1996static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001997 unsigned int i;
1998 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001999 xmlNodePtr lastChild;
2000
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002001 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002002 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002003
2004 if (CUR == 0) return(1);
2005 if (CUR != '<') return(0);
2006 if (ctxt->name == NULL)
2007 return(1);
2008 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2009 return(1);
2010 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2011 return(1);
2012 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2013 return(1);
2014 if (ctxt->node == NULL) return(0);
2015 lastChild = xmlGetLastChild(ctxt->node);
2016 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002017 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2018 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002019 /* keep ws in constructs like ...<b> </b>...
2020 for all tags "b" allowing PCDATA */
2021 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2022 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2023 return(0);
2024 }
2025 }
Owen Taylor3473f882001-02-23 17:55:21 +00002026 } else if (xmlNodeIsText(lastChild)) {
2027 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002028 } else {
2029 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2030 for all tags "p" allowing PCDATA */
2031 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2032 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2033 return(0);
2034 }
2035 }
Owen Taylor3473f882001-02-23 17:55:21 +00002036 }
2037 return(1);
2038}
2039
2040/**
Owen Taylor3473f882001-02-23 17:55:21 +00002041 * htmlNewDocNoDtD:
2042 * @URI: URI for the dtd, or NULL
2043 * @ExternalID: the external ID of the DTD, or NULL
2044 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002045 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2046 * are NULL
2047 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002048 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002049 */
2050htmlDocPtr
2051htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2052 xmlDocPtr cur;
2053
2054 /*
2055 * Allocate a new document and fill the fields.
2056 */
2057 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2058 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002059 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002060 return(NULL);
2061 }
2062 memset(cur, 0, sizeof(xmlDoc));
2063
2064 cur->type = XML_HTML_DOCUMENT_NODE;
2065 cur->version = NULL;
2066 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002067 cur->doc = cur;
2068 cur->name = NULL;
2069 cur->children = NULL;
2070 cur->extSubset = NULL;
2071 cur->oldNs = NULL;
2072 cur->encoding = NULL;
2073 cur->standalone = 1;
2074 cur->compression = 0;
2075 cur->ids = NULL;
2076 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002077 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002078 if ((ExternalID != NULL) ||
2079 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002080 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002081 return(cur);
2082}
2083
2084/**
2085 * htmlNewDoc:
2086 * @URI: URI for the dtd, or NULL
2087 * @ExternalID: the external ID of the DTD, or NULL
2088 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002089 * Creates a new HTML document
2090 *
Owen Taylor3473f882001-02-23 17:55:21 +00002091 * Returns a new document
2092 */
2093htmlDocPtr
2094htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2095 if ((URI == NULL) && (ExternalID == NULL))
2096 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002097 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2098 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002099
2100 return(htmlNewDocNoDtD(URI, ExternalID));
2101}
2102
2103
2104/************************************************************************
2105 * *
2106 * The parser itself *
2107 * Relates to http://www.w3.org/TR/html40 *
2108 * *
2109 ************************************************************************/
2110
2111/************************************************************************
2112 * *
2113 * The parser itself *
2114 * *
2115 ************************************************************************/
2116
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002117static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002118
Owen Taylor3473f882001-02-23 17:55:21 +00002119/**
2120 * htmlParseHTMLName:
2121 * @ctxt: an HTML parser context
2122 *
2123 * parse an HTML tag or attribute name, note that we convert it to lowercase
2124 * since HTML names are not case-sensitive.
2125 *
2126 * Returns the Tag Name parsed or NULL
2127 */
2128
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002129static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002130htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002131 int i = 0;
2132 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2133
William M. Brack76e95df2003-10-18 16:20:14 +00002134 if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002135 (CUR != ':')) return(NULL);
2136
2137 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brack76e95df2003-10-18 16:20:14 +00002138 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002139 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2140 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2141 else loc[i] = CUR;
2142 i++;
2143
2144 NEXT;
2145 }
2146
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002147 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002148}
2149
2150/**
2151 * htmlParseName:
2152 * @ctxt: an HTML parser context
2153 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002154 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002155 *
2156 * Returns the Name parsed or NULL
2157 */
2158
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002159static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002160htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002161 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002162 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002163 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002164
2165 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002166
2167 /*
2168 * Accelerator for simple ASCII names
2169 */
2170 in = ctxt->input->cur;
2171 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2172 ((*in >= 0x41) && (*in <= 0x5A)) ||
2173 (*in == '_') || (*in == ':')) {
2174 in++;
2175 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2176 ((*in >= 0x41) && (*in <= 0x5A)) ||
2177 ((*in >= 0x30) && (*in <= 0x39)) ||
2178 (*in == '_') || (*in == '-') ||
2179 (*in == ':') || (*in == '.'))
2180 in++;
2181 if ((*in > 0) && (*in < 0x80)) {
2182 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002183 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002184 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002185 ctxt->nbChars += count;
2186 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002187 return(ret);
2188 }
2189 }
2190 return(htmlParseNameComplex(ctxt));
2191}
2192
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002193static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002194htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002195 int len = 0, l;
2196 int c;
2197 int count = 0;
2198
2199 /*
2200 * Handler for more complex cases
2201 */
2202 GROW;
2203 c = CUR_CHAR(l);
2204 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2205 (!IS_LETTER(c) && (c != '_') &&
2206 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002207 return(NULL);
2208 }
2209
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002210 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2211 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2212 (c == '.') || (c == '-') ||
2213 (c == '_') || (c == ':') ||
2214 (IS_COMBINING(c)) ||
2215 (IS_EXTENDER(c)))) {
2216 if (count++ > 100) {
2217 count = 0;
2218 GROW;
2219 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002220 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002221 NEXTL(l);
2222 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002223 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002224 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002225}
2226
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002227
Owen Taylor3473f882001-02-23 17:55:21 +00002228/**
2229 * htmlParseHTMLAttribute:
2230 * @ctxt: an HTML parser context
2231 * @stop: a char stop value
2232 *
2233 * parse an HTML attribute value till the stop (quote), if
2234 * stop is 0 then it stops at the first space
2235 *
2236 * Returns the attribute parsed or NULL
2237 */
2238
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002239static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002240htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2241 xmlChar *buffer = NULL;
2242 int buffer_size = 0;
2243 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002244 const xmlChar *name = NULL;
2245 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002246 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002247
2248 /*
2249 * allocate a translation buffer.
2250 */
2251 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002252 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002253 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002254 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002255 return(NULL);
2256 }
2257 out = buffer;
2258
2259 /*
2260 * Ok loop until we reach one of the ending chars
2261 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002262 while ((CUR != 0) && (CUR != stop)) {
2263 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002264 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002265 if (CUR == '&') {
2266 if (NXT(1) == '#') {
2267 unsigned int c;
2268 int bits;
2269
2270 c = htmlParseCharRef(ctxt);
2271 if (c < 0x80)
2272 { *out++ = c; bits= -6; }
2273 else if (c < 0x800)
2274 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2275 else if (c < 0x10000)
2276 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2277 else
2278 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2279
2280 for ( ; bits >= 0; bits-= 6) {
2281 *out++ = ((c >> bits) & 0x3F) | 0x80;
2282 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002283
2284 if (out - buffer > buffer_size - 100) {
2285 int indx = out - buffer;
2286
2287 growBuffer(buffer);
2288 out = &buffer[indx];
2289 }
Owen Taylor3473f882001-02-23 17:55:21 +00002290 } else {
2291 ent = htmlParseEntityRef(ctxt, &name);
2292 if (name == NULL) {
2293 *out++ = '&';
2294 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002295 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002296
2297 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002298 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002299 }
2300 } else if (ent == NULL) {
2301 *out++ = '&';
2302 cur = name;
2303 while (*cur != 0) {
2304 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002305 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002306
2307 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002308 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002309 }
2310 *out++ = *cur++;
2311 }
Owen Taylor3473f882001-02-23 17:55:21 +00002312 } else {
2313 unsigned int c;
2314 int bits;
2315
2316 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002317 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002318
2319 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002320 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002321 }
2322 c = (xmlChar)ent->value;
2323 if (c < 0x80)
2324 { *out++ = c; bits= -6; }
2325 else if (c < 0x800)
2326 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2327 else if (c < 0x10000)
2328 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2329 else
2330 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2331
2332 for ( ; bits >= 0; bits-= 6) {
2333 *out++ = ((c >> bits) & 0x3F) | 0x80;
2334 }
Owen Taylor3473f882001-02-23 17:55:21 +00002335 }
2336 }
2337 } else {
2338 unsigned int c;
2339 int bits, l;
2340
2341 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002342 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002343
2344 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002345 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002346 }
2347 c = CUR_CHAR(l);
2348 if (c < 0x80)
2349 { *out++ = c; bits= -6; }
2350 else if (c < 0x800)
2351 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2352 else if (c < 0x10000)
2353 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2354 else
2355 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2356
2357 for ( ; bits >= 0; bits-= 6) {
2358 *out++ = ((c >> bits) & 0x3F) | 0x80;
2359 }
2360 NEXT;
2361 }
2362 }
2363 *out++ = 0;
2364 return(buffer);
2365}
2366
2367/**
Owen Taylor3473f882001-02-23 17:55:21 +00002368 * htmlParseEntityRef:
2369 * @ctxt: an HTML parser context
2370 * @str: location to store the entity name
2371 *
2372 * parse an HTML ENTITY references
2373 *
2374 * [68] EntityRef ::= '&' Name ';'
2375 *
2376 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2377 * if non-NULL *str will have to be freed by the caller.
2378 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002379const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002380htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2381 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002382 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002383 *str = NULL;
2384
2385 if (CUR == '&') {
2386 NEXT;
2387 name = htmlParseName(ctxt);
2388 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002389 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2390 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002391 } else {
2392 GROW;
2393 if (CUR == ';') {
2394 *str = name;
2395
2396 /*
2397 * Lookup the entity in the table.
2398 */
2399 ent = htmlEntityLookup(name);
2400 if (ent != NULL) /* OK that's ugly !!! */
2401 NEXT;
2402 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002403 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2404 "htmlParseEntityRef: expecting ';'\n",
2405 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002406 *str = name;
2407 }
2408 }
2409 }
2410 return(ent);
2411}
2412
2413/**
2414 * htmlParseAttValue:
2415 * @ctxt: an HTML parser context
2416 *
2417 * parse a value for an attribute
2418 * Note: the parser won't do substitution of entities here, this
2419 * will be handled later in xmlStringGetNodeList, unless it was
2420 * asked for ctxt->replaceEntities != 0
2421 *
2422 * Returns the AttValue parsed or NULL.
2423 */
2424
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002425static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002426htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2427 xmlChar *ret = NULL;
2428
2429 if (CUR == '"') {
2430 NEXT;
2431 ret = htmlParseHTMLAttribute(ctxt, '"');
2432 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002433 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2434 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002435 } else
2436 NEXT;
2437 } else if (CUR == '\'') {
2438 NEXT;
2439 ret = htmlParseHTMLAttribute(ctxt, '\'');
2440 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002441 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2442 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002443 } else
2444 NEXT;
2445 } else {
2446 /*
2447 * That's an HTMLism, the attribute value may not be quoted
2448 */
2449 ret = htmlParseHTMLAttribute(ctxt, 0);
2450 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002451 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2452 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002453 }
2454 }
2455 return(ret);
2456}
2457
2458/**
2459 * htmlParseSystemLiteral:
2460 * @ctxt: an HTML parser context
2461 *
2462 * parse an HTML Literal
2463 *
2464 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2465 *
2466 * Returns the SystemLiteral parsed or NULL
2467 */
2468
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002469static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002470htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2471 const xmlChar *q;
2472 xmlChar *ret = NULL;
2473
2474 if (CUR == '"') {
2475 NEXT;
2476 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002477 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002478 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002479 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002480 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2481 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002482 } else {
2483 ret = xmlStrndup(q, CUR_PTR - q);
2484 NEXT;
2485 }
2486 } else if (CUR == '\'') {
2487 NEXT;
2488 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002489 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002490 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002491 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002492 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2493 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002494 } else {
2495 ret = xmlStrndup(q, CUR_PTR - q);
2496 NEXT;
2497 }
2498 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002499 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2500 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002501 }
2502
2503 return(ret);
2504}
2505
2506/**
2507 * htmlParsePubidLiteral:
2508 * @ctxt: an HTML parser context
2509 *
2510 * parse an HTML public literal
2511 *
2512 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2513 *
2514 * Returns the PubidLiteral parsed or NULL.
2515 */
2516
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002517static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002518htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2519 const xmlChar *q;
2520 xmlChar *ret = NULL;
2521 /*
2522 * Name ::= (Letter | '_') (NameChar)*
2523 */
2524 if (CUR == '"') {
2525 NEXT;
2526 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002527 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002528 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002529 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2530 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002531 } else {
2532 ret = xmlStrndup(q, CUR_PTR - q);
2533 NEXT;
2534 }
2535 } else if (CUR == '\'') {
2536 NEXT;
2537 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002538 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002539 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002540 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002541 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2542 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002543 } else {
2544 ret = xmlStrndup(q, CUR_PTR - q);
2545 NEXT;
2546 }
2547 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002548 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2549 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002550 }
2551
2552 return(ret);
2553}
2554
2555/**
2556 * htmlParseScript:
2557 * @ctxt: an HTML parser context
2558 *
2559 * parse the content of an HTML SCRIPT or STYLE element
2560 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2561 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2562 * http://www.w3.org/TR/html4/types.html#type-script
2563 * http://www.w3.org/TR/html4/types.html#h-6.15
2564 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2565 *
2566 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2567 * element and the value of intrinsic event attributes. User agents must
2568 * not evaluate script data as HTML markup but instead must pass it on as
2569 * data to a script engine.
2570 * NOTES:
2571 * - The content is passed like CDATA
2572 * - the attributes for style and scripting "onXXX" are also described
2573 * as CDATA but SGML allows entities references in attributes so their
2574 * processing is identical as other attributes
2575 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002576static void
Owen Taylor3473f882001-02-23 17:55:21 +00002577htmlParseScript(htmlParserCtxtPtr ctxt) {
2578 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2579 int nbchar = 0;
2580 xmlChar cur;
2581
2582 SHRINK;
2583 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002584 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002585 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2586 (NXT(3) == '-')) {
2587 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2588 if (ctxt->sax->cdataBlock!= NULL) {
2589 /*
2590 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2591 */
2592 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002593 } else if (ctxt->sax->characters != NULL) {
2594 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002595 }
2596 }
2597 nbchar = 0;
2598 htmlParseComment(ctxt);
2599 cur = CUR;
2600 continue;
2601 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002602 /*
2603 * One should break here, the specification is clear:
2604 * Authors should therefore escape "</" within the content.
2605 * Escape mechanisms are specific to each scripting or
2606 * style sheet language.
2607 */
2608 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2609 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2610 break; /* while */
2611 }
2612 buf[nbchar++] = cur;
2613 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2614 if (ctxt->sax->cdataBlock!= NULL) {
2615 /*
2616 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2617 */
2618 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002619 } else if (ctxt->sax->characters != NULL) {
2620 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002621 }
2622 nbchar = 0;
2623 }
2624 NEXT;
2625 cur = CUR;
2626 }
William M. Brack76e95df2003-10-18 16:20:14 +00002627 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002628 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2629 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002630 NEXT;
2631 }
2632
2633 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2634 if (ctxt->sax->cdataBlock!= NULL) {
2635 /*
2636 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2637 */
2638 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002639 } else if (ctxt->sax->characters != NULL) {
2640 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002641 }
2642 }
2643}
2644
2645
2646/**
2647 * htmlParseCharData:
2648 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002649 *
2650 * parse a CharData section.
2651 * if we are within a CDATA section ']]>' marks an end of section.
2652 *
2653 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2654 */
2655
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002656static void
2657htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002658 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2659 int nbchar = 0;
2660 int cur, l;
2661
2662 SHRINK;
2663 cur = CUR_CHAR(l);
2664 while (((cur != '<') || (ctxt->token == '<')) &&
2665 ((cur != '&') || (ctxt->token == '&')) &&
2666 (IS_CHAR(cur))) {
2667 COPY_BUF(l,buf,nbchar,cur);
2668 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2669 /*
2670 * Ok the segment is to be consumed as chars.
2671 */
2672 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2673 if (areBlanks(ctxt, buf, nbchar)) {
2674 if (ctxt->sax->ignorableWhitespace != NULL)
2675 ctxt->sax->ignorableWhitespace(ctxt->userData,
2676 buf, nbchar);
2677 } else {
2678 htmlCheckParagraph(ctxt);
2679 if (ctxt->sax->characters != NULL)
2680 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2681 }
2682 }
2683 nbchar = 0;
2684 }
2685 NEXTL(l);
2686 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002687 if (cur == 0) {
2688 SHRINK;
2689 GROW;
2690 cur = CUR_CHAR(l);
2691 }
Owen Taylor3473f882001-02-23 17:55:21 +00002692 }
2693 if (nbchar != 0) {
2694 /*
2695 * Ok the segment is to be consumed as chars.
2696 */
2697 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2698 if (areBlanks(ctxt, buf, nbchar)) {
2699 if (ctxt->sax->ignorableWhitespace != NULL)
2700 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2701 } else {
2702 htmlCheckParagraph(ctxt);
2703 if (ctxt->sax->characters != NULL)
2704 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2705 }
2706 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002707 } else {
2708 /*
2709 * Loop detection
2710 */
2711 if (cur == 0)
2712 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002713 }
2714}
2715
2716/**
2717 * htmlParseExternalID:
2718 * @ctxt: an HTML parser context
2719 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002720 *
2721 * Parse an External ID or a Public ID
2722 *
Owen Taylor3473f882001-02-23 17:55:21 +00002723 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2724 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2725 *
2726 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2727 *
2728 * Returns the function returns SystemLiteral and in the second
2729 * case publicID receives PubidLiteral, is strict is off
2730 * it is possible to return NULL and have publicID set.
2731 */
2732
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002733static xmlChar *
2734htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002735 xmlChar *URI = NULL;
2736
2737 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2738 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2739 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2740 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002741 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002742 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2743 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002744 }
2745 SKIP_BLANKS;
2746 URI = htmlParseSystemLiteral(ctxt);
2747 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002748 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2749 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002750 }
2751 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2752 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2753 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2754 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002755 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002756 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2757 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002758 }
2759 SKIP_BLANKS;
2760 *publicID = htmlParsePubidLiteral(ctxt);
2761 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002762 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2763 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2764 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002765 }
2766 SKIP_BLANKS;
2767 if ((CUR == '"') || (CUR == '\'')) {
2768 URI = htmlParseSystemLiteral(ctxt);
2769 }
2770 }
2771 return(URI);
2772}
2773
2774/**
2775 * htmlParseComment:
2776 * @ctxt: an HTML parser context
2777 *
2778 * Parse an XML (SGML) comment <!-- .... -->
2779 *
2780 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2781 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002782static void
Owen Taylor3473f882001-02-23 17:55:21 +00002783htmlParseComment(htmlParserCtxtPtr ctxt) {
2784 xmlChar *buf = NULL;
2785 int len;
2786 int size = HTML_PARSER_BUFFER_SIZE;
2787 int q, ql;
2788 int r, rl;
2789 int cur, l;
2790 xmlParserInputState state;
2791
2792 /*
2793 * Check that there is a comment right here.
2794 */
2795 if ((RAW != '<') || (NXT(1) != '!') ||
2796 (NXT(2) != '-') || (NXT(3) != '-')) return;
2797
2798 state = ctxt->instate;
2799 ctxt->instate = XML_PARSER_COMMENT;
2800 SHRINK;
2801 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002802 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002803 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002804 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002805 ctxt->instate = state;
2806 return;
2807 }
2808 q = CUR_CHAR(ql);
2809 NEXTL(ql);
2810 r = CUR_CHAR(rl);
2811 NEXTL(rl);
2812 cur = CUR_CHAR(l);
2813 len = 0;
2814 while (IS_CHAR(cur) &&
2815 ((cur != '>') ||
2816 (r != '-') || (q != '-'))) {
2817 if (len + 5 >= size) {
2818 size *= 2;
2819 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2820 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002821 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002822 ctxt->instate = state;
2823 return;
2824 }
2825 }
2826 COPY_BUF(ql,buf,len,q);
2827 q = r;
2828 ql = rl;
2829 r = cur;
2830 rl = l;
2831 NEXTL(l);
2832 cur = CUR_CHAR(l);
2833 if (cur == 0) {
2834 SHRINK;
2835 GROW;
2836 cur = CUR_CHAR(l);
2837 }
2838 }
2839 buf[len] = 0;
2840 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002841 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2842 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002843 xmlFree(buf);
2844 } else {
2845 NEXT;
2846 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2847 (!ctxt->disableSAX))
2848 ctxt->sax->comment(ctxt->userData, buf);
2849 xmlFree(buf);
2850 }
2851 ctxt->instate = state;
2852}
2853
2854/**
2855 * htmlParseCharRef:
2856 * @ctxt: an HTML parser context
2857 *
2858 * parse Reference declarations
2859 *
2860 * [66] CharRef ::= '&#' [0-9]+ ';' |
2861 * '&#x' [0-9a-fA-F]+ ';'
2862 *
2863 * Returns the value parsed (as an int)
2864 */
2865int
2866htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2867 int val = 0;
2868
2869 if ((CUR == '&') && (NXT(1) == '#') &&
2870 (NXT(2) == 'x')) {
2871 SKIP(3);
2872 while (CUR != ';') {
2873 if ((CUR >= '0') && (CUR <= '9'))
2874 val = val * 16 + (CUR - '0');
2875 else if ((CUR >= 'a') && (CUR <= 'f'))
2876 val = val * 16 + (CUR - 'a') + 10;
2877 else if ((CUR >= 'A') && (CUR <= 'F'))
2878 val = val * 16 + (CUR - 'A') + 10;
2879 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002880 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2881 "htmlParseCharRef: invalid hexadecimal value\n",
2882 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002883 return(0);
2884 }
2885 NEXT;
2886 }
2887 if (CUR == ';')
2888 NEXT;
2889 } else if ((CUR == '&') && (NXT(1) == '#')) {
2890 SKIP(2);
2891 while (CUR != ';') {
2892 if ((CUR >= '0') && (CUR <= '9'))
2893 val = val * 10 + (CUR - '0');
2894 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002895 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2896 "htmlParseCharRef: invalid decimal value\n",
2897 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002898 return(0);
2899 }
2900 NEXT;
2901 }
2902 if (CUR == ';')
2903 NEXT;
2904 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002905 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2906 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002907 }
2908 /*
2909 * Check the value IS_CHAR ...
2910 */
2911 if (IS_CHAR(val)) {
2912 return(val);
2913 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002914 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2915 "htmlParseCharRef: invalid xmlChar value %d\n",
2916 val);
Owen Taylor3473f882001-02-23 17:55:21 +00002917 }
2918 return(0);
2919}
2920
2921
2922/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002923 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002924 * @ctxt: an HTML parser context
2925 *
2926 * parse a DOCTYPE declaration
2927 *
2928 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2929 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2930 */
2931
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002932static void
Owen Taylor3473f882001-02-23 17:55:21 +00002933htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002934 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00002935 xmlChar *ExternalID = NULL;
2936 xmlChar *URI = NULL;
2937
2938 /*
2939 * We know that '<!DOCTYPE' has been detected.
2940 */
2941 SKIP(9);
2942
2943 SKIP_BLANKS;
2944
2945 /*
2946 * Parse the DOCTYPE name.
2947 */
2948 name = htmlParseName(ctxt);
2949 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002950 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2951 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
2952 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002953 }
2954 /*
2955 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2956 */
2957
2958 SKIP_BLANKS;
2959
2960 /*
2961 * Check for SystemID and ExternalID
2962 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002963 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002964 SKIP_BLANKS;
2965
2966 /*
2967 * We should be at the end of the DOCTYPE declaration.
2968 */
2969 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002970 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
2971 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002972 /* We shouldn't try to resynchronize ... */
2973 }
2974 NEXT;
2975
2976 /*
2977 * Create or update the document accordingly to the DOCTYPE
2978 */
2979 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2980 (!ctxt->disableSAX))
2981 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2982
2983 /*
2984 * Cleanup, since we don't use all those identifiers
2985 */
2986 if (URI != NULL) xmlFree(URI);
2987 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002988}
2989
2990/**
2991 * htmlParseAttribute:
2992 * @ctxt: an HTML parser context
2993 * @value: a xmlChar ** used to store the value of the attribute
2994 *
2995 * parse an attribute
2996 *
2997 * [41] Attribute ::= Name Eq AttValue
2998 *
2999 * [25] Eq ::= S? '=' S?
3000 *
3001 * With namespace:
3002 *
3003 * [NS 11] Attribute ::= QName Eq AttValue
3004 *
3005 * Also the case QName == xmlns:??? is handled independently as a namespace
3006 * definition.
3007 *
3008 * Returns the attribute name, and the value in *value.
3009 */
3010
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003011static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003012htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003013 const xmlChar *name;
3014 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003015
3016 *value = NULL;
3017 name = htmlParseHTMLName(ctxt);
3018 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003019 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3020 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003021 return(NULL);
3022 }
3023
3024 /*
3025 * read the value
3026 */
3027 SKIP_BLANKS;
3028 if (CUR == '=') {
3029 NEXT;
3030 SKIP_BLANKS;
3031 val = htmlParseAttValue(ctxt);
3032 /******
3033 } else {
3034 * TODO : some attribute must have values, some may not
3035 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3036 ctxt->sax->warning(ctxt->userData,
3037 "No value for attribute %s\n", name); */
3038 }
3039
3040 *value = val;
3041 return(name);
3042}
3043
3044/**
3045 * htmlCheckEncoding:
3046 * @ctxt: an HTML parser context
3047 * @attvalue: the attribute value
3048 *
3049 * Checks an http-equiv attribute from a Meta tag to detect
3050 * the encoding
3051 * If a new encoding is detected the parser is switched to decode
3052 * it and pass UTF8
3053 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003054static void
Owen Taylor3473f882001-02-23 17:55:21 +00003055htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3056 const xmlChar *encoding;
3057
3058 if ((ctxt == NULL) || (attvalue == NULL))
3059 return;
3060
3061 /* do not change encoding */
3062 if (ctxt->input->encoding != NULL)
3063 return;
3064
3065 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3066 if (encoding != NULL) {
3067 encoding += 8;
3068 } else {
3069 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3070 if (encoding != NULL)
3071 encoding += 9;
3072 }
3073 if (encoding != NULL) {
3074 xmlCharEncoding enc;
3075 xmlCharEncodingHandlerPtr handler;
3076
3077 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3078
3079 if (ctxt->input->encoding != NULL)
3080 xmlFree((xmlChar *) ctxt->input->encoding);
3081 ctxt->input->encoding = xmlStrdup(encoding);
3082
3083 enc = xmlParseCharEncoding((const char *) encoding);
3084 /*
3085 * registered set of known encodings
3086 */
3087 if (enc != XML_CHAR_ENCODING_ERROR) {
3088 xmlSwitchEncoding(ctxt, enc);
3089 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3090 } else {
3091 /*
3092 * fallback for unknown encodings
3093 */
3094 handler = xmlFindCharEncodingHandler((const char *) encoding);
3095 if (handler != NULL) {
3096 xmlSwitchToEncoding(ctxt, handler);
3097 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3098 } else {
3099 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3100 }
3101 }
3102
3103 if ((ctxt->input->buf != NULL) &&
3104 (ctxt->input->buf->encoder != NULL) &&
3105 (ctxt->input->buf->raw != NULL) &&
3106 (ctxt->input->buf->buffer != NULL)) {
3107 int nbchars;
3108 int processed;
3109
3110 /*
3111 * convert as much as possible to the parser reading buffer.
3112 */
3113 processed = ctxt->input->cur - ctxt->input->base;
3114 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3115 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3116 ctxt->input->buf->buffer,
3117 ctxt->input->buf->raw);
3118 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003119 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3120 "htmlCheckEncoding: encoder error\n",
3121 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003122 }
3123 ctxt->input->base =
3124 ctxt->input->cur = ctxt->input->buf->buffer->content;
3125 }
3126 }
3127}
3128
3129/**
3130 * htmlCheckMeta:
3131 * @ctxt: an HTML parser context
3132 * @atts: the attributes values
3133 *
3134 * Checks an attributes from a Meta tag
3135 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003136static void
Owen Taylor3473f882001-02-23 17:55:21 +00003137htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3138 int i;
3139 const xmlChar *att, *value;
3140 int http = 0;
3141 const xmlChar *content = NULL;
3142
3143 if ((ctxt == NULL) || (atts == NULL))
3144 return;
3145
3146 i = 0;
3147 att = atts[i++];
3148 while (att != NULL) {
3149 value = atts[i++];
3150 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3151 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3152 http = 1;
3153 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3154 content = value;
3155 att = atts[i++];
3156 }
3157 if ((http) && (content != NULL))
3158 htmlCheckEncoding(ctxt, content);
3159
3160}
3161
3162/**
3163 * htmlParseStartTag:
3164 * @ctxt: an HTML parser context
3165 *
3166 * parse a start of tag either for rule element or
3167 * EmptyElement. In both case we don't parse the tag closing chars.
3168 *
3169 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3170 *
3171 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3172 *
3173 * With namespace:
3174 *
3175 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3176 *
3177 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3178 *
3179 */
3180
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003181static void
Owen Taylor3473f882001-02-23 17:55:21 +00003182htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003183 const xmlChar *name;
3184 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003185 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003186 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003187 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003188 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003189 int meta = 0;
3190 int i;
3191
3192 if (CUR != '<') return;
3193 NEXT;
3194
3195 GROW;
3196 name = htmlParseHTMLName(ctxt);
3197 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003198 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3199 "htmlParseStartTag: invalid element name\n",
3200 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003201 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003202 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003203 NEXT;
3204 return;
3205 }
3206 if (xmlStrEqual(name, BAD_CAST"meta"))
3207 meta = 1;
3208
3209 /*
3210 * Check for auto-closure of HTML elements.
3211 */
3212 htmlAutoClose(ctxt, name);
3213
3214 /*
3215 * Check for implied HTML elements.
3216 */
3217 htmlCheckImplied(ctxt, name);
3218
3219 /*
3220 * Avoid html at any level > 0, head at any level != 1
3221 * or any attempt to recurse body
3222 */
3223 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003224 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3225 "htmlParseStartTag: misplaced <html> tag\n",
3226 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003227 return;
3228 }
3229 if ((ctxt->nameNr != 1) &&
3230 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003231 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3232 "htmlParseStartTag: misplaced <head> tag\n",
3233 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003234 return;
3235 }
3236 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003237 int indx;
3238 for (indx = 0;indx < ctxt->nameNr;indx++) {
3239 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003240 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3241 "htmlParseStartTag: misplaced <body> tag\n",
3242 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003243 return;
3244 }
3245 }
3246 }
3247
3248 /*
3249 * Now parse the attributes, it ends up with the ending
3250 *
3251 * (S Attribute)* S?
3252 */
3253 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003254 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003255 (CUR != '>') &&
3256 ((CUR != '/') || (NXT(1) != '>'))) {
3257 long cons = ctxt->nbChars;
3258
3259 GROW;
3260 attname = htmlParseAttribute(ctxt, &attvalue);
3261 if (attname != NULL) {
3262
3263 /*
3264 * Well formedness requires at most one declaration of an attribute
3265 */
3266 for (i = 0; i < nbatts;i += 2) {
3267 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003268 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3269 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003270 if (attvalue != NULL)
3271 xmlFree(attvalue);
3272 goto failed;
3273 }
3274 }
3275
3276 /*
3277 * Add the pair to atts
3278 */
3279 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003280 maxatts = 22; /* allow for 10 attrs by default */
3281 atts = (const xmlChar **)
3282 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003283 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003284 htmlErrMemory(ctxt, NULL);
3285 if (attvalue != NULL)
3286 xmlFree(attvalue);
3287 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003288 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003289 ctxt->atts = atts;
3290 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003291 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003292 const xmlChar **n;
3293
Owen Taylor3473f882001-02-23 17:55:21 +00003294 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003295 n = (const xmlChar **) xmlRealloc((void *) atts,
3296 maxatts * sizeof(const xmlChar *));
3297 if (n == NULL) {
3298 htmlErrMemory(ctxt, NULL);
3299 if (attvalue != NULL)
3300 xmlFree(attvalue);
3301 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003302 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003303 atts = n;
3304 ctxt->atts = atts;
3305 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003306 }
3307 atts[nbatts++] = attname;
3308 atts[nbatts++] = attvalue;
3309 atts[nbatts] = NULL;
3310 atts[nbatts + 1] = NULL;
3311 }
3312 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003313 if (attvalue != NULL)
3314 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003315 /* Dump the bogus attribute string up to the next blank or
3316 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003317 while ((IS_CHAR_CH(CUR)) &&
3318 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003319 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003320 NEXT;
3321 }
3322
3323failed:
3324 SKIP_BLANKS;
3325 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003326 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3327 "htmlParseStartTag: problem parsing attributes\n",
3328 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003329 break;
3330 }
3331 }
3332
3333 /*
3334 * Handle specific association to the META tag
3335 */
3336 if (meta)
3337 htmlCheckMeta(ctxt, atts);
3338
3339 /*
3340 * SAX: Start of Element !
3341 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003342 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003343 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3344 if (nbatts != 0)
3345 ctxt->sax->startElement(ctxt->userData, name, atts);
3346 else
3347 ctxt->sax->startElement(ctxt->userData, name, NULL);
3348 }
Owen Taylor3473f882001-02-23 17:55:21 +00003349
3350 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003351 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003352 if (atts[i] != NULL)
3353 xmlFree((xmlChar *) atts[i]);
3354 }
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
Owen Taylor3473f882001-02-23 17:55:21 +00003356}
3357
3358/**
3359 * htmlParseEndTag:
3360 * @ctxt: an HTML parser context
3361 *
3362 * parse an end of tag
3363 *
3364 * [42] ETag ::= '</' Name S? '>'
3365 *
3366 * With namespace
3367 *
3368 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003369 *
3370 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003371 */
3372
Daniel Veillardf420ac52001-07-04 16:04:09 +00003373static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003374htmlParseEndTag(htmlParserCtxtPtr ctxt)
3375{
3376 const xmlChar *name;
3377 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003378 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003379
3380 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003381 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3382 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003383 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003384 }
3385 SKIP(2);
3386
3387 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003388 if (name == NULL)
3389 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003390
3391 /*
3392 * We should definitely be at the ending "S? '>'" part
3393 */
3394 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003395 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003396 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3397 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003398 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003399 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003400
3401 /*
3402 * If the name read is not one of the element in the parsing stack
3403 * then return, it's just an error.
3404 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003405 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3406 if (xmlStrEqual(name, ctxt->nameTab[i]))
3407 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003408 }
3409 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003410 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3411 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003412 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003413 }
3414
3415
3416 /*
3417 * Check for auto-closure of HTML elements.
3418 */
3419
3420 htmlAutoCloseOnClose(ctxt, name);
3421
3422 /*
3423 * Well formedness constraints, opening and closing must match.
3424 * With the exception that the autoclose may have popped stuff out
3425 * of the stack.
3426 */
3427 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003428 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003429 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3430 "Opening and ending tag mismatch: %s and %s\n",
3431 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003432 }
3433 }
3434
3435 /*
3436 * SAX: End of Tag
3437 */
3438 oldname = ctxt->name;
3439 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003440 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3441 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003442 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003443 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003444 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003445 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003446 }
3447
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003448 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003449}
3450
3451
3452/**
3453 * htmlParseReference:
3454 * @ctxt: an HTML parser context
3455 *
3456 * parse and handle entity references in content,
3457 * this will end-up in a call to character() since this is either a
3458 * CharRef, or a predefined entity.
3459 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003460static void
Owen Taylor3473f882001-02-23 17:55:21 +00003461htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003462 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003463 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003464 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003465 if (CUR != '&') return;
3466
3467 if (NXT(1) == '#') {
3468 unsigned int c;
3469 int bits, i = 0;
3470
3471 c = htmlParseCharRef(ctxt);
3472 if (c == 0)
3473 return;
3474
3475 if (c < 0x80) { out[i++]= c; bits= -6; }
3476 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3477 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3478 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3479
3480 for ( ; bits >= 0; bits-= 6) {
3481 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3482 }
3483 out[i] = 0;
3484
3485 htmlCheckParagraph(ctxt);
3486 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3487 ctxt->sax->characters(ctxt->userData, out, i);
3488 } else {
3489 ent = htmlParseEntityRef(ctxt, &name);
3490 if (name == NULL) {
3491 htmlCheckParagraph(ctxt);
3492 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3493 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3494 return;
3495 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003496 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003497 htmlCheckParagraph(ctxt);
3498 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3499 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3500 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3501 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3502 }
3503 } else {
3504 unsigned int c;
3505 int bits, i = 0;
3506
3507 c = ent->value;
3508 if (c < 0x80)
3509 { out[i++]= c; bits= -6; }
3510 else if (c < 0x800)
3511 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3512 else if (c < 0x10000)
3513 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3514 else
3515 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3516
3517 for ( ; bits >= 0; bits-= 6) {
3518 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3519 }
3520 out[i] = 0;
3521
3522 htmlCheckParagraph(ctxt);
3523 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3524 ctxt->sax->characters(ctxt->userData, out, i);
3525 }
Owen Taylor3473f882001-02-23 17:55:21 +00003526 }
3527}
3528
3529/**
3530 * htmlParseContent:
3531 * @ctxt: an HTML parser context
3532 * @name: the node name
3533 *
3534 * Parse a content: comment, sub-element, reference or text.
3535 *
3536 */
3537
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003538static void
Owen Taylor3473f882001-02-23 17:55:21 +00003539htmlParseContent(htmlParserCtxtPtr ctxt) {
3540 xmlChar *currentNode;
3541 int depth;
3542
3543 currentNode = xmlStrdup(ctxt->name);
3544 depth = ctxt->nameNr;
3545 while (1) {
3546 long cons = ctxt->nbChars;
3547
3548 GROW;
3549 /*
3550 * Our tag or one of it's parent or children is ending.
3551 */
3552 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003553 if (htmlParseEndTag(ctxt) &&
3554 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3555 if (currentNode != NULL)
3556 xmlFree(currentNode);
3557 return;
3558 }
3559 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003560 }
3561
3562 /*
3563 * Has this node been popped out during parsing of
3564 * the next element
3565 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003566 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3567 (!xmlStrEqual(currentNode, ctxt->name)))
3568 {
Owen Taylor3473f882001-02-23 17:55:21 +00003569 if (currentNode != NULL) xmlFree(currentNode);
3570 return;
3571 }
3572
Daniel Veillardf9533d12001-03-03 10:04:57 +00003573 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3574 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003575 /*
3576 * Handle SCRIPT/STYLE separately
3577 */
3578 htmlParseScript(ctxt);
3579 } else {
3580 /*
3581 * Sometimes DOCTYPE arrives in the middle of the document
3582 */
3583 if ((CUR == '<') && (NXT(1) == '!') &&
3584 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3585 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3586 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3587 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003588 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3589 "Misplaced DOCTYPE declaration\n",
3590 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003591 htmlParseDocTypeDecl(ctxt);
3592 }
3593
3594 /*
3595 * First case : a comment
3596 */
3597 if ((CUR == '<') && (NXT(1) == '!') &&
3598 (NXT(2) == '-') && (NXT(3) == '-')) {
3599 htmlParseComment(ctxt);
3600 }
3601
3602 /*
3603 * Second case : a sub-element.
3604 */
3605 else if (CUR == '<') {
3606 htmlParseElement(ctxt);
3607 }
3608
3609 /*
3610 * Third case : a reference. If if has not been resolved,
3611 * parsing returns it's Name, create the node
3612 */
3613 else if (CUR == '&') {
3614 htmlParseReference(ctxt);
3615 }
3616
3617 /*
3618 * Fourth : end of the resource
3619 */
3620 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003621 htmlAutoCloseOnEnd(ctxt);
3622 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003623 }
3624
3625 /*
3626 * Last case, text. Note that References are handled directly.
3627 */
3628 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003629 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003630 }
3631
3632 if (cons == ctxt->nbChars) {
3633 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003634 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3635 "detected an error in element content\n",
3636 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003637 }
3638 break;
3639 }
3640 }
3641 GROW;
3642 }
3643 if (currentNode != NULL) xmlFree(currentNode);
3644}
3645
3646/**
3647 * htmlParseElement:
3648 * @ctxt: an HTML parser context
3649 *
3650 * parse an HTML element, this is highly recursive
3651 *
3652 * [39] element ::= EmptyElemTag | STag content ETag
3653 *
3654 * [41] Attribute ::= Name Eq AttValue
3655 */
3656
3657void
3658htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003659 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003660 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003661 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003662 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003663 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003664 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003665 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003666
3667 /* Capture start position */
3668 if (ctxt->record_info) {
3669 node_info.begin_pos = ctxt->input->consumed +
3670 (CUR_PTR - ctxt->input->base);
3671 node_info.begin_line = ctxt->input->line;
3672 }
3673
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003674 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003675 htmlParseStartTag(ctxt);
3676 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003677 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3678 (name == NULL)) {
3679 if (CUR == '>')
3680 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003681 return;
3682 }
Owen Taylor3473f882001-02-23 17:55:21 +00003683
3684 /*
3685 * Lookup the info for that element.
3686 */
3687 info = htmlTagLookup(name);
3688 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003689 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3690 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003691 }
3692
3693 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003694 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003695 */
3696 if ((CUR == '/') && (NXT(1) == '>')) {
3697 SKIP(2);
3698 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3699 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003700 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003701 return;
3702 }
3703
3704 if (CUR == '>') {
3705 NEXT;
3706 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003707 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3708 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003709
3710 /*
3711 * end of parsing of this node.
3712 */
3713 if (xmlStrEqual(name, ctxt->name)) {
3714 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003715 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003716 }
3717
3718 /*
3719 * Capture end position and add node
3720 */
3721 if ( currentNode != NULL && ctxt->record_info ) {
3722 node_info.end_pos = ctxt->input->consumed +
3723 (CUR_PTR - ctxt->input->base);
3724 node_info.end_line = ctxt->input->line;
3725 node_info.node = ctxt->node;
3726 xmlParserAddNodeInfo(ctxt, &node_info);
3727 }
3728 return;
3729 }
3730
3731 /*
3732 * Check for an Empty Element from DTD definition
3733 */
3734 if ((info != NULL) && (info->empty)) {
3735 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3736 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003737 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003738 return;
3739 }
3740
3741 /*
3742 * Parse the content of the element:
3743 */
3744 currentNode = xmlStrdup(ctxt->name);
3745 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003746 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003747 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003748 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003749 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003750 if (ctxt->nameNr < depth) break;
3751 }
3752
Owen Taylor3473f882001-02-23 17:55:21 +00003753 /*
3754 * Capture end position and add node
3755 */
3756 if ( currentNode != NULL && ctxt->record_info ) {
3757 node_info.end_pos = ctxt->input->consumed +
3758 (CUR_PTR - ctxt->input->base);
3759 node_info.end_line = ctxt->input->line;
3760 node_info.node = ctxt->node;
3761 xmlParserAddNodeInfo(ctxt, &node_info);
3762 }
William M. Brack76e95df2003-10-18 16:20:14 +00003763 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003764 htmlAutoCloseOnEnd(ctxt);
3765 }
3766
Owen Taylor3473f882001-02-23 17:55:21 +00003767 if (currentNode != NULL)
3768 xmlFree(currentNode);
3769}
3770
3771/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003772 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003773 * @ctxt: an HTML parser context
3774 *
3775 * parse an HTML document (and build a tree if using the standard SAX
3776 * interface).
3777 *
3778 * Returns 0, -1 in case of error. the parser context is augmented
3779 * as a result of the parsing.
3780 */
3781
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003782int
Owen Taylor3473f882001-02-23 17:55:21 +00003783htmlParseDocument(htmlParserCtxtPtr ctxt) {
3784 xmlDtdPtr dtd;
3785
Daniel Veillardd0463562001-10-13 09:15:48 +00003786 xmlInitParser();
3787
Owen Taylor3473f882001-02-23 17:55:21 +00003788 htmlDefaultSAXHandlerInit();
3789 ctxt->html = 1;
3790
3791 GROW;
3792 /*
3793 * SAX: beginning of the document processing.
3794 */
3795 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3796 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3797
3798 /*
3799 * Wipe out everything which is before the first '<'
3800 */
3801 SKIP_BLANKS;
3802 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003803 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3804 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003805 }
3806
3807 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3808 ctxt->sax->startDocument(ctxt->userData);
3809
3810
3811 /*
3812 * Parse possible comments before any content
3813 */
3814 while ((CUR == '<') && (NXT(1) == '!') &&
3815 (NXT(2) == '-') && (NXT(3) == '-')) {
3816 htmlParseComment(ctxt);
3817 SKIP_BLANKS;
3818 }
3819
3820
3821 /*
3822 * Then possibly doc type declaration(s) and more Misc
3823 * (doctypedecl Misc*)?
3824 */
3825 if ((CUR == '<') && (NXT(1) == '!') &&
3826 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3827 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3828 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3829 (UPP(8) == 'E')) {
3830 htmlParseDocTypeDecl(ctxt);
3831 }
3832 SKIP_BLANKS;
3833
3834 /*
3835 * Parse possible comments before any content
3836 */
3837 while ((CUR == '<') && (NXT(1) == '!') &&
3838 (NXT(2) == '-') && (NXT(3) == '-')) {
3839 htmlParseComment(ctxt);
3840 SKIP_BLANKS;
3841 }
3842
3843 /*
3844 * Time to start parsing the tree itself
3845 */
3846 htmlParseContent(ctxt);
3847
3848 /*
3849 * autoclose
3850 */
3851 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003852 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003853
3854
3855 /*
3856 * SAX: end of the document processing.
3857 */
3858 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3859 ctxt->sax->endDocument(ctxt->userData);
3860
3861 if (ctxt->myDoc != NULL) {
3862 dtd = xmlGetIntSubset(ctxt->myDoc);
3863 if (dtd == NULL)
3864 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00003865 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00003866 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3867 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3868 }
3869 if (! ctxt->wellFormed) return(-1);
3870 return(0);
3871}
3872
3873
3874/************************************************************************
3875 * *
3876 * Parser contexts handling *
3877 * *
3878 ************************************************************************/
3879
3880/**
3881 * xmlInitParserCtxt:
3882 * @ctxt: an HTML parser context
3883 *
3884 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00003885 *
3886 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00003887 */
3888
Daniel Veillardf403d292003-10-05 13:51:35 +00003889static int
Owen Taylor3473f882001-02-23 17:55:21 +00003890htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3891{
3892 htmlSAXHandler *sax;
3893
Daniel Veillardf403d292003-10-05 13:51:35 +00003894 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003895 memset(ctxt, 0, sizeof(htmlParserCtxt));
3896
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003897 ctxt->dict = xmlDictCreate();
3898 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003899 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3900 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003901 }
Owen Taylor3473f882001-02-23 17:55:21 +00003902 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3903 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003904 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3905 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003906 }
3907 else
3908 memset(sax, 0, sizeof(htmlSAXHandler));
3909
3910 /* Allocate the Input stack */
3911 ctxt->inputTab = (htmlParserInputPtr *)
3912 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3913 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003914 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003915 ctxt->inputNr = 0;
3916 ctxt->inputMax = 0;
3917 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003918 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003919 }
3920 ctxt->inputNr = 0;
3921 ctxt->inputMax = 5;
3922 ctxt->input = NULL;
3923 ctxt->version = NULL;
3924 ctxt->encoding = NULL;
3925 ctxt->standalone = -1;
3926 ctxt->instate = XML_PARSER_START;
3927
3928 /* Allocate the Node stack */
3929 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3930 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003931 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003932 ctxt->nodeNr = 0;
3933 ctxt->nodeMax = 0;
3934 ctxt->node = NULL;
3935 ctxt->inputNr = 0;
3936 ctxt->inputMax = 0;
3937 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003938 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003939 }
3940 ctxt->nodeNr = 0;
3941 ctxt->nodeMax = 10;
3942 ctxt->node = NULL;
3943
3944 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003945 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003946 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003947 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003948 ctxt->nameNr = 0;
3949 ctxt->nameMax = 10;
3950 ctxt->name = NULL;
3951 ctxt->nodeNr = 0;
3952 ctxt->nodeMax = 0;
3953 ctxt->node = NULL;
3954 ctxt->inputNr = 0;
3955 ctxt->inputMax = 0;
3956 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003957 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003958 }
3959 ctxt->nameNr = 0;
3960 ctxt->nameMax = 10;
3961 ctxt->name = NULL;
3962
Daniel Veillard092643b2003-09-25 14:29:29 +00003963 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00003964 else {
3965 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00003966 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00003967 }
3968 ctxt->userData = ctxt;
3969 ctxt->myDoc = NULL;
3970 ctxt->wellFormed = 1;
3971 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003972 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003973 ctxt->html = 1;
3974 ctxt->record_info = 0;
3975 ctxt->validate = 0;
3976 ctxt->nbChars = 0;
3977 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003978 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003979 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00003980 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003981}
3982
3983/**
3984 * htmlFreeParserCtxt:
3985 * @ctxt: an HTML parser context
3986 *
3987 * Free all the memory used by a parser context. However the parsed
3988 * document in ctxt->myDoc is not freed.
3989 */
3990
3991void
3992htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3993{
3994 xmlFreeParserCtxt(ctxt);
3995}
3996
3997/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003998 * htmlNewParserCtxt:
3999 *
4000 * Allocate and initialize a new parser context.
4001 *
4002 * Returns the xmlParserCtxtPtr or NULL
4003 */
4004
4005static htmlParserCtxtPtr
4006htmlNewParserCtxt(void)
4007{
4008 xmlParserCtxtPtr ctxt;
4009
4010 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4011 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004012 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004013 return(NULL);
4014 }
4015 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004016 if (htmlInitParserCtxt(ctxt) < 0) {
4017 htmlFreeParserCtxt(ctxt);
4018 return(NULL);
4019 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004020 return(ctxt);
4021}
4022
4023/**
4024 * htmlCreateMemoryParserCtxt:
4025 * @buffer: a pointer to a char array
4026 * @size: the size of the array
4027 *
4028 * Create a parser context for an HTML in-memory document.
4029 *
4030 * Returns the new parser context or NULL
4031 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004032htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004033htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4034 xmlParserCtxtPtr ctxt;
4035 xmlParserInputPtr input;
4036 xmlParserInputBufferPtr buf;
4037
4038 if (buffer == NULL)
4039 return(NULL);
4040 if (size <= 0)
4041 return(NULL);
4042
4043 ctxt = htmlNewParserCtxt();
4044 if (ctxt == NULL)
4045 return(NULL);
4046
4047 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4048 if (buf == NULL) return(NULL);
4049
4050 input = xmlNewInputStream(ctxt);
4051 if (input == NULL) {
4052 xmlFreeParserCtxt(ctxt);
4053 return(NULL);
4054 }
4055
4056 input->filename = NULL;
4057 input->buf = buf;
4058 input->base = input->buf->buffer->content;
4059 input->cur = input->buf->buffer->content;
4060 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4061
4062 inputPush(ctxt, input);
4063 return(ctxt);
4064}
4065
4066/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004067 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004068 * @cur: a pointer to an array of xmlChar
4069 * @encoding: a free form C string describing the HTML document encoding, or NULL
4070 *
4071 * Create a parser context for an HTML document.
4072 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004073 * TODO: check the need to add encoding handling there
4074 *
Owen Taylor3473f882001-02-23 17:55:21 +00004075 * Returns the new parser context or NULL
4076 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004077static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004078htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004079 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004080 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004081
Daniel Veillard1d995272002-07-22 16:43:32 +00004082 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004083 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004084 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004085 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4086
4087 if (encoding != NULL) {
4088 xmlCharEncoding enc;
4089 xmlCharEncodingHandlerPtr handler;
4090
4091 if (ctxt->input->encoding != NULL)
4092 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004093 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004094
4095 enc = xmlParseCharEncoding(encoding);
4096 /*
4097 * registered set of known encodings
4098 */
4099 if (enc != XML_CHAR_ENCODING_ERROR) {
4100 xmlSwitchEncoding(ctxt, enc);
4101 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004102 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4103 "Unsupported encoding %s\n",
4104 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004105 }
4106 } else {
4107 /*
4108 * fallback for unknown encodings
4109 */
4110 handler = xmlFindCharEncodingHandler((const char *) encoding);
4111 if (handler != NULL) {
4112 xmlSwitchToEncoding(ctxt, handler);
4113 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004114 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4115 "Unsupported encoding %s\n",
4116 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004117 }
4118 }
4119 }
4120 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004121}
4122
Daniel Veillard73b013f2003-09-30 12:36:01 +00004123#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004124/************************************************************************
4125 * *
4126 * Progressive parsing interfaces *
4127 * *
4128 ************************************************************************/
4129
4130/**
4131 * htmlParseLookupSequence:
4132 * @ctxt: an HTML parser context
4133 * @first: the first char to lookup
4134 * @next: the next char to lookup or zero
4135 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004136 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004137 *
4138 * Try to find if a sequence (first, next, third) or just (first next) or
4139 * (first) is available in the input stream.
4140 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4141 * to avoid rescanning sequences of bytes, it DOES change the state of the
4142 * parser, do not use liberally.
4143 * This is basically similar to xmlParseLookupSequence()
4144 *
4145 * Returns the index to the current parsing point if the full sequence
4146 * is available, -1 otherwise.
4147 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004148static int
Owen Taylor3473f882001-02-23 17:55:21 +00004149htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004150 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004151 int base, len;
4152 htmlParserInputPtr in;
4153 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004154 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004155
4156 in = ctxt->input;
4157 if (in == NULL) return(-1);
4158 base = in->cur - in->base;
4159 if (base < 0) return(-1);
4160 if (ctxt->checkIndex > base)
4161 base = ctxt->checkIndex;
4162 if (in->buf == NULL) {
4163 buf = in->base;
4164 len = in->length;
4165 } else {
4166 buf = in->buf->buffer->content;
4167 len = in->buf->buffer->use;
4168 }
4169 /* take into account the sequence length */
4170 if (third) len -= 2;
4171 else if (next) len --;
4172 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004173 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004174 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4175 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4176 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004177 /* do not increment past <! - some people use <!--> */
4178 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004179 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004180 }
4181 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004182 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004183 return(-1);
4184 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4185 (buf[base + 2] == '>')) {
4186 incomment = 0;
4187 base += 2;
4188 }
4189 continue;
4190 }
Owen Taylor3473f882001-02-23 17:55:21 +00004191 if (buf[base] == first) {
4192 if (third != 0) {
4193 if ((buf[base + 1] != next) ||
4194 (buf[base + 2] != third)) continue;
4195 } else if (next != 0) {
4196 if (buf[base + 1] != next) continue;
4197 }
4198 ctxt->checkIndex = 0;
4199#ifdef DEBUG_PUSH
4200 if (next == 0)
4201 xmlGenericError(xmlGenericErrorContext,
4202 "HPP: lookup '%c' found at %d\n",
4203 first, base);
4204 else if (third == 0)
4205 xmlGenericError(xmlGenericErrorContext,
4206 "HPP: lookup '%c%c' found at %d\n",
4207 first, next, base);
4208 else
4209 xmlGenericError(xmlGenericErrorContext,
4210 "HPP: lookup '%c%c%c' found at %d\n",
4211 first, next, third, base);
4212#endif
4213 return(base - (in->cur - in->base));
4214 }
4215 }
4216 ctxt->checkIndex = base;
4217#ifdef DEBUG_PUSH
4218 if (next == 0)
4219 xmlGenericError(xmlGenericErrorContext,
4220 "HPP: lookup '%c' failed\n", first);
4221 else if (third == 0)
4222 xmlGenericError(xmlGenericErrorContext,
4223 "HPP: lookup '%c%c' failed\n", first, next);
4224 else
4225 xmlGenericError(xmlGenericErrorContext,
4226 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4227#endif
4228 return(-1);
4229}
4230
4231/**
4232 * htmlParseTryOrFinish:
4233 * @ctxt: an HTML parser context
4234 * @terminate: last chunk indicator
4235 *
4236 * Try to progress on parsing
4237 *
4238 * Returns zero if no parsing was possible
4239 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004240static int
Owen Taylor3473f882001-02-23 17:55:21 +00004241htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4242 int ret = 0;
4243 htmlParserInputPtr in;
4244 int avail = 0;
4245 xmlChar cur, next;
4246
4247#ifdef DEBUG_PUSH
4248 switch (ctxt->instate) {
4249 case XML_PARSER_EOF:
4250 xmlGenericError(xmlGenericErrorContext,
4251 "HPP: try EOF\n"); break;
4252 case XML_PARSER_START:
4253 xmlGenericError(xmlGenericErrorContext,
4254 "HPP: try START\n"); break;
4255 case XML_PARSER_MISC:
4256 xmlGenericError(xmlGenericErrorContext,
4257 "HPP: try MISC\n");break;
4258 case XML_PARSER_COMMENT:
4259 xmlGenericError(xmlGenericErrorContext,
4260 "HPP: try COMMENT\n");break;
4261 case XML_PARSER_PROLOG:
4262 xmlGenericError(xmlGenericErrorContext,
4263 "HPP: try PROLOG\n");break;
4264 case XML_PARSER_START_TAG:
4265 xmlGenericError(xmlGenericErrorContext,
4266 "HPP: try START_TAG\n");break;
4267 case XML_PARSER_CONTENT:
4268 xmlGenericError(xmlGenericErrorContext,
4269 "HPP: try CONTENT\n");break;
4270 case XML_PARSER_CDATA_SECTION:
4271 xmlGenericError(xmlGenericErrorContext,
4272 "HPP: try CDATA_SECTION\n");break;
4273 case XML_PARSER_END_TAG:
4274 xmlGenericError(xmlGenericErrorContext,
4275 "HPP: try END_TAG\n");break;
4276 case XML_PARSER_ENTITY_DECL:
4277 xmlGenericError(xmlGenericErrorContext,
4278 "HPP: try ENTITY_DECL\n");break;
4279 case XML_PARSER_ENTITY_VALUE:
4280 xmlGenericError(xmlGenericErrorContext,
4281 "HPP: try ENTITY_VALUE\n");break;
4282 case XML_PARSER_ATTRIBUTE_VALUE:
4283 xmlGenericError(xmlGenericErrorContext,
4284 "HPP: try ATTRIBUTE_VALUE\n");break;
4285 case XML_PARSER_DTD:
4286 xmlGenericError(xmlGenericErrorContext,
4287 "HPP: try DTD\n");break;
4288 case XML_PARSER_EPILOG:
4289 xmlGenericError(xmlGenericErrorContext,
4290 "HPP: try EPILOG\n");break;
4291 case XML_PARSER_PI:
4292 xmlGenericError(xmlGenericErrorContext,
4293 "HPP: try PI\n");break;
4294 case XML_PARSER_SYSTEM_LITERAL:
4295 xmlGenericError(xmlGenericErrorContext,
4296 "HPP: try SYSTEM_LITERAL\n");break;
4297 }
4298#endif
4299
4300 while (1) {
4301
4302 in = ctxt->input;
4303 if (in == NULL) break;
4304 if (in->buf == NULL)
4305 avail = in->length - (in->cur - in->base);
4306 else
4307 avail = in->buf->buffer->use - (in->cur - in->base);
4308 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004309 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004310 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4311 /*
4312 * SAX: end of the document processing.
4313 */
4314 ctxt->instate = XML_PARSER_EOF;
4315 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4316 ctxt->sax->endDocument(ctxt->userData);
4317 }
4318 }
4319 if (avail < 1)
4320 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004321 cur = in->cur[0];
4322 if (cur == 0) {
4323 SKIP(1);
4324 continue;
4325 }
4326
Owen Taylor3473f882001-02-23 17:55:21 +00004327 switch (ctxt->instate) {
4328 case XML_PARSER_EOF:
4329 /*
4330 * Document parsing is done !
4331 */
4332 goto done;
4333 case XML_PARSER_START:
4334 /*
4335 * Very first chars read from the document flow.
4336 */
4337 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004338 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004339 SKIP_BLANKS;
4340 if (in->buf == NULL)
4341 avail = in->length - (in->cur - in->base);
4342 else
4343 avail = in->buf->buffer->use - (in->cur - in->base);
4344 }
4345 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4346 ctxt->sax->setDocumentLocator(ctxt->userData,
4347 &xmlDefaultSAXLocator);
4348 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4349 (!ctxt->disableSAX))
4350 ctxt->sax->startDocument(ctxt->userData);
4351
4352 cur = in->cur[0];
4353 next = in->cur[1];
4354 if ((cur == '<') && (next == '!') &&
4355 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4356 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4357 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4358 (UPP(8) == 'E')) {
4359 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004360 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004361 goto done;
4362#ifdef DEBUG_PUSH
4363 xmlGenericError(xmlGenericErrorContext,
4364 "HPP: Parsing internal subset\n");
4365#endif
4366 htmlParseDocTypeDecl(ctxt);
4367 ctxt->instate = XML_PARSER_PROLOG;
4368#ifdef DEBUG_PUSH
4369 xmlGenericError(xmlGenericErrorContext,
4370 "HPP: entering PROLOG\n");
4371#endif
4372 } else {
4373 ctxt->instate = XML_PARSER_MISC;
4374 }
4375#ifdef DEBUG_PUSH
4376 xmlGenericError(xmlGenericErrorContext,
4377 "HPP: entering MISC\n");
4378#endif
4379 break;
4380 case XML_PARSER_MISC:
4381 SKIP_BLANKS;
4382 if (in->buf == NULL)
4383 avail = in->length - (in->cur - in->base);
4384 else
4385 avail = in->buf->buffer->use - (in->cur - in->base);
4386 if (avail < 2)
4387 goto done;
4388 cur = in->cur[0];
4389 next = in->cur[1];
4390 if ((cur == '<') && (next == '!') &&
4391 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4392 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004393 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004394 goto done;
4395#ifdef DEBUG_PUSH
4396 xmlGenericError(xmlGenericErrorContext,
4397 "HPP: Parsing Comment\n");
4398#endif
4399 htmlParseComment(ctxt);
4400 ctxt->instate = XML_PARSER_MISC;
4401 } else if ((cur == '<') && (next == '!') &&
4402 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4403 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4404 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4405 (UPP(8) == 'E')) {
4406 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004407 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004408 goto done;
4409#ifdef DEBUG_PUSH
4410 xmlGenericError(xmlGenericErrorContext,
4411 "HPP: Parsing internal subset\n");
4412#endif
4413 htmlParseDocTypeDecl(ctxt);
4414 ctxt->instate = XML_PARSER_PROLOG;
4415#ifdef DEBUG_PUSH
4416 xmlGenericError(xmlGenericErrorContext,
4417 "HPP: entering PROLOG\n");
4418#endif
4419 } else if ((cur == '<') && (next == '!') &&
4420 (avail < 9)) {
4421 goto done;
4422 } else {
4423 ctxt->instate = XML_PARSER_START_TAG;
4424#ifdef DEBUG_PUSH
4425 xmlGenericError(xmlGenericErrorContext,
4426 "HPP: entering START_TAG\n");
4427#endif
4428 }
4429 break;
4430 case XML_PARSER_PROLOG:
4431 SKIP_BLANKS;
4432 if (in->buf == NULL)
4433 avail = in->length - (in->cur - in->base);
4434 else
4435 avail = in->buf->buffer->use - (in->cur - in->base);
4436 if (avail < 2)
4437 goto done;
4438 cur = in->cur[0];
4439 next = in->cur[1];
4440 if ((cur == '<') && (next == '!') &&
4441 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4442 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004443 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004444 goto done;
4445#ifdef DEBUG_PUSH
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: Parsing Comment\n");
4448#endif
4449 htmlParseComment(ctxt);
4450 ctxt->instate = XML_PARSER_PROLOG;
4451 } else if ((cur == '<') && (next == '!') &&
4452 (avail < 4)) {
4453 goto done;
4454 } else {
4455 ctxt->instate = XML_PARSER_START_TAG;
4456#ifdef DEBUG_PUSH
4457 xmlGenericError(xmlGenericErrorContext,
4458 "HPP: entering START_TAG\n");
4459#endif
4460 }
4461 break;
4462 case XML_PARSER_EPILOG:
4463 if (in->buf == NULL)
4464 avail = in->length - (in->cur - in->base);
4465 else
4466 avail = in->buf->buffer->use - (in->cur - in->base);
4467 if (avail < 1)
4468 goto done;
4469 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004470 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004471 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004472 goto done;
4473 }
4474 if (avail < 2)
4475 goto done;
4476 next = in->cur[1];
4477 if ((cur == '<') && (next == '!') &&
4478 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4479 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004480 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004481 goto done;
4482#ifdef DEBUG_PUSH
4483 xmlGenericError(xmlGenericErrorContext,
4484 "HPP: Parsing Comment\n");
4485#endif
4486 htmlParseComment(ctxt);
4487 ctxt->instate = XML_PARSER_EPILOG;
4488 } else if ((cur == '<') && (next == '!') &&
4489 (avail < 4)) {
4490 goto done;
4491 } else {
4492 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004493 ctxt->wellFormed = 0;
4494 ctxt->instate = XML_PARSER_EOF;
4495#ifdef DEBUG_PUSH
4496 xmlGenericError(xmlGenericErrorContext,
4497 "HPP: entering EOF\n");
4498#endif
4499 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4500 ctxt->sax->endDocument(ctxt->userData);
4501 goto done;
4502 }
4503 break;
4504 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004505 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004506 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004507 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004508
4509 if (avail < 2)
4510 goto done;
4511 cur = in->cur[0];
4512 if (cur != '<') {
4513 ctxt->instate = XML_PARSER_CONTENT;
4514#ifdef DEBUG_PUSH
4515 xmlGenericError(xmlGenericErrorContext,
4516 "HPP: entering CONTENT\n");
4517#endif
4518 break;
4519 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004520 if (in->cur[1] == '/') {
4521 ctxt->instate = XML_PARSER_END_TAG;
4522 ctxt->checkIndex = 0;
4523#ifdef DEBUG_PUSH
4524 xmlGenericError(xmlGenericErrorContext,
4525 "HPP: entering END_TAG\n");
4526#endif
4527 break;
4528 }
Owen Taylor3473f882001-02-23 17:55:21 +00004529 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004530 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004531 goto done;
4532
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004533 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004534 htmlParseStartTag(ctxt);
4535 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004536 if (((depth == ctxt->nameNr) &&
4537 (xmlStrEqual(oldname, ctxt->name))) ||
4538 (name == NULL)) {
4539 if (CUR == '>')
4540 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004541 break;
4542 }
Owen Taylor3473f882001-02-23 17:55:21 +00004543
4544 /*
4545 * Lookup the info for that element.
4546 */
4547 info = htmlTagLookup(name);
4548 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004549 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4550 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004551 }
4552
4553 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004554 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004555 */
4556 if ((CUR == '/') && (NXT(1) == '>')) {
4557 SKIP(2);
4558 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4559 ctxt->sax->endElement(ctxt->userData, name);
4560 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004561 ctxt->instate = XML_PARSER_CONTENT;
4562#ifdef DEBUG_PUSH
4563 xmlGenericError(xmlGenericErrorContext,
4564 "HPP: entering CONTENT\n");
4565#endif
4566 break;
4567 }
4568
4569 if (CUR == '>') {
4570 NEXT;
4571 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004572 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4573 "Couldn't find end of Start Tag %s\n",
4574 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004575
4576 /*
4577 * end of parsing of this node.
4578 */
4579 if (xmlStrEqual(name, ctxt->name)) {
4580 nodePop(ctxt);
4581 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004582 }
4583
4584 ctxt->instate = XML_PARSER_CONTENT;
4585#ifdef DEBUG_PUSH
4586 xmlGenericError(xmlGenericErrorContext,
4587 "HPP: entering CONTENT\n");
4588#endif
4589 break;
4590 }
4591
4592 /*
4593 * Check for an Empty Element from DTD definition
4594 */
4595 if ((info != NULL) && (info->empty)) {
4596 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4597 ctxt->sax->endElement(ctxt->userData, name);
4598 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004599 }
4600 ctxt->instate = XML_PARSER_CONTENT;
4601#ifdef DEBUG_PUSH
4602 xmlGenericError(xmlGenericErrorContext,
4603 "HPP: entering CONTENT\n");
4604#endif
4605 break;
4606 }
4607 case XML_PARSER_CONTENT: {
4608 long cons;
4609 /*
4610 * Handle preparsed entities and charRef
4611 */
4612 if (ctxt->token != 0) {
4613 xmlChar chr[2] = { 0 , 0 } ;
4614
4615 chr[0] = (xmlChar) ctxt->token;
4616 htmlCheckParagraph(ctxt);
4617 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4618 ctxt->sax->characters(ctxt->userData, chr, 1);
4619 ctxt->token = 0;
4620 ctxt->checkIndex = 0;
4621 }
4622 if ((avail == 1) && (terminate)) {
4623 cur = in->cur[0];
4624 if ((cur != '<') && (cur != '&')) {
4625 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004626 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004627 if (ctxt->sax->ignorableWhitespace != NULL)
4628 ctxt->sax->ignorableWhitespace(
4629 ctxt->userData, &cur, 1);
4630 } else {
4631 htmlCheckParagraph(ctxt);
4632 if (ctxt->sax->characters != NULL)
4633 ctxt->sax->characters(
4634 ctxt->userData, &cur, 1);
4635 }
4636 }
4637 ctxt->token = 0;
4638 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004639 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004640 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004641 }
Owen Taylor3473f882001-02-23 17:55:21 +00004642 }
4643 if (avail < 2)
4644 goto done;
4645 cur = in->cur[0];
4646 next = in->cur[1];
4647 cons = ctxt->nbChars;
4648 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4649 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4650 /*
4651 * Handle SCRIPT/STYLE separately
4652 */
4653 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004654 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004655 goto done;
4656 htmlParseScript(ctxt);
4657 if ((cur == '<') && (next == '/')) {
4658 ctxt->instate = XML_PARSER_END_TAG;
4659 ctxt->checkIndex = 0;
4660#ifdef DEBUG_PUSH
4661 xmlGenericError(xmlGenericErrorContext,
4662 "HPP: entering END_TAG\n");
4663#endif
4664 break;
4665 }
4666 } else {
4667 /*
4668 * Sometimes DOCTYPE arrives in the middle of the document
4669 */
4670 if ((cur == '<') && (next == '!') &&
4671 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4672 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4673 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4674 (UPP(8) == 'E')) {
4675 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004676 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004677 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004678 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4679 "Misplaced DOCTYPE declaration\n",
4680 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004681 htmlParseDocTypeDecl(ctxt);
4682 } else if ((cur == '<') && (next == '!') &&
4683 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4684 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004685 (htmlParseLookupSequence(
4686 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004687 goto done;
4688#ifdef DEBUG_PUSH
4689 xmlGenericError(xmlGenericErrorContext,
4690 "HPP: Parsing Comment\n");
4691#endif
4692 htmlParseComment(ctxt);
4693 ctxt->instate = XML_PARSER_CONTENT;
4694 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4695 goto done;
4696 } else if ((cur == '<') && (next == '/')) {
4697 ctxt->instate = XML_PARSER_END_TAG;
4698 ctxt->checkIndex = 0;
4699#ifdef DEBUG_PUSH
4700 xmlGenericError(xmlGenericErrorContext,
4701 "HPP: entering END_TAG\n");
4702#endif
4703 break;
4704 } else if (cur == '<') {
4705 ctxt->instate = XML_PARSER_START_TAG;
4706 ctxt->checkIndex = 0;
4707#ifdef DEBUG_PUSH
4708 xmlGenericError(xmlGenericErrorContext,
4709 "HPP: entering START_TAG\n");
4710#endif
4711 break;
4712 } else if (cur == '&') {
4713 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004714 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004715 goto done;
4716#ifdef DEBUG_PUSH
4717 xmlGenericError(xmlGenericErrorContext,
4718 "HPP: Parsing Reference\n");
4719#endif
4720 /* TODO: check generation of subtrees if noent !!! */
4721 htmlParseReference(ctxt);
4722 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004723 /*
4724 * check that the text sequence is complete
4725 * before handing out the data to the parser
4726 * to avoid problems with erroneous end of
4727 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004728 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004729 if ((!terminate) &&
4730 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4731 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004732 ctxt->checkIndex = 0;
4733#ifdef DEBUG_PUSH
4734 xmlGenericError(xmlGenericErrorContext,
4735 "HPP: Parsing char data\n");
4736#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004737 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004738 }
4739 }
4740 if (cons == ctxt->nbChars) {
4741 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004742 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4743 "detected an error in element content\n",
4744 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004745 }
4746 NEXT;
4747 break;
4748 }
4749
4750 break;
4751 }
4752 case XML_PARSER_END_TAG:
4753 if (avail < 2)
4754 goto done;
4755 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004756 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004757 goto done;
4758 htmlParseEndTag(ctxt);
4759 if (ctxt->nameNr == 0) {
4760 ctxt->instate = XML_PARSER_EPILOG;
4761 } else {
4762 ctxt->instate = XML_PARSER_CONTENT;
4763 }
4764 ctxt->checkIndex = 0;
4765#ifdef DEBUG_PUSH
4766 xmlGenericError(xmlGenericErrorContext,
4767 "HPP: entering CONTENT\n");
4768#endif
4769 break;
4770 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00004771 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4772 "HPP: internal error, state == CDATA\n",
4773 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004774 ctxt->instate = XML_PARSER_CONTENT;
4775 ctxt->checkIndex = 0;
4776#ifdef DEBUG_PUSH
4777 xmlGenericError(xmlGenericErrorContext,
4778 "HPP: entering CONTENT\n");
4779#endif
4780 break;
4781 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00004782 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4783 "HPP: internal error, state == DTD\n",
4784 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004785 ctxt->instate = XML_PARSER_CONTENT;
4786 ctxt->checkIndex = 0;
4787#ifdef DEBUG_PUSH
4788 xmlGenericError(xmlGenericErrorContext,
4789 "HPP: entering CONTENT\n");
4790#endif
4791 break;
4792 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00004793 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4794 "HPP: internal error, state == COMMENT\n",
4795 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004796 ctxt->instate = XML_PARSER_CONTENT;
4797 ctxt->checkIndex = 0;
4798#ifdef DEBUG_PUSH
4799 xmlGenericError(xmlGenericErrorContext,
4800 "HPP: entering CONTENT\n");
4801#endif
4802 break;
4803 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00004804 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4805 "HPP: internal error, state == PI\n",
4806 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004807 ctxt->instate = XML_PARSER_CONTENT;
4808 ctxt->checkIndex = 0;
4809#ifdef DEBUG_PUSH
4810 xmlGenericError(xmlGenericErrorContext,
4811 "HPP: entering CONTENT\n");
4812#endif
4813 break;
4814 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004815 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4816 "HPP: internal error, state == ENTITY_DECL\n",
4817 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004818 ctxt->instate = XML_PARSER_CONTENT;
4819 ctxt->checkIndex = 0;
4820#ifdef DEBUG_PUSH
4821 xmlGenericError(xmlGenericErrorContext,
4822 "HPP: entering CONTENT\n");
4823#endif
4824 break;
4825 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004826 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4827 "HPP: internal error, state == ENTITY_VALUE\n",
4828 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004829 ctxt->instate = XML_PARSER_CONTENT;
4830 ctxt->checkIndex = 0;
4831#ifdef DEBUG_PUSH
4832 xmlGenericError(xmlGenericErrorContext,
4833 "HPP: entering DTD\n");
4834#endif
4835 break;
4836 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004837 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4838 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4839 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004840 ctxt->instate = XML_PARSER_START_TAG;
4841 ctxt->checkIndex = 0;
4842#ifdef DEBUG_PUSH
4843 xmlGenericError(xmlGenericErrorContext,
4844 "HPP: entering START_TAG\n");
4845#endif
4846 break;
4847 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004848 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4849 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4850 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004851 ctxt->instate = XML_PARSER_CONTENT;
4852 ctxt->checkIndex = 0;
4853#ifdef DEBUG_PUSH
4854 xmlGenericError(xmlGenericErrorContext,
4855 "HPP: entering CONTENT\n");
4856#endif
4857 break;
4858 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004859 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4860 "HPP: internal error, state == XML_PARSER_IGNORE\n",
4861 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004862 ctxt->instate = XML_PARSER_CONTENT;
4863 ctxt->checkIndex = 0;
4864#ifdef DEBUG_PUSH
4865 xmlGenericError(xmlGenericErrorContext,
4866 "HPP: entering CONTENT\n");
4867#endif
4868 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004869 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004870 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4871 "HPP: internal error, state == XML_PARSER_LITERAL\n",
4872 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004873 ctxt->instate = XML_PARSER_CONTENT;
4874 ctxt->checkIndex = 0;
4875#ifdef DEBUG_PUSH
4876 xmlGenericError(xmlGenericErrorContext,
4877 "HPP: entering CONTENT\n");
4878#endif
4879 break;
4880
Owen Taylor3473f882001-02-23 17:55:21 +00004881 }
4882 }
4883done:
4884 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004885 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004886 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4887 /*
4888 * SAX: end of the document processing.
4889 */
4890 ctxt->instate = XML_PARSER_EOF;
4891 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4892 ctxt->sax->endDocument(ctxt->userData);
4893 }
4894 }
4895 if ((ctxt->myDoc != NULL) &&
4896 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4897 (ctxt->instate == XML_PARSER_EPILOG))) {
4898 xmlDtdPtr dtd;
4899 dtd = xmlGetIntSubset(ctxt->myDoc);
4900 if (dtd == NULL)
4901 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004902 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004903 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4904 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4905 }
4906#ifdef DEBUG_PUSH
4907 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4908#endif
4909 return(ret);
4910}
4911
4912/**
Owen Taylor3473f882001-02-23 17:55:21 +00004913 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00004914 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004915 * @chunk: an char array
4916 * @size: the size in byte of the chunk
4917 * @terminate: last chunk indicator
4918 *
4919 * Parse a Chunk of memory
4920 *
4921 * Returns zero if no error, the xmlParserErrors otherwise.
4922 */
4923int
4924htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4925 int terminate) {
4926 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4927 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4928 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4929 int cur = ctxt->input->cur - ctxt->input->base;
4930
4931 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4932 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4933 ctxt->input->cur = ctxt->input->base + cur;
4934#ifdef DEBUG_PUSH
4935 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4936#endif
4937
Daniel Veillard14f752c2003-08-09 11:44:50 +00004938#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00004939 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4940 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004941#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004942 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004943 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
4944 xmlParserInputBufferPtr in = ctxt->input->buf;
4945 if ((in->encoder != NULL) && (in->buffer != NULL) &&
4946 (in->raw != NULL)) {
4947 int nbchars;
4948
4949 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
4950 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004951 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
4952 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004953 return(XML_ERR_INVALID_ENCODING);
4954 }
4955 }
4956 }
Owen Taylor3473f882001-02-23 17:55:21 +00004957 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00004958 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00004959 if (terminate) {
4960 if ((ctxt->instate != XML_PARSER_EOF) &&
4961 (ctxt->instate != XML_PARSER_EPILOG) &&
4962 (ctxt->instate != XML_PARSER_MISC)) {
4963 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004964 ctxt->wellFormed = 0;
4965 }
4966 if (ctxt->instate != XML_PARSER_EOF) {
4967 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4968 ctxt->sax->endDocument(ctxt->userData);
4969 }
4970 ctxt->instate = XML_PARSER_EOF;
4971 }
4972 return((xmlParserErrors) ctxt->errNo);
4973}
Daniel Veillard73b013f2003-09-30 12:36:01 +00004974#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00004975
4976/************************************************************************
4977 * *
4978 * User entry points *
4979 * *
4980 ************************************************************************/
4981
4982/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004983 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004984 * @sax: a SAX handler
4985 * @user_data: The user data returned on SAX callbacks
4986 * @chunk: a pointer to an array of chars
4987 * @size: number of chars in the array
4988 * @filename: an optional file name or URI
4989 * @enc: an optional encoding
4990 *
4991 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00004992 * The value of @filename is used for fetching external entities
4993 * and error/warning reports.
4994 *
4995 * Returns the new parser context or NULL
4996 */
4997htmlParserCtxtPtr
4998htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4999 const char *chunk, int size, const char *filename,
5000 xmlCharEncoding enc) {
5001 htmlParserCtxtPtr ctxt;
5002 htmlParserInputPtr inputStream;
5003 xmlParserInputBufferPtr buf;
5004
Daniel Veillardd0463562001-10-13 09:15:48 +00005005 xmlInitParser();
5006
Owen Taylor3473f882001-02-23 17:55:21 +00005007 buf = xmlAllocParserInputBuffer(enc);
5008 if (buf == NULL) return(NULL);
5009
Daniel Veillardf403d292003-10-05 13:51:35 +00005010 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005011 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005012 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005013 return(NULL);
5014 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005015 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5016 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005017 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005018 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005019 xmlFree(ctxt->sax);
5020 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5021 if (ctxt->sax == NULL) {
5022 xmlFree(buf);
5023 xmlFree(ctxt);
5024 return(NULL);
5025 }
5026 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5027 if (user_data != NULL)
5028 ctxt->userData = user_data;
5029 }
5030 if (filename == NULL) {
5031 ctxt->directory = NULL;
5032 } else {
5033 ctxt->directory = xmlParserGetDirectory(filename);
5034 }
5035
5036 inputStream = htmlNewInputStream(ctxt);
5037 if (inputStream == NULL) {
5038 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005039 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005040 return(NULL);
5041 }
5042
5043 if (filename == NULL)
5044 inputStream->filename = NULL;
5045 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005046 inputStream->filename = (char *)
5047 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005048 inputStream->buf = buf;
5049 inputStream->base = inputStream->buf->buffer->content;
5050 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005051 inputStream->end =
5052 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005053
5054 inputPush(ctxt, inputStream);
5055
5056 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5057 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005058 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5059 int cur = ctxt->input->cur - ctxt->input->base;
5060
Owen Taylor3473f882001-02-23 17:55:21 +00005061 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005062
5063 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5064 ctxt->input->cur = ctxt->input->base + cur;
5065 ctxt->input->end =
5066 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005067#ifdef DEBUG_PUSH
5068 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5069#endif
5070 }
5071
5072 return(ctxt);
5073}
5074
5075/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005076 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005077 * @cur: a pointer to an array of xmlChar
5078 * @encoding: a free form C string describing the HTML document encoding, or NULL
5079 * @sax: the SAX handler block
5080 * @userData: if using SAX, this pointer will be provided on callbacks.
5081 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005082 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5083 * to handle parse events. If sax is NULL, fallback to the default DOM
5084 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005085 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005086 * Returns the resulting document tree unless SAX is NULL or the document is
5087 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005088 */
5089
5090htmlDocPtr
5091htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5092 htmlDocPtr ret;
5093 htmlParserCtxtPtr ctxt;
5094
Daniel Veillardd0463562001-10-13 09:15:48 +00005095 xmlInitParser();
5096
Owen Taylor3473f882001-02-23 17:55:21 +00005097 if (cur == NULL) return(NULL);
5098
5099
5100 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5101 if (ctxt == NULL) return(NULL);
5102 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005103 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005104 ctxt->sax = sax;
5105 ctxt->userData = userData;
5106 }
5107
5108 htmlParseDocument(ctxt);
5109 ret = ctxt->myDoc;
5110 if (sax != NULL) {
5111 ctxt->sax = NULL;
5112 ctxt->userData = NULL;
5113 }
5114 htmlFreeParserCtxt(ctxt);
5115
5116 return(ret);
5117}
5118
5119/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005120 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005121 * @cur: a pointer to an array of xmlChar
5122 * @encoding: a free form C string describing the HTML document encoding, or NULL
5123 *
5124 * parse an HTML in-memory document and build a tree.
5125 *
5126 * Returns the resulting document tree
5127 */
5128
5129htmlDocPtr
5130htmlParseDoc(xmlChar *cur, const char *encoding) {
5131 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5132}
5133
5134
5135/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005136 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005137 * @filename: the filename
5138 * @encoding: a free form C string describing the HTML document encoding, or NULL
5139 *
5140 * Create a parser context for a file content.
5141 * Automatic support for ZLIB/Compress compressed document is provided
5142 * by default if found at compile-time.
5143 *
5144 * Returns the new parser context or NULL
5145 */
5146htmlParserCtxtPtr
5147htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5148{
5149 htmlParserCtxtPtr ctxt;
5150 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005151 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005152 /* htmlCharEncoding enc; */
5153 xmlChar *content, *content_line = (xmlChar *) "charset=";
5154
Daniel Veillardf403d292003-10-05 13:51:35 +00005155 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005156 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005157 return(NULL);
5158 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005159 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5160 if (canonicFilename == NULL) {
5161 if (xmlDefaultSAXHandler.error != NULL) {
5162 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5163 }
Daniel Veillard104caa32003-05-13 22:54:05 +00005164 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005165 return(NULL);
5166 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005167
5168 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5169 xmlFree(canonicFilename);
5170 if (inputStream == NULL) {
5171 xmlFreeParserCtxt(ctxt);
5172 return(NULL);
5173 }
Owen Taylor3473f882001-02-23 17:55:21 +00005174
5175 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005176
Owen Taylor3473f882001-02-23 17:55:21 +00005177 /* set encoding */
5178 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005179 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005180 if (content) {
5181 strcpy ((char *)content, (char *)content_line);
5182 strcat ((char *)content, (char *)encoding);
5183 htmlCheckEncoding (ctxt, content);
5184 xmlFree (content);
5185 }
5186 }
5187
5188 return(ctxt);
5189}
5190
5191/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005192 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005193 * @filename: the filename
5194 * @encoding: a free form C string describing the HTML document encoding, or NULL
5195 * @sax: the SAX handler block
5196 * @userData: if using SAX, this pointer will be provided on callbacks.
5197 *
5198 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5199 * compressed document is provided by default if found at compile-time.
5200 * It use the given SAX function block to handle the parsing callback.
5201 * If sax is NULL, fallback to the default DOM tree building routines.
5202 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005203 * Returns the resulting document tree unless SAX is NULL or the document is
5204 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005205 */
5206
5207htmlDocPtr
5208htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5209 void *userData) {
5210 htmlDocPtr ret;
5211 htmlParserCtxtPtr ctxt;
5212 htmlSAXHandlerPtr oldsax = NULL;
5213
Daniel Veillardd0463562001-10-13 09:15:48 +00005214 xmlInitParser();
5215
Owen Taylor3473f882001-02-23 17:55:21 +00005216 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5217 if (ctxt == NULL) return(NULL);
5218 if (sax != NULL) {
5219 oldsax = ctxt->sax;
5220 ctxt->sax = sax;
5221 ctxt->userData = userData;
5222 }
5223
5224 htmlParseDocument(ctxt);
5225
5226 ret = ctxt->myDoc;
5227 if (sax != NULL) {
5228 ctxt->sax = oldsax;
5229 ctxt->userData = NULL;
5230 }
5231 htmlFreeParserCtxt(ctxt);
5232
5233 return(ret);
5234}
5235
5236/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005237 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005238 * @filename: the filename
5239 * @encoding: a free form C string describing the HTML document encoding, or NULL
5240 *
5241 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5242 * compressed document is provided by default if found at compile-time.
5243 *
5244 * Returns the resulting document tree
5245 */
5246
5247htmlDocPtr
5248htmlParseFile(const char *filename, const char *encoding) {
5249 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5250}
5251
5252/**
5253 * htmlHandleOmittedElem:
5254 * @val: int 0 or 1
5255 *
5256 * Set and return the previous value for handling HTML omitted tags.
5257 *
5258 * Returns the last value for 0 for no handling, 1 for auto insertion.
5259 */
5260
5261int
5262htmlHandleOmittedElem(int val) {
5263 int old = htmlOmittedDefaultValue;
5264
5265 htmlOmittedDefaultValue = val;
5266 return(old);
5267}
5268
Daniel Veillard930dfb62003-02-05 10:17:38 +00005269/**
5270 * htmlElementAllowedHere:
5271 * @parent: HTML parent element
5272 * @elt: HTML element
5273 *
5274 * Checks whether an HTML element may be a direct child of a parent element.
5275 * Note - doesn't check for deprecated elements
5276 *
5277 * Returns 1 if allowed; 0 otherwise.
5278 */
5279int
5280htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5281 const char** p ;
5282
5283 if ( ! elt || ! parent || ! parent->subelts )
5284 return 0 ;
5285
5286 for ( p = parent->subelts; *p; ++p )
5287 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5288 return 1 ;
5289
5290 return 0 ;
5291}
5292/**
5293 * htmlElementStatusHere:
5294 * @parent: HTML parent element
5295 * @elt: HTML element
5296 *
5297 * Checks whether an HTML element may be a direct child of a parent element.
5298 * and if so whether it is valid or deprecated.
5299 *
5300 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5301 */
5302htmlStatus
5303htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5304 if ( ! parent || ! elt )
5305 return HTML_INVALID ;
5306 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5307 return HTML_INVALID ;
5308
5309 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5310}
5311/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005312 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005313 * @elt: HTML element
5314 * @attr: HTML attribute
5315 * @legacy: whether to allow deprecated attributes
5316 *
5317 * Checks whether an attribute is valid for an element
5318 * Has full knowledge of Required and Deprecated attributes
5319 *
5320 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5321 */
5322htmlStatus
5323htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5324 const char** p ;
5325
5326 if ( !elt || ! attr )
5327 return HTML_INVALID ;
5328
5329 if ( elt->attrs_req )
5330 for ( p = elt->attrs_req; *p; ++p)
5331 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5332 return HTML_REQUIRED ;
5333
5334 if ( elt->attrs_opt )
5335 for ( p = elt->attrs_opt; *p; ++p)
5336 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5337 return HTML_VALID ;
5338
5339 if ( legacy && elt->attrs_depr )
5340 for ( p = elt->attrs_depr; *p; ++p)
5341 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5342 return HTML_DEPRECATED ;
5343
5344 return HTML_INVALID ;
5345}
5346/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005347 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005348 * @node: an htmlNodePtr in a tree
5349 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005350 * for Element nodes)
5351 *
5352 * Checks whether the tree node is valid. Experimental (the author
5353 * only uses the HTML enhancements in a SAX parser)
5354 *
5355 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5356 * legacy allowed) or htmlElementStatusHere (otherwise).
5357 * for Attribute nodes, a return from htmlAttrAllowed
5358 * for other nodes, HTML_NA (no checks performed)
5359 */
5360htmlStatus
5361htmlNodeStatus(const htmlNodePtr node, int legacy) {
5362 if ( ! node )
5363 return HTML_INVALID ;
5364
5365 switch ( node->type ) {
5366 case XML_ELEMENT_NODE:
5367 return legacy
5368 ? ( htmlElementAllowedHere (
5369 htmlTagLookup(node->parent->name) , node->name
5370 ) ? HTML_VALID : HTML_INVALID )
5371 : htmlElementStatusHere(
5372 htmlTagLookup(node->parent->name) ,
5373 htmlTagLookup(node->name) )
5374 ;
5375 case XML_ATTRIBUTE_NODE:
5376 return htmlAttrAllowed(
5377 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5378 default: return HTML_NA ;
5379 }
5380}
Daniel Veillard9475a352003-09-26 12:47:50 +00005381/************************************************************************
5382 * *
5383 * New set (2.6.0) of simpler and more flexible APIs *
5384 * *
5385 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement and can be followed by ';' anywhere, including
 * between an if and its else.
 */
#define DICT_FREE(str)                                                  \
    do {                                                                \
        if ((str) && ((!dict) ||                                        \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))          \
            xmlFree((char *)(str));                                     \
    } while (0)
5397
5398/**
5399 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005400 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005401 *
5402 * Reset a parser context
5403 */
5404void
5405htmlCtxtReset(htmlParserCtxtPtr ctxt)
5406{
5407 xmlParserInputPtr input;
5408 xmlDictPtr dict = ctxt->dict;
5409
5410 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5411 xmlFreeInputStream(input);
5412 }
5413 ctxt->inputNr = 0;
5414 ctxt->input = NULL;
5415
5416 ctxt->spaceNr = 0;
5417 ctxt->spaceTab[0] = -1;
5418 ctxt->space = &ctxt->spaceTab[0];
5419
5420
5421 ctxt->nodeNr = 0;
5422 ctxt->node = NULL;
5423
5424 ctxt->nameNr = 0;
5425 ctxt->name = NULL;
5426
5427 DICT_FREE(ctxt->version);
5428 ctxt->version = NULL;
5429 DICT_FREE(ctxt->encoding);
5430 ctxt->encoding = NULL;
5431 DICT_FREE(ctxt->directory);
5432 ctxt->directory = NULL;
5433 DICT_FREE(ctxt->extSubURI);
5434 ctxt->extSubURI = NULL;
5435 DICT_FREE(ctxt->extSubSystem);
5436 ctxt->extSubSystem = NULL;
5437 if (ctxt->myDoc != NULL)
5438 xmlFreeDoc(ctxt->myDoc);
5439 ctxt->myDoc = NULL;
5440
5441 ctxt->standalone = -1;
5442 ctxt->hasExternalSubset = 0;
5443 ctxt->hasPErefs = 0;
5444 ctxt->html = 1;
5445 ctxt->external = 0;
5446 ctxt->instate = XML_PARSER_START;
5447 ctxt->token = 0;
5448
5449 ctxt->wellFormed = 1;
5450 ctxt->nsWellFormed = 1;
5451 ctxt->valid = 1;
5452 ctxt->vctxt.userData = ctxt;
5453 ctxt->vctxt.error = xmlParserValidityError;
5454 ctxt->vctxt.warning = xmlParserValidityWarning;
5455 ctxt->record_info = 0;
5456 ctxt->nbChars = 0;
5457 ctxt->checkIndex = 0;
5458 ctxt->inSubset = 0;
5459 ctxt->errNo = XML_ERR_OK;
5460 ctxt->depth = 0;
5461 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5462 ctxt->catalogs = NULL;
5463 xmlInitNodeInfoSeq(&ctxt->node_seq);
5464
5465 if (ctxt->attsDefault != NULL) {
5466 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5467 ctxt->attsDefault = NULL;
5468 }
5469 if (ctxt->attsSpecial != NULL) {
5470 xmlHashFree(ctxt->attsSpecial, NULL);
5471 ctxt->attsSpecial = NULL;
5472 }
5473}
5474
5475/**
5476 * htmlCtxtUseOptions:
5477 * @ctxt: an HTML parser context
5478 * @options: a combination of htmlParserOption(s)
5479 *
5480 * Applies the options to the parser context
5481 *
5482 * Returns 0 in case of success, the set of unknown or unimplemented options
5483 * in case of error.
5484 */
5485int
5486htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5487{
5488 if (options & HTML_PARSE_NOWARNING) {
5489 ctxt->sax->warning = NULL;
5490 options -= XML_PARSE_NOWARNING;
5491 }
5492 if (options & HTML_PARSE_NOERROR) {
5493 ctxt->sax->error = NULL;
5494 ctxt->sax->fatalError = NULL;
5495 options -= XML_PARSE_NOERROR;
5496 }
5497 if (options & HTML_PARSE_PEDANTIC) {
5498 ctxt->pedantic = 1;
5499 options -= XML_PARSE_PEDANTIC;
5500 } else
5501 ctxt->pedantic = 0;
5502 if (options & XML_PARSE_NOBLANKS) {
5503 ctxt->keepBlanks = 0;
5504 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5505 options -= XML_PARSE_NOBLANKS;
5506 } else
5507 ctxt->keepBlanks = 1;
5508 ctxt->dictNames = 0;
5509 return (options);
5510}
5511
5512/**
5513 * htmlDoRead:
5514 * @ctxt: an HTML parser context
5515 * @URL: the base URL to use for the document
5516 * @encoding: the document encoding, or NULL
5517 * @options: a combination of htmlParserOption(s)
5518 * @reuse: keep the context for reuse
5519 *
5520 * Common front-end for the htmlRead functions
5521 *
5522 * Returns the resulting document tree or NULL
5523 */
5524static htmlDocPtr
5525htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5526 int options, int reuse)
5527{
5528 htmlDocPtr ret;
5529
5530 htmlCtxtUseOptions(ctxt, options);
5531 ctxt->html = 1;
5532 if (encoding != NULL) {
5533 xmlCharEncodingHandlerPtr hdlr;
5534
5535 hdlr = xmlFindCharEncodingHandler(encoding);
5536 if (hdlr != NULL)
5537 xmlSwitchToEncoding(ctxt, hdlr);
5538 }
5539 if ((URL != NULL) && (ctxt->input != NULL) &&
5540 (ctxt->input->filename == NULL))
5541 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5542 htmlParseDocument(ctxt);
5543 ret = ctxt->myDoc;
5544 ctxt->myDoc = NULL;
5545 if (!reuse) {
5546 if ((ctxt->dictNames) &&
5547 (ret != NULL) &&
5548 (ret->dict == ctxt->dict))
5549 ctxt->dict = NULL;
5550 xmlFreeParserCtxt(ctxt);
5551 } else {
5552 /* Must duplicate the reference to the dictionary */
5553 if ((ctxt->dictNames) &&
5554 (ret != NULL) &&
5555 (ret->dict == ctxt->dict))
5556 xmlDictReference(ctxt->dict);
5557 }
5558 return (ret);
5559}
5560
5561/**
5562 * htmlReadDoc:
5563 * @cur: a pointer to a zero terminated string
5564 * @URL: the base URL to use for the document
5565 * @encoding: the document encoding, or NULL
5566 * @options: a combination of htmlParserOption(s)
5567 *
5568 * parse an XML in-memory document and build a tree.
5569 *
5570 * Returns the resulting document tree
5571 */
5572htmlDocPtr
5573htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5574{
5575 htmlParserCtxtPtr ctxt;
5576
5577 if (cur == NULL)
5578 return (NULL);
5579
5580 ctxt = xmlCreateDocParserCtxt(cur);
5581 if (ctxt == NULL)
5582 return (NULL);
5583 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5584}
5585
5586/**
5587 * htmlReadFile:
5588 * @filename: a file or URL
5589 * @encoding: the document encoding, or NULL
5590 * @options: a combination of htmlParserOption(s)
5591 *
5592 * parse an XML file from the filesystem or the network.
5593 *
5594 * Returns the resulting document tree
5595 */
5596htmlDocPtr
5597htmlReadFile(const char *filename, const char *encoding, int options)
5598{
5599 htmlParserCtxtPtr ctxt;
5600
5601 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5602 if (ctxt == NULL)
5603 return (NULL);
5604 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5605}
5606
5607/**
5608 * htmlReadMemory:
5609 * @buffer: a pointer to a char array
5610 * @size: the size of the array
5611 * @URL: the base URL to use for the document
5612 * @encoding: the document encoding, or NULL
5613 * @options: a combination of htmlParserOption(s)
5614 *
5615 * parse an XML in-memory document and build a tree.
5616 *
5617 * Returns the resulting document tree
5618 */
5619htmlDocPtr
5620htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5621{
5622 htmlParserCtxtPtr ctxt;
5623
5624 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5625 if (ctxt == NULL)
5626 return (NULL);
5627 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5628}
5629
5630/**
5631 * htmlReadFd:
5632 * @fd: an open file descriptor
5633 * @URL: the base URL to use for the document
5634 * @encoding: the document encoding, or NULL
5635 * @options: a combination of htmlParserOption(s)
5636 *
5637 * parse an XML from a file descriptor and build a tree.
5638 *
5639 * Returns the resulting document tree
5640 */
5641htmlDocPtr
5642htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5643{
5644 htmlParserCtxtPtr ctxt;
5645 xmlParserInputBufferPtr input;
5646 xmlParserInputPtr stream;
5647
5648 if (fd < 0)
5649 return (NULL);
5650
5651 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5652 if (input == NULL)
5653 return (NULL);
5654 ctxt = xmlNewParserCtxt();
5655 if (ctxt == NULL) {
5656 xmlFreeParserInputBuffer(input);
5657 return (NULL);
5658 }
5659 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5660 if (stream == NULL) {
5661 xmlFreeParserInputBuffer(input);
5662 xmlFreeParserCtxt(ctxt);
5663 return (NULL);
5664 }
5665 inputPush(ctxt, stream);
5666 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5667}
5668
5669/**
5670 * htmlReadIO:
5671 * @ioread: an I/O read function
5672 * @ioclose: an I/O close function
5673 * @ioctx: an I/O handler
5674 * @URL: the base URL to use for the document
5675 * @encoding: the document encoding, or NULL
5676 * @options: a combination of htmlParserOption(s)
5677 *
5678 * parse an HTML document from I/O functions and source and build a tree.
5679 *
5680 * Returns the resulting document tree
5681 */
5682htmlDocPtr
5683htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5684 void *ioctx, const char *URL, const char *encoding, int options)
5685{
5686 htmlParserCtxtPtr ctxt;
5687 xmlParserInputBufferPtr input;
5688 xmlParserInputPtr stream;
5689
5690 if (ioread == NULL)
5691 return (NULL);
5692
5693 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5694 XML_CHAR_ENCODING_NONE);
5695 if (input == NULL)
5696 return (NULL);
5697 ctxt = xmlNewParserCtxt();
5698 if (ctxt == NULL) {
5699 xmlFreeParserInputBuffer(input);
5700 return (NULL);
5701 }
5702 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5703 if (stream == NULL) {
5704 xmlFreeParserInputBuffer(input);
5705 xmlFreeParserCtxt(ctxt);
5706 return (NULL);
5707 }
5708 inputPush(ctxt, stream);
5709 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5710}
5711
5712/**
5713 * htmlCtxtReadDoc:
5714 * @ctxt: an HTML parser context
5715 * @cur: a pointer to a zero terminated string
5716 * @URL: the base URL to use for the document
5717 * @encoding: the document encoding, or NULL
5718 * @options: a combination of htmlParserOption(s)
5719 *
5720 * parse an XML in-memory document and build a tree.
5721 * This reuses the existing @ctxt parser context
5722 *
5723 * Returns the resulting document tree
5724 */
5725htmlDocPtr
5726htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5727 const char *URL, const char *encoding, int options)
5728{
5729 xmlParserInputPtr stream;
5730
5731 if (cur == NULL)
5732 return (NULL);
5733 if (ctxt == NULL)
5734 return (NULL);
5735
5736 htmlCtxtReset(ctxt);
5737
5738 stream = xmlNewStringInputStream(ctxt, cur);
5739 if (stream == NULL) {
5740 return (NULL);
5741 }
5742 inputPush(ctxt, stream);
5743 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5744}
5745
5746/**
5747 * htmlCtxtReadFile:
5748 * @ctxt: an HTML parser context
5749 * @filename: a file or URL
5750 * @encoding: the document encoding, or NULL
5751 * @options: a combination of htmlParserOption(s)
5752 *
5753 * parse an XML file from the filesystem or the network.
5754 * This reuses the existing @ctxt parser context
5755 *
5756 * Returns the resulting document tree
5757 */
5758htmlDocPtr
5759htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5760 const char *encoding, int options)
5761{
5762 xmlParserInputPtr stream;
5763
5764 if (filename == NULL)
5765 return (NULL);
5766 if (ctxt == NULL)
5767 return (NULL);
5768
5769 htmlCtxtReset(ctxt);
5770
5771 stream = xmlNewInputFromFile(ctxt, filename);
5772 if (stream == NULL) {
5773 return (NULL);
5774 }
5775 inputPush(ctxt, stream);
5776 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5777}
5778
5779/**
5780 * htmlCtxtReadMemory:
5781 * @ctxt: an HTML parser context
5782 * @buffer: a pointer to a char array
5783 * @size: the size of the array
5784 * @URL: the base URL to use for the document
5785 * @encoding: the document encoding, or NULL
5786 * @options: a combination of htmlParserOption(s)
5787 *
5788 * parse an XML in-memory document and build a tree.
5789 * This reuses the existing @ctxt parser context
5790 *
5791 * Returns the resulting document tree
5792 */
5793htmlDocPtr
5794htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5795 const char *URL, const char *encoding, int options)
5796{
5797 xmlParserInputBufferPtr input;
5798 xmlParserInputPtr stream;
5799
5800 if (ctxt == NULL)
5801 return (NULL);
5802 if (buffer == NULL)
5803 return (NULL);
5804
5805 htmlCtxtReset(ctxt);
5806
5807 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5808 if (input == NULL) {
5809 return(NULL);
5810 }
5811
5812 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5813 if (stream == NULL) {
5814 xmlFreeParserInputBuffer(input);
5815 return(NULL);
5816 }
5817
5818 inputPush(ctxt, stream);
5819 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5820}
5821
5822/**
5823 * htmlCtxtReadFd:
5824 * @ctxt: an HTML parser context
5825 * @fd: an open file descriptor
5826 * @URL: the base URL to use for the document
5827 * @encoding: the document encoding, or NULL
5828 * @options: a combination of htmlParserOption(s)
5829 *
5830 * parse an XML from a file descriptor and build a tree.
5831 * This reuses the existing @ctxt parser context
5832 *
5833 * Returns the resulting document tree
5834 */
5835htmlDocPtr
5836htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5837 const char *URL, const char *encoding, int options)
5838{
5839 xmlParserInputBufferPtr input;
5840 xmlParserInputPtr stream;
5841
5842 if (fd < 0)
5843 return (NULL);
5844 if (ctxt == NULL)
5845 return (NULL);
5846
5847 htmlCtxtReset(ctxt);
5848
5849
5850 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5851 if (input == NULL)
5852 return (NULL);
5853 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5854 if (stream == NULL) {
5855 xmlFreeParserInputBuffer(input);
5856 return (NULL);
5857 }
5858 inputPush(ctxt, stream);
5859 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5860}
5861
5862/**
5863 * htmlCtxtReadIO:
5864 * @ctxt: an HTML parser context
5865 * @ioread: an I/O read function
5866 * @ioclose: an I/O close function
5867 * @ioctx: an I/O handler
5868 * @URL: the base URL to use for the document
5869 * @encoding: the document encoding, or NULL
5870 * @options: a combination of htmlParserOption(s)
5871 *
5872 * parse an HTML document from I/O functions and source and build a tree.
5873 * This reuses the existing @ctxt parser context
5874 *
5875 * Returns the resulting document tree
5876 */
5877htmlDocPtr
5878htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5879 xmlInputCloseCallback ioclose, void *ioctx,
5880 const char *URL,
5881 const char *encoding, int options)
5882{
5883 xmlParserInputBufferPtr input;
5884 xmlParserInputPtr stream;
5885
5886 if (ioread == NULL)
5887 return (NULL);
5888 if (ctxt == NULL)
5889 return (NULL);
5890
5891 htmlCtxtReset(ctxt);
5892
5893 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5894 XML_CHAR_ENCODING_NONE);
5895 if (input == NULL)
5896 return (NULL);
5897 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5898 if (stream == NULL) {
5899 xmlFreeParserInputBuffer(input);
5900 return (NULL);
5901 }
5902 inputPush(ctxt, stream);
5903 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5904}
5905
Owen Taylor3473f882001-02-23 17:55:21 +00005906#endif /* LIBXML_HTML_ENABLED */