/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size constants for this file. Their consumers are outside this section;
 * the names suggest name-length and scratch-buffer limits.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
/************************************************************************
 *                                                                      *
 *              Some factorized error routines                          *
 *                                                                      *
 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
/************************************************************************
 *                                                                      *
 *              Parser stacks related functions and macros              *
 *                                                                      *
 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the xmlChar at the current position, as an int. It should be
 *            used only to compare to ASCII values, otherwise it would
 *            break when running with UTF-8 encoding.
 *   RAW      -1 while ctxt->token is set, the current xmlChar otherwise.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChars, adding n to the char and column counters;
 *            must be used only to skip ASCII strings without newlines.
 *   SHRINK   call xmlParserInputShrink() once more than 2 * INPUT_CHUNK
 *            bytes lie before the cursor and fewer than that remain.
 *   GROW     when not parsing progressively, call xmlParserInputGrow()
 *            once fewer than INPUT_CHUNK bytes remain.
 *
 * Other helpers
 *
 *   CURRENT        the xmlChar at the current position as an int (as
 *                  defined here it performs no multi-byte decoding).
 *   NEXT           advance through xmlNextChar().
 *   NEXTL(l)       skip the current character of l xmlChars, updating
 *                  line/column, clearing ctxt->token and counting one char.
 *   SKIP_BLANKS    call htmlSkipBlankChars().
 *   CUR_CHAR(l)    current char via htmlCurrentChar(), length stored in l.
 *   CUR_SCHAR(s,l) current char of string s via xmlStringCurrentChar().
 *   COPY_BUF(l,b,i,v)  store char v of length l at b[i] and advance i.
 *
 * SHRINK, GROW and COPY_BUF expand to bare if statements: use them only
 * as complete statements, never as the unbraced body of an if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
Daniel Veillard861101d2007-06-12 08:38:57 +0000404 if (ctxt->input->end - ctxt->input->cur >= 4) {
405 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406 ctxt->input->cur[0], ctxt->input->cur[1],
407 ctxt->input->cur[2], ctxt->input->cur[3]);
408 } else {
409 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412 "Input is not proper UTF-8, indicate encoding !\n",
413 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000414 }
415
416 ctxt->charset = XML_CHAR_ENCODING_8859_1;
417 *len = 1;
418 return((int) *ctxt->input->cur);
419}
420
421/**
Owen Taylor3473f882001-02-23 17:55:21 +0000422 * htmlSkipBlankChars:
423 * @ctxt: the HTML parser context
424 *
425 * skip all blanks character found at that point in the input streams.
426 *
427 * Returns the number of space chars skipped
428 */
429
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000430static int
Owen Taylor3473f882001-02-23 17:55:21 +0000431htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
432 int res = 0;
433
William M. Brack76e95df2003-10-18 16:20:14 +0000434 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000435 if ((*ctxt->input->cur == 0) &&
436 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
437 xmlPopInput(ctxt);
438 } else {
439 if (*(ctxt->input->cur) == '\n') {
440 ctxt->input->line++; ctxt->input->col = 1;
441 } else ctxt->input->col++;
442 ctxt->input->cur++;
443 ctxt->nbChars++;
444 if (*ctxt->input->cur == 0)
445 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
446 }
447 res++;
448 }
449 return(res);
450}
451
452
453
/************************************************************************
 *                                                                      *
 *              The list of HTML elements and their properties          *
 *                                                                      *
 ************************************************************************/
459
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each list macro expands to a comma-separated run of string literals and
 * each NB_xxx macro is the number of names in the matching list. Composite
 * lists must join their parts with commas: without them adjacent literals
 * are concatenated by the compiler (e.g. "menu" "pre" becomes "menupre").
 * PCDATA and MODIFIER are intentionally empty and therefore cannot be
 * placed between commas.
 */

#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0
#define EMPTY NULL

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4

#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)


/* NULL-terminated name lists built from the macros above. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
506
507
/*
 * ... and for HTML Attributes
 *
 * Same convention as for the element lists: each macro expands to a
 * comma-separated run of attribute names and NB_xxx is the number of
 * names in list xxx.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists built from the macros above. */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000527
528
529/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000530static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000531 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
532 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000533static const char* const target_attr[] = { "target", NULL } ;
534static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
535static const char* const alt_attr[] = { "alt", NULL } ;
536static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
537static const char* const href_attrs[] = { "href", NULL } ;
538static const char* const clear_attrs[] = { "clear", NULL } ;
539static const char* const inline_p[] = { INLINE, "p", NULL } ;
540
541static const char* const flow_param[] = { FLOW, "param", NULL } ;
542static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000543 "archive", "alt", "name", "height", "width", "align",
544 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000545static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000546 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000547static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000548 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000549static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
550static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
551static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
552static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000553 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000554static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000555 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
556
557
Daniel Veillard065abe82006-07-03 08:55:04 +0000558static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
559static const char* const col_elt[] = { "col", NULL } ;
560static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
561static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
562static const char* const dl_contents[] = { "dt", "dd", NULL } ;
563static const char* const compact_attr[] = { "compact", NULL } ;
564static const char* const label_attr[] = { "label", NULL } ;
565static const char* const fieldset_contents[] = { FLOW, "legend" } ;
566static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
567static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
568static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
569static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
570static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
571static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
572static const char* const head_attrs[] = { I18N, "profile", NULL } ;
573static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
574static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
575static const char* const version_attr[] = { "version", NULL } ;
576static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
577static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
578static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000579static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000580static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
581static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
582static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
583static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
584static const char* const align_attr[] = { "align", NULL } ;
585static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
586static const char* const map_contents[] = { BLOCK, "area", NULL } ;
587static const char* const name_attr[] = { "name", NULL } ;
588static const char* const action_attr[] = { "action", NULL } ;
589static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
590static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
591static const char* const content_attr[] = { "content", NULL } ;
592static const char* const type_attr[] = { "type", NULL } ;
593static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
594static const char* const object_contents[] = { FLOW, "param", NULL } ;
595static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
596static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
597static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
598static const char* const option_elt[] = { "option", NULL } ;
599static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
601static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
602static const char* const width_attr[] = { "width", NULL } ;
603static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605static const char* const language_attr[] = { "language", NULL } ;
606static const char* const select_content[] = { "optgroup", "option", NULL } ;
607static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200609static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000610static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
612static const char* const tr_elt[] = { "tr", NULL } ;
613static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
615static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617static const char* const tr_contents[] = { "th", "td", NULL } ;
618static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619static const char* const li_elt[] = { "li", NULL } ;
620static const char* const ul_depr[] = { "type", "compact", NULL} ;
621static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000622
623#define DECL (const char**)
624
/*
 * Table of the HTML 4.0 elements known to the parser, one initializer
 * per element, listed alphabetically by element name.
 *
 * Each initializer is positional: the element name, seven integer
 * flags, a human readable description, and then pointers (cast through
 * DECL) to NULL-terminated string lists, or NULL/EMPTY where there is
 * none, together with the name of a default sub-element.
 * NOTE(review): the exact meaning of each position is fixed by the
 * htmlElemDesc structure, which is declared outside this file chunk -
 * check it before editing a row.
 *
 * htmlTagLookup() scans this table linearly and compares the name
 * case-insensitively.
 */
Daniel Veillard22090732001-07-16 00:06:07 +0000625static const htmlElemDesc
626html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000627{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
628 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
629},
630{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632},
633{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
634 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635},
636{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
637 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
638},
639{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
640 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
641},
642{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
644},
645{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
646 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647},
648{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
649 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
650},
651{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
652 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
653},
654{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
656},
657{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
658 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
659},
660{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
661 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
662},
663{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
664 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
665},
666{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
667 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
668},
669{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
670 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
671},
672{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
677},
678{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
679 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
680},
681{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
683},
684{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
685 EMPTY , NULL , DECL col_attrs , NULL, NULL
686},
687{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
688 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
689},
690{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
691 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
692},
693{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
694 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
695},
696{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
697 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
698},
699{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
700 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
701},
702{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
704},
705{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000706 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000707},
708{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
709 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
710},
711{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
712 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
713},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000714{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000715 EMPTY, NULL, DECL embed_attrs, NULL, NULL
716},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000717{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
718 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
719},
720{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
721 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
722},
723{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
724 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
725},
726{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727 EMPTY, NULL, NULL, DECL frame_attrs, NULL
728},
729{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
731},
732{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
742 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
743},
744{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
745 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
746},
747{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
748 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
749},
750{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
751 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
752},
753{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
755},
756{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
757 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
758},
759{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
760 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
761},
762{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
764},
765{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000766 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000767},
768{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
769 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
770},
771{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
772 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
773},
774{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
776},
777{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
779},
780{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
781 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
782},
783{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
785},
786{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
787 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
788},
789{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
791},
792{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000793 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000794},
795{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
796 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
797},
798{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
800},
801{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
803},
804{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805 DECL html_flow, "div", DECL html_attrs, NULL, NULL
806},
807{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
809},
810{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
811 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
812},
813{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000814 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000815},
816{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
818},
819{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000823 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000824},
825{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
827},
828{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
830},
831{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
833},
834{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
838 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
839},
840{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
841 DECL select_content, NULL, DECL select_attrs, NULL, NULL
842},
843{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848},
849{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
850 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851},
852{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
856 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
857},
858{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
859 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
860},
861{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
862 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
863},
864{ "table", 0, 0, 0, 0, 0, 0, 0, "",
865 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
866},
867{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
875},
876{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
877 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
878},
879{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
880 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
881},
882{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
883 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
884},
885{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
886 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
887},
888{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
889 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
890},
891{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893},
894{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
895 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
896},
897{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
898 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
899},
900{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902}
Owen Taylor3473f882001-02-23 17:55:21 +0000903};
904
905/*
Owen Taylor3473f882001-02-23 17:55:21 +0000906 * start tags that imply the end of current element
907 */
/*
 * Layout: a flat sequence of NULL-terminated groups. The first string
 * of a group is the name of a start tag; the strings after it, up to
 * the group's NULL, are the names of open elements which that start tag
 * closes (this is how htmlCheckAutoClose() reads a group). One extra
 * NULL after the last group marks the end of the table, which is where
 * htmlInitAutoClose() stops indexing.
 *
 * htmlInitAutoClose() records at most 99 groups, so adding groups beyond
 * that limit would leave them silently unindexed.
 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000908static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000909"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
910 "dl", "ul", "ol", "menu", "dir", "address", "pre",
911 "listing", "xmp", "head", NULL,
912"head", "p", NULL,
913"title", "p", NULL,
914"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +0000915"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000916"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
917 "pre", "listing", "xmp", "head", "li", NULL,
918"hr", "p", "head", NULL,
919"h1", "p", "head", NULL,
920"h2", "p", "head", NULL,
921"h3", "p", "head", NULL,
922"h4", "p", "head", NULL,
923"h5", "p", "head", NULL,
924"h6", "p", "head", NULL,
925"dir", "p", "head", NULL,
926"address", "p", "head", "ul", NULL,
927"pre", "p", "head", "ul", NULL,
928"listing", "p", "head", NULL,
929"xmp", "p", "head", NULL,
930"blockquote", "p", "head", NULL,
931"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
932 "xmp", "head", NULL,
933"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
934 "head", "dd", NULL,
935"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
936 "head", "dt", NULL,
937"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
938 "listing", "xmp", NULL,
939"ol", "p", "head", "ul", NULL,
940"menu", "p", "head", "ul", NULL,
941"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
942"div", "p", "head", NULL,
943"noscript", "p", "head", NULL,
944"center", "font", "b", "i", "p", "head", NULL,
945"a", "a", NULL,
946"caption", "p", NULL,
947"colgroup", "caption", "colgroup", "col", "p", NULL,
948"col", "caption", "col", "p", NULL,
949"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
950 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +0000951"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
952"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000953"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
954"thead", "caption", "col", "colgroup", NULL,
955"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
956 "tbody", "p", NULL,
957"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
958 "tfoot", "tbody", "p", NULL,
959"optgroup", "option", NULL,
960"option", "option", NULL,
961"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
962 "pre", "listing", "xmp", "a", NULL,
963NULL
964};
965
966/*
967 * The list of HTML elements which are supposed not to have
968 * CDATA content and where a p element will be implied
969 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000970 * TODO: extend that list by reading the HTML SGML DTD on
Owen Taylor3473f882001-02-23 17:55:21 +0000971 * implied paragraph
972 */
/*
 * NULL-terminated; htmlCheckParagraph() compares the current element
 * name against each entry and implies a <p> when one matches.
 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000973static const char *const htmlNoContentElements[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000974 "html",
975 "head",
Owen Taylor3473f882001-02-23 17:55:21 +0000976 NULL
977};
978
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator: htmlIsScriptAttribute() derives the
 * number of entries from sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
1004
Daniel Veillarda2bc3682001-05-03 08:27:20 +00001005/*
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001006 * This table is used by the htmlparser to know what to do with
1007 * broken html pages. By assigning different priorities to different
1008 * elements the parser can decide how to handle extra endtags.
1009 * Endtags are only allowed to close elements with lower or equal
1010 * priority.
1011 */
Daniel Veillarda2bc3682001-05-03 08:27:20 +00001012
/* One row of the end-tag priority table: an element name and its priority. */
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001013typedef struct {
1014 const char *name;
1015 int priority;
1016} elementPriority;
1017
/*
 * Priorities looked up by htmlGetEndPriority(). The last row has a NULL
 * name: it both terminates the search and supplies the priority returned
 * for every element that is not listed explicitly.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001018static const elementPriority htmlEndPriority[] = {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001019 {"div", 150},
1020 {"td", 160},
1021 {"th", 160},
1022 {"tr", 170},
1023 {"thead", 180},
1024 {"tbody", 180},
1025 {"tfoot", 180},
1026 {"table", 190},
1027 {"head", 200},
1028 {"body", 200},
1029 {"html", 220},
1030 {NULL, 100} /* Default priority */
1031};
Owen Taylor3473f882001-02-23 17:55:21 +00001032
/*
 * Index into htmlStartClose: each used slot points at the first string
 * of one group; unused slots are NULL. Filled by htmlInitAutoClose().
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001033static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been built. */
Owen Taylor3473f882001-02-23 17:55:21 +00001034static int htmlStartCloseIndexinitialized = 0;
1035
1036/************************************************************************
1037 * *
1038 * functions to handle HTML specific data *
1039 * *
1040 ************************************************************************/
1041
1042/**
1043 * htmlInitAutoClose:
1044 *
1045 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046 * This is not reentrant. Call xmlInitParser() once before processing in
1047 * case of use in multithreaded programs.
1048 */
1049void
1050htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001051 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001052
1053 if (htmlStartCloseIndexinitialized) return;
1054
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001055 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1056 indx = 0;
1057 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001058 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001059 while (htmlStartClose[i] != NULL) i++;
1060 i++;
1061 }
1062 htmlStartCloseIndexinitialized = 1;
1063}
1064
1065/**
1066 * htmlTagLookup:
1067 * @tag: The tag name in lowercase
1068 *
1069 * Lookup the HTML tag in the ElementTable
1070 *
1071 * Returns the related htmlElemDescPtr or NULL if not found.
1072 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001073const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001074htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001075 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001076
1077 for (i = 0; i < (sizeof(html40ElementTable) /
1078 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001079 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001080 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001081 }
1082 return(NULL);
1083}
1084
1085/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086 * htmlGetEndPriority:
1087 * @name: The name of the element to look up the priority for.
1088 *
1089 * Return value: The "endtag" priority.
1090 **/
1091static int
1092htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001093 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001094
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 while ((htmlEndPriority[i].name != NULL) &&
1096 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001098
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001099 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001100}
1101
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001102
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001103/**
Owen Taylor3473f882001-02-23 17:55:21 +00001104 * htmlCheckAutoClose:
1105 * @newtag: The new tag name
1106 * @oldtag: The old tag name
1107 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001108 * Checks whether the new tag is one of the registered valid tags for
1109 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001110 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1111 *
1112 * Returns 0 if no, 1 if yes.
1113 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001114static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001115htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1116{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 int i, indx;
1118 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001119
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001120 if (htmlStartCloseIndexinitialized == 0)
1121 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001122
1123 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001124 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001125 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001126 if (closed == NULL)
1127 return (0);
1128 if (xmlStrEqual(BAD_CAST * closed, newtag))
1129 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001130 }
1131
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001132 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001133 i++;
1134 while (htmlStartClose[i] != NULL) {
1135 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001136 return (1);
1137 }
1138 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001139 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001140 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001141}
1142
1143/**
1144 * htmlAutoCloseOnClose:
1145 * @ctxt: an HTML parser context
1146 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001147 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001148 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001149 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001150 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001151static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001152htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153{
1154 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001155 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001156
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001157 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001158
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001159 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001160
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001161 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162 break;
1163 /*
1164 * A missplaced endtag can only close elements with lower
1165 * or equal priority, so if we find an element with higher
1166 * priority before we find an element with
1167 * matching name, we just ignore this endtag
1168 */
1169 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001171 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001172 if (i < 0)
1173 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001174
1175 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001176 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001177 if ((info != NULL) && (info->endTag == 3)) {
1178 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001180 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001181 }
1182 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001184 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001185 }
1186}
1187
1188/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 * htmlAutoCloseOnEnd:
1190 * @ctxt: an HTML parser context
1191 *
1192 * Close all remaining tags at the end of the stream
1193 */
1194static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001195htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1196{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001198
William M. Brack899e64a2003-09-26 18:03:42 +00001199 if (ctxt->nameNr == 0)
1200 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001201 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001202 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001204 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001205 }
1206}
1207
1208/**
Owen Taylor3473f882001-02-23 17:55:21 +00001209 * htmlAutoClose:
1210 * @ctxt: an HTML parser context
1211 * @newtag: The new tag name or NULL
1212 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001213 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001214 * The list is kept in htmlStartClose array. This function is
1215 * called when a new tag has been detected and generates the
1216 * appropriates closes if possible/needed.
1217 * If newtag is NULL this mean we are at the end of the resource
1218 * and we should check
1219 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001220static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001221htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001223 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001224 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001225 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001227 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001228 }
1229 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001230 htmlAutoCloseOnEnd(ctxt);
1231 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001232 }
1233 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001234 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1235 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1236 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001237 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001239 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001240 }
Owen Taylor3473f882001-02-23 17:55:21 +00001241}
1242
1243/**
1244 * htmlAutoCloseTag:
1245 * @doc: the HTML document
1246 * @name: The tag name
1247 * @elem: the HTML element
1248 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001249 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001250 * The list is kept in htmlStartClose array. This function checks
1251 * if the element or one of it's children would autoclose the
1252 * given tag.
1253 *
1254 * Returns 1 if autoclose, 0 otherwise
1255 */
1256int
1257htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1258 htmlNodePtr child;
1259
1260 if (elem == NULL) return(1);
1261 if (xmlStrEqual(name, elem->name)) return(0);
1262 if (htmlCheckAutoClose(elem->name, name)) return(1);
1263 child = elem->children;
1264 while (child != NULL) {
1265 if (htmlAutoCloseTag(doc, name, child)) return(1);
1266 child = child->next;
1267 }
1268 return(0);
1269}
1270
1271/**
1272 * htmlIsAutoClosed:
1273 * @doc: the HTML document
1274 * @elem: the HTML element
1275 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001276 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001277 * The list is kept in htmlStartClose array. This function checks
1278 * if a tag is autoclosed by one of it's child
1279 *
1280 * Returns 1 if autoclosed, 0 otherwise
1281 */
1282int
1283htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1284 htmlNodePtr child;
1285
1286 if (elem == NULL) return(1);
1287 child = elem->children;
1288 while (child != NULL) {
1289 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290 child = child->next;
1291 }
1292 return(0);
1293}
1294
1295/**
1296 * htmlCheckImplied:
1297 * @ctxt: an HTML parser context
1298 * @newtag: The new tag name
1299 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001300 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001301 * called when a new tag has been detected and generates the
1302 * appropriates implicit tags if missing
1303 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001304static void
Owen Taylor3473f882001-02-23 17:55:21 +00001305htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306 if (!htmlOmittedDefaultValue)
1307 return;
1308 if (xmlStrEqual(newtag, BAD_CAST"html"))
1309 return;
1310 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001311 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001312 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314 }
1315 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1316 return;
1317 if ((ctxt->nameNr <= 1) &&
1318 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1319 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1320 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1321 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1322 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1323 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1324 /*
1325 * dropped OBJECT ... i you put it first BODY will be
1326 * assumed !
1327 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001328 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001329 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334 int i;
1335 for (i = 0;i < ctxt->nameNr;i++) {
1336 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337 return;
1338 }
1339 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340 return;
1341 }
1342 }
1343
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001344 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001345 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347 }
1348}
1349
1350/**
1351 * htmlCheckParagraph
1352 * @ctxt: an HTML parser context
1353 *
1354 * Check whether a p element need to be implied before inserting
1355 * characters in the current element.
1356 *
1357 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1358 * in case of error.
1359 */
1360
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001361static int
Owen Taylor3473f882001-02-23 17:55:21 +00001362htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1363 const xmlChar *tag;
1364 int i;
1365
1366 if (ctxt == NULL)
1367 return(-1);
1368 tag = ctxt->name;
1369 if (tag == NULL) {
1370 htmlAutoClose(ctxt, BAD_CAST"p");
1371 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001372 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001373 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1375 return(1);
1376 }
1377 if (!htmlOmittedDefaultValue)
1378 return(0);
1379 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001381 htmlAutoClose(ctxt, BAD_CAST"p");
1382 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1386 return(1);
1387 }
1388 }
1389 return(0);
1390}
1391
1392/**
1393 * htmlIsScriptAttribute:
1394 * @name: an attribute name
1395 *
1396 * Check if an attribute is of content type Script
1397 *
1398 * Returns 1 is the attribute is a script 0 otherwise
1399 */
1400int
1401htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001402 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001403
1404 if (name == NULL)
1405 return(0);
1406 /*
1407 * all script attributes start with 'on'
1408 */
1409 if ((name[0] != 'o') || (name[1] != 'n'))
1410 return(0);
1411 for (i = 0;
1412 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413 i++) {
1414 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415 return(1);
1416 }
1417 return(0);
1418}
1419
1420/************************************************************************
1421 * *
1422 * The list of HTML predefined entities *
1423 * *
1424 ************************************************************************/
1425
1426
Daniel Veillard22090732001-07-16 00:06:07 +00001427static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001428/*
1429 * the 4 absolute ones, plus apostrophe.
1430 */
1431{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1432{ 38, "amp", "ampersand, U+0026 ISOnum" },
1433{ 39, "apos", "single quote" },
1434{ 60, "lt", "less-than sign, U+003C ISOnum" },
1435{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1436
1437/*
1438 * A bunch still in the 128-255 range
1439 * Replacing them depend really on the charset used.
1440 */
1441{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1442{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1443{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1444{ 163, "pound","pound sign, U+00A3 ISOnum" },
1445{ 164, "curren","currency sign, U+00A4 ISOnum" },
1446{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1447{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1448{ 167, "sect", "section sign, U+00A7 ISOnum" },
1449{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1450{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1451{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1452{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1453{ 172, "not", "not sign, U+00AC ISOnum" },
1454{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1455{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1456{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1457{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1458{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1459{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1460{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1461{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1462{ 181, "micro","micro sign, U+00B5 ISOnum" },
1463{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1464{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1465{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1466{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1467{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1468{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1469{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1470{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1471{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1472{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1473{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1474{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1475{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1476{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1477{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1478{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1479{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1480{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1481{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1482{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1483{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1484{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1485{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1486{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1487{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1488{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1489{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1490{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1491{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1492{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1493{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1494{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1495{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1496{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1497{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1498{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1499{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1500{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1501{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1502{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1503{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1504{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1505{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1506{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1507{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1508{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1509{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1510{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1511{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1512{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1513{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1514{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1515{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1516{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1517{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1518{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1519{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1520{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1521{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1522{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1523{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1524{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1525{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1526{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1527{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1528{ 247, "divide","division sign, U+00F7 ISOnum" },
1529{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1530{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1531{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1532{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1533{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1534{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1535{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1536{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1537
1538{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1539{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1540{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1541{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1542{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1543
1544/*
1545 * Anything below should really be kept as entities references
1546 */
1547{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1548
1549{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1550{ 732, "tilde","small tilde, U+02DC ISOdia" },
1551
1552{ 913, "Alpha","greek capital letter alpha, U+0391" },
1553{ 914, "Beta", "greek capital letter beta, U+0392" },
1554{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1555{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1556{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1557{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1558{ 919, "Eta", "greek capital letter eta, U+0397" },
1559{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1560{ 921, "Iota", "greek capital letter iota, U+0399" },
1561{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001562{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001563{ 924, "Mu", "greek capital letter mu, U+039C" },
1564{ 925, "Nu", "greek capital letter nu, U+039D" },
1565{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1566{ 927, "Omicron","greek capital letter omicron, U+039F" },
1567{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1568{ 929, "Rho", "greek capital letter rho, U+03A1" },
1569{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1570{ 932, "Tau", "greek capital letter tau, U+03A4" },
1571{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1572{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1573{ 935, "Chi", "greek capital letter chi, U+03A7" },
1574{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1575{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1576
1577{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1578{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1579{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1580{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1581{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1582{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1583{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1584{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1585{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1586{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1587{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1588{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1589{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1590{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1591{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1592{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1593{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1594{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1595{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1596{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1597{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1598{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1599{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1600{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1601{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1602{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1603{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1604{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1605
1606{ 8194, "ensp", "en space, U+2002 ISOpub" },
1607{ 8195, "emsp", "em space, U+2003 ISOpub" },
1608{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1609{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1610{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1611{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1612{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1613{ 8211, "ndash","en dash, U+2013 ISOpub" },
1614{ 8212, "mdash","em dash, U+2014 ISOpub" },
1615{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1616{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1617{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1618{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1619{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1620{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1621{ 8224, "dagger","dagger, U+2020 ISOpub" },
1622{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1623
1624{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1625{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1626
1627{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1628
1629{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1630{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1631
1632{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1633{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1634
1635{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1636{ 8260, "frasl","fraction slash, U+2044 NEW" },
1637
1638{ 8364, "euro", "euro sign, U+20AC NEW" },
1639
1640{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1641{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1642{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1643{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1644{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1645{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1646{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1647{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1648{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1649{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1650{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1651{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1652{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1653{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1654{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1655{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1656
1657{ 8704, "forall","for all, U+2200 ISOtech" },
1658{ 8706, "part", "partial differential, U+2202 ISOtech" },
1659{ 8707, "exist","there exists, U+2203 ISOtech" },
1660{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1661{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1662{ 8712, "isin", "element of, U+2208 ISOtech" },
1663{ 8713, "notin","not an element of, U+2209 ISOtech" },
1664{ 8715, "ni", "contains as member, U+220B ISOtech" },
1665{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001666{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001667{ 8722, "minus","minus sign, U+2212 ISOtech" },
1668{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1669{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1670{ 8733, "prop", "proportional to, U+221D ISOtech" },
1671{ 8734, "infin","infinity, U+221E ISOtech" },
1672{ 8736, "ang", "angle, U+2220 ISOamso" },
1673{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1674{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1675{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1676{ 8746, "cup", "union = cup, U+222A ISOtech" },
1677{ 8747, "int", "integral, U+222B ISOtech" },
1678{ 8756, "there4","therefore, U+2234 ISOtech" },
1679{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1680{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1681{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1682{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1683{ 8801, "equiv","identical to, U+2261 ISOtech" },
1684{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1685{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1686{ 8834, "sub", "subset of, U+2282 ISOtech" },
1687{ 8835, "sup", "superset of, U+2283 ISOtech" },
1688{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1689{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1690{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1691{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1692{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1693{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1694{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1695{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1696{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1697{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1698{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1699{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1700{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1701{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1702
1703{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1704{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1705{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1706{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1707
1708};
1709
1710/************************************************************************
1711 * *
1712 * Commodity functions to handle entities *
1713 * *
1714 ************************************************************************/
1715
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the variable named <buffer>_size and reallocates @buffer to
 * that many xmlChar through xmlRealloc().
 *
 * The expansion refers to names it does not receive as arguments, so the
 * expansion site must provide:
 *   - a variable `ctxt` usable as first argument of htmlErrMemory();
 *   - a variable named after the buffer with a `_size` suffix.
 *
 * On allocation failure the error is reported with htmlErrMemory(), the
 * old buffer is released with xmlFree() and the *enclosing function*
 * executes return(NULL); the macro is therefore only usable in functions
 * where returning NULL is valid.
 *
 * NOTE(review): the size is doubled without any overflow check, and the
 * expansion is a bare { } block rather than do { } while (0), so it is
 * not safe as the unbraced body of an if that has an else.
 * NOTE(review): presumably xmlRealloc() may move the block like realloc();
 * callers holding pointers into the buffer should recompute them.
 */
#define growBuffer(buffer) { \
    xmlChar *tmp; \
    buffer##_size *= 2; \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) { \
        htmlErrMemory(ctxt, "growing buffer\n"); \
        xmlFree(buffer); \
        return(NULL); \
    } \
    buffer = tmp; \
}
1730
1731/**
1732 * htmlEntityLookup:
1733 * @name: the entity name
1734 *
1735 * Lookup the given entity in EntitiesTable
1736 *
1737 * TODO: the linear scan is really ugly, an hash table is really needed.
1738 *
1739 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1740 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001741const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001742htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001743 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001744
1745 for (i = 0;i < (sizeof(html40EntitiesTable)/
1746 sizeof(html40EntitiesTable[0]));i++) {
1747 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001748 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001749 }
1750 }
1751 return(NULL);
1752}
1753
1754/**
1755 * htmlEntityValueLookup:
1756 * @value: the entity's unicode value
1757 *
1758 * Lookup the given entity in EntitiesTable
1759 *
1760 * TODO: the linear scan is really ugly, an hash table is really needed.
1761 *
1762 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1763 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001764const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001765htmlEntityValueLookup(unsigned int value) {
1766 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001767
1768 for (i = 0;i < (sizeof(html40EntitiesTable)/
1769 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001770 if (html40EntitiesTable[i].value >= value) {
1771 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001772 break;
William M. Brack78637da2003-07-31 14:47:38 +00001773 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001774 }
Owen Taylor3473f882001-02-23 17:55:21 +00001775 }
1776 return(NULL);
1777}
1778
1779/**
1780 * UTF8ToHtml:
1781 * @out: a pointer to an array of bytes to store the result
1782 * @outlen: the length of @out
1783 * @in: a pointer to an array of UTF-8 chars
1784 * @inlen: the length of @in
1785 *
1786 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1787 * plus HTML entities block of chars out.
1788 *
1789 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001791 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001792 * The value of @outlen after return is the number of octets consumed.
1793 */
1794int
1795UTF8ToHtml(unsigned char* out, int *outlen,
1796 const unsigned char* in, int *inlen) {
1797 const unsigned char* processed = in;
1798 const unsigned char* outend;
1799 const unsigned char* outstart = out;
1800 const unsigned char* instart = in;
1801 const unsigned char* inend;
1802 unsigned int c, d;
1803 int trailing;
1804
Daniel Veillardce682bc2004-11-05 17:22:25 +00001805 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001806 if (in == NULL) {
1807 /*
1808 * initialization nothing to do
1809 */
1810 *outlen = 0;
1811 *inlen = 0;
1812 return(0);
1813 }
1814 inend = in + (*inlen);
1815 outend = out + (*outlen);
1816 while (in < inend) {
1817 d = *in++;
1818 if (d < 0x80) { c= d; trailing= 0; }
1819 else if (d < 0xC0) {
1820 /* trailing byte in leading position */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1825 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1826 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1827 else {
1828 /* no chance for this in Ascii */
1829 *outlen = out - outstart;
1830 *inlen = processed - instart;
1831 return(-2);
1832 }
1833
1834 if (inend - in < trailing) {
1835 break;
1836 }
1837
1838 for ( ; trailing; trailing--) {
1839 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1840 break;
1841 c <<= 6;
1842 c |= d & 0x3F;
1843 }
1844
1845 /* assertion: c is a single UTF-4 value */
1846 if (c < 0x80) {
1847 if (out + 1 >= outend)
1848 break;
1849 *out++ = c;
1850 } else {
1851 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001852 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001853 const char *cp;
1854 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001855
1856 /*
1857 * Try to lookup a predefined HTML entity for it
1858 */
1859
1860 ent = htmlEntityValueLookup(c);
1861 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001862 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1863 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001864 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001865 else
1866 cp = ent->name;
1867 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001868 if (out + 2 + len >= outend)
1869 break;
1870 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001871 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001872 out += len;
1873 *out++ = ';';
1874 }
1875 processed = in;
1876 }
1877 *outlen = out - outstart;
1878 *inlen = processed - instart;
1879 return(0);
1880}
1881
1882/**
1883 * htmlEncodeEntities:
1884 * @out: a pointer to an array of bytes to store the result
1885 * @outlen: the length of @out
1886 * @in: a pointer to an array of UTF-8 chars
1887 * @inlen: the length of @in
1888 * @quoteChar: the quote character to escape (' or ") or zero.
1889 *
1890 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1891 * plus HTML entities block of chars out.
1892 *
1893 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001895 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001896 * The value of @outlen after return is the number of octets consumed.
1897 */
1898int
1899htmlEncodeEntities(unsigned char* out, int *outlen,
1900 const unsigned char* in, int *inlen, int quoteChar) {
1901 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001902 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001903 const unsigned char* outstart = out;
1904 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001905 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001906 unsigned int c, d;
1907 int trailing;
1908
Daniel Veillardce682bc2004-11-05 17:22:25 +00001909 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1910 return(-1);
1911 outend = out + (*outlen);
1912 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001913 while (in < inend) {
1914 d = *in++;
1915 if (d < 0x80) { c= d; trailing= 0; }
1916 else if (d < 0xC0) {
1917 /* trailing byte in leading position */
1918 *outlen = out - outstart;
1919 *inlen = processed - instart;
1920 return(-2);
1921 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1922 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1923 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1924 else {
1925 /* no chance for this in Ascii */
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(-2);
1929 }
1930
1931 if (inend - in < trailing)
1932 break;
1933
1934 while (trailing--) {
1935 if (((d= *in++) & 0xC0) != 0x80) {
1936 *outlen = out - outstart;
1937 *inlen = processed - instart;
1938 return(-2);
1939 }
1940 c <<= 6;
1941 c |= d & 0x3F;
1942 }
1943
1944 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001945 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001947 if (out >= outend)
1948 break;
1949 *out++ = c;
1950 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001951 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001952 const char *cp;
1953 char nbuf[16];
1954 int len;
1955
1956 /*
1957 * Try to lookup a predefined HTML entity for it
1958 */
1959 ent = htmlEntityValueLookup(c);
1960 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001961 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001962 cp = nbuf;
1963 }
1964 else
1965 cp = ent->name;
1966 len = strlen(cp);
1967 if (out + 2 + len > outend)
1968 break;
1969 *out++ = '&';
1970 memcpy(out, cp, len);
1971 out += len;
1972 *out++ = ';';
1973 }
1974 processed = in;
1975 }
1976 *outlen = out - outstart;
1977 *inlen = processed - instart;
1978 return(0);
1979}
1980
Owen Taylor3473f882001-02-23 17:55:21 +00001981/************************************************************************
1982 * *
1983 * Commodity functions to handle streams *
1984 * *
1985 ************************************************************************/
1986
1987/**
Owen Taylor3473f882001-02-23 17:55:21 +00001988 * htmlNewInputStream:
1989 * @ctxt: an HTML parser context
1990 *
1991 * Create a new input stream structure
1992 * Returns the new input stream or NULL
1993 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001994static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001995htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996 htmlParserInputPtr input;
1997
1998 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002000 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002001 return(NULL);
2002 }
2003 memset(input, 0, sizeof(htmlParserInput));
2004 input->filename = NULL;
2005 input->directory = NULL;
2006 input->base = NULL;
2007 input->cur = NULL;
2008 input->buf = NULL;
2009 input->line = 1;
2010 input->col = 1;
2011 input->buf = NULL;
2012 input->free = NULL;
2013 input->version = NULL;
2014 input->consumed = 0;
2015 input->length = 0;
2016 return(input);
2017}
2018
2019
2020/************************************************************************
2021 * *
2022 * Commodity functions, cleanup needed ? *
2023 * *
2024 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002039
2040/**
2041 * areBlanks:
2042 * @ctxt: an HTML parser context
2043 * @str: a xmlChar *
2044 * @len: the size of @str
2045 *
2046 * Is this a sequence of blank chars that one can ignore ?
2047 *
2048 * Returns 1 if ignorable 0 otherwise.
2049 */
2050
2051static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002052 unsigned int i;
2053 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002054 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002055 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002056
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002057 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002058 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002059
2060 if (CUR == 0) return(1);
2061 if (CUR != '<') return(0);
2062 if (ctxt->name == NULL)
2063 return(1);
2064 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2065 return(1);
2066 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2067 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002068
2069 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2070 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071 dtd = xmlGetIntSubset(ctxt->myDoc);
2072 if (dtd != NULL && dtd->ExternalID != NULL) {
2073 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2074 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2075 return(1);
2076 }
2077 }
2078
Owen Taylor3473f882001-02-23 17:55:21 +00002079 if (ctxt->node == NULL) return(0);
2080 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002081 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002083 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002084 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002086 /* keep ws in constructs like ...<b> </b>...
2087 for all tags "b" allowing PCDATA */
2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090 return(0);
2091 }
2092 }
Owen Taylor3473f882001-02-23 17:55:21 +00002093 } else if (xmlNodeIsText(lastChild)) {
2094 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002095 } else {
2096 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097 for all tags "p" allowing PCDATA */
2098 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100 return(0);
2101 }
2102 }
Owen Taylor3473f882001-02-23 17:55:21 +00002103 }
2104 return(1);
2105}
2106
2107/**
Owen Taylor3473f882001-02-23 17:55:21 +00002108 * htmlNewDocNoDtD:
2109 * @URI: URI for the dtd, or NULL
2110 * @ExternalID: the external ID of the DTD, or NULL
2111 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002112 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2113 * are NULL
2114 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002115 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002116 */
2117htmlDocPtr
2118htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2119 xmlDocPtr cur;
2120
2121 /*
2122 * Allocate a new document and fill the fields.
2123 */
2124 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2125 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002126 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002127 return(NULL);
2128 }
2129 memset(cur, 0, sizeof(xmlDoc));
2130
2131 cur->type = XML_HTML_DOCUMENT_NODE;
2132 cur->version = NULL;
2133 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002134 cur->doc = cur;
2135 cur->name = NULL;
2136 cur->children = NULL;
2137 cur->extSubset = NULL;
2138 cur->oldNs = NULL;
2139 cur->encoding = NULL;
2140 cur->standalone = 1;
2141 cur->compression = 0;
2142 cur->ids = NULL;
2143 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002144 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002145 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002146 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002147 if ((ExternalID != NULL) ||
2148 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002149 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002150 return(cur);
2151}
2152
2153/**
2154 * htmlNewDoc:
2155 * @URI: URI for the dtd, or NULL
2156 * @ExternalID: the external ID of the DTD, or NULL
2157 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002158 * Creates a new HTML document
2159 *
Owen Taylor3473f882001-02-23 17:55:21 +00002160 * Returns a new document
2161 */
2162htmlDocPtr
2163htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2164 if ((URI == NULL) && (ExternalID == NULL))
2165 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002166 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2167 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002168
2169 return(htmlNewDocNoDtD(URI, ExternalID));
2170}
2171
2172
2173/************************************************************************
2174 * *
2175 * The parser itself *
2176 * Relates to http://www.w3.org/TR/html40 *
2177 * *
2178 ************************************************************************/
2179
2180/************************************************************************
2181 * *
2182 * The parser itself *
2183 * *
2184 ************************************************************************/
2185
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002186static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002187
Owen Taylor3473f882001-02-23 17:55:21 +00002188/**
2189 * htmlParseHTMLName:
2190 * @ctxt: an HTML parser context
2191 *
2192 * parse an HTML tag or attribute name, note that we convert it to lowercase
2193 * since HTML names are not case-sensitive.
2194 *
2195 * Returns the Tag Name parsed or NULL
2196 */
2197
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002198static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002199htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002200 int i = 0;
2201 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2202
William M. Brackd1757ab2004-10-02 22:07:48 +00002203 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002204 (CUR != ':')) return(NULL);
2205
2206 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002207 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002208 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2209 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2210 else loc[i] = CUR;
2211 i++;
2212
2213 NEXT;
2214 }
2215
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002216 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002217}
2218
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002219
2220/**
2221 * htmlParseHTMLName_nonInvasive:
2222 * @ctxt: an HTML parser context
2223 *
2224 * parse an HTML tag or attribute name, note that we convert it to lowercase
2225 * since HTML names are not case-sensitive, this doesn't consume the data
2226 * from the stream, it's a look-ahead
2227 *
2228 * Returns the Tag Name parsed or NULL
2229 */
2230
2231static const xmlChar *
2232htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2233 int i = 0;
2234 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2235
2236 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2237 (NXT(1) != ':')) return(NULL);
2238
2239 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2240 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2241 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2242 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2243 else loc[i] = NXT(1+i);
2244 i++;
2245 }
2246
2247 return(xmlDictLookup(ctxt->dict, loc, i));
2248}
2249
2250
Owen Taylor3473f882001-02-23 17:55:21 +00002251/**
2252 * htmlParseName:
2253 * @ctxt: an HTML parser context
2254 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002255 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002256 *
2257 * Returns the Name parsed or NULL
2258 */
2259
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002260static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002261htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002262 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002263 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002264 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002265
2266 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002267
2268 /*
2269 * Accelerator for simple ASCII names
2270 */
2271 in = ctxt->input->cur;
2272 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2273 ((*in >= 0x41) && (*in <= 0x5A)) ||
2274 (*in == '_') || (*in == ':')) {
2275 in++;
2276 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2277 ((*in >= 0x41) && (*in <= 0x5A)) ||
2278 ((*in >= 0x30) && (*in <= 0x39)) ||
2279 (*in == '_') || (*in == '-') ||
2280 (*in == ':') || (*in == '.'))
2281 in++;
2282 if ((*in > 0) && (*in < 0x80)) {
2283 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002284 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002285 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002286 ctxt->nbChars += count;
2287 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002288 return(ret);
2289 }
2290 }
2291 return(htmlParseNameComplex(ctxt));
2292}
2293
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002294static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002295htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002296 int len = 0, l;
2297 int c;
2298 int count = 0;
2299
2300 /*
2301 * Handler for more complex cases
2302 */
2303 GROW;
2304 c = CUR_CHAR(l);
2305 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2306 (!IS_LETTER(c) && (c != '_') &&
2307 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002308 return(NULL);
2309 }
2310
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002311 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2312 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2313 (c == '.') || (c == '-') ||
2314 (c == '_') || (c == ':') ||
2315 (IS_COMBINING(c)) ||
2316 (IS_EXTENDER(c)))) {
2317 if (count++ > 100) {
2318 count = 0;
2319 GROW;
2320 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002321 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002322 NEXTL(l);
2323 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002324 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002325 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002326}
2327
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002328
Owen Taylor3473f882001-02-23 17:55:21 +00002329/**
2330 * htmlParseHTMLAttribute:
2331 * @ctxt: an HTML parser context
2332 * @stop: a char stop value
2333 *
2334 * parse an HTML attribute value till the stop (quote), if
2335 * stop is 0 then it stops at the first space
2336 *
2337 * Returns the attribute parsed or NULL
2338 */
2339
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002340static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002341htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2342 xmlChar *buffer = NULL;
2343 int buffer_size = 0;
2344 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002345 const xmlChar *name = NULL;
2346 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002347 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002348
2349 /*
2350 * allocate a translation buffer.
2351 */
2352 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002353 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002354 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002355 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002356 return(NULL);
2357 }
2358 out = buffer;
2359
2360 /*
2361 * Ok loop until we reach one of the ending chars
2362 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002363 while ((CUR != 0) && (CUR != stop)) {
2364 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002365 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002366 if (CUR == '&') {
2367 if (NXT(1) == '#') {
2368 unsigned int c;
2369 int bits;
2370
2371 c = htmlParseCharRef(ctxt);
2372 if (c < 0x80)
2373 { *out++ = c; bits= -6; }
2374 else if (c < 0x800)
2375 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2376 else if (c < 0x10000)
2377 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2378 else
2379 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2380
2381 for ( ; bits >= 0; bits-= 6) {
2382 *out++ = ((c >> bits) & 0x3F) | 0x80;
2383 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002384
2385 if (out - buffer > buffer_size - 100) {
2386 int indx = out - buffer;
2387
2388 growBuffer(buffer);
2389 out = &buffer[indx];
2390 }
Owen Taylor3473f882001-02-23 17:55:21 +00002391 } else {
2392 ent = htmlParseEntityRef(ctxt, &name);
2393 if (name == NULL) {
2394 *out++ = '&';
2395 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002396 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002397
2398 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002399 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002400 }
2401 } else if (ent == NULL) {
2402 *out++ = '&';
2403 cur = name;
2404 while (*cur != 0) {
2405 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002406 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002407
2408 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002409 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002410 }
2411 *out++ = *cur++;
2412 }
Owen Taylor3473f882001-02-23 17:55:21 +00002413 } else {
2414 unsigned int c;
2415 int bits;
2416
2417 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002418 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002419
2420 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002421 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002422 }
Daniel Veillard48519092006-10-17 15:56:35 +00002423 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002424 if (c < 0x80)
2425 { *out++ = c; bits= -6; }
2426 else if (c < 0x800)
2427 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2428 else if (c < 0x10000)
2429 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2430 else
2431 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2432
2433 for ( ; bits >= 0; bits-= 6) {
2434 *out++ = ((c >> bits) & 0x3F) | 0x80;
2435 }
Owen Taylor3473f882001-02-23 17:55:21 +00002436 }
2437 }
2438 } else {
2439 unsigned int c;
2440 int bits, l;
2441
2442 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002443 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002444
2445 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002446 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002447 }
2448 c = CUR_CHAR(l);
2449 if (c < 0x80)
2450 { *out++ = c; bits= -6; }
2451 else if (c < 0x800)
2452 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2453 else if (c < 0x10000)
2454 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2455 else
2456 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2457
2458 for ( ; bits >= 0; bits-= 6) {
2459 *out++ = ((c >> bits) & 0x3F) | 0x80;
2460 }
2461 NEXT;
2462 }
2463 }
2464 *out++ = 0;
2465 return(buffer);
2466}
2467
2468/**
Owen Taylor3473f882001-02-23 17:55:21 +00002469 * htmlParseEntityRef:
2470 * @ctxt: an HTML parser context
2471 * @str: location to store the entity name
2472 *
2473 * parse an HTML ENTITY references
2474 *
2475 * [68] EntityRef ::= '&' Name ';'
2476 *
2477 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2478 * if non-NULL *str will have to be freed by the caller.
2479 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002480const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002481htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2482 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002483 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002484
2485 if (str != NULL) *str = NULL;
2486 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002487
2488 if (CUR == '&') {
2489 NEXT;
2490 name = htmlParseName(ctxt);
2491 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002492 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2493 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002494 } else {
2495 GROW;
2496 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002497 if (str != NULL)
2498 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002499
2500 /*
2501 * Lookup the entity in the table.
2502 */
2503 ent = htmlEntityLookup(name);
2504 if (ent != NULL) /* OK that's ugly !!! */
2505 NEXT;
2506 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002507 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2508 "htmlParseEntityRef: expecting ';'\n",
2509 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002510 if (str != NULL)
2511 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002512 }
2513 }
2514 }
2515 return(ent);
2516}
2517
2518/**
2519 * htmlParseAttValue:
2520 * @ctxt: an HTML parser context
2521 *
2522 * parse a value for an attribute
2523 * Note: the parser won't do substitution of entities here, this
2524 * will be handled later in xmlStringGetNodeList, unless it was
2525 * asked for ctxt->replaceEntities != 0
2526 *
2527 * Returns the AttValue parsed or NULL.
2528 */
2529
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002530static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002531htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2532 xmlChar *ret = NULL;
2533
2534 if (CUR == '"') {
2535 NEXT;
2536 ret = htmlParseHTMLAttribute(ctxt, '"');
2537 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002538 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2539 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002540 } else
2541 NEXT;
2542 } else if (CUR == '\'') {
2543 NEXT;
2544 ret = htmlParseHTMLAttribute(ctxt, '\'');
2545 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002546 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2547 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002548 } else
2549 NEXT;
2550 } else {
2551 /*
2552 * That's an HTMLism, the attribute value may not be quoted
2553 */
2554 ret = htmlParseHTMLAttribute(ctxt, 0);
2555 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002556 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2557 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002558 }
2559 }
2560 return(ret);
2561}
2562
2563/**
2564 * htmlParseSystemLiteral:
2565 * @ctxt: an HTML parser context
2566 *
2567 * parse an HTML Literal
2568 *
2569 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2570 *
2571 * Returns the SystemLiteral parsed or NULL
2572 */
2573
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002574static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002575htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2576 const xmlChar *q;
2577 xmlChar *ret = NULL;
2578
2579 if (CUR == '"') {
2580 NEXT;
2581 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002582 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002583 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002584 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002585 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2586 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002587 } else {
2588 ret = xmlStrndup(q, CUR_PTR - q);
2589 NEXT;
2590 }
2591 } else if (CUR == '\'') {
2592 NEXT;
2593 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002594 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002595 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002596 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002597 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2598 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002599 } else {
2600 ret = xmlStrndup(q, CUR_PTR - q);
2601 NEXT;
2602 }
2603 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002604 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2605 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002606 }
2607
2608 return(ret);
2609}
2610
2611/**
2612 * htmlParsePubidLiteral:
2613 * @ctxt: an HTML parser context
2614 *
2615 * parse an HTML public literal
2616 *
2617 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2618 *
2619 * Returns the PubidLiteral parsed or NULL.
2620 */
2621
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002622static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002623htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2624 const xmlChar *q;
2625 xmlChar *ret = NULL;
2626 /*
2627 * Name ::= (Letter | '_') (NameChar)*
2628 */
2629 if (CUR == '"') {
2630 NEXT;
2631 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002632 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002633 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002634 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2635 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002636 } else {
2637 ret = xmlStrndup(q, CUR_PTR - q);
2638 NEXT;
2639 }
2640 } else if (CUR == '\'') {
2641 NEXT;
2642 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002643 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002644 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002645 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002646 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2647 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002648 } else {
2649 ret = xmlStrndup(q, CUR_PTR - q);
2650 NEXT;
2651 }
2652 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002653 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2654 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002655 }
2656
2657 return(ret);
2658}
2659
2660/**
2661 * htmlParseScript:
2662 * @ctxt: an HTML parser context
2663 *
2664 * parse the content of an HTML SCRIPT or STYLE element
2665 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2666 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2667 * http://www.w3.org/TR/html4/types.html#type-script
2668 * http://www.w3.org/TR/html4/types.html#h-6.15
2669 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2670 *
2671 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2672 * element and the value of intrinsic event attributes. User agents must
2673 * not evaluate script data as HTML markup but instead must pass it on as
2674 * data to a script engine.
2675 * NOTES:
2676 * - The content is passed like CDATA
2677 * - the attributes for style and scripting "onXXX" are also described
2678 * as CDATA but SGML allows entities references in attributes so their
2679 * processing is identical as other attributes
2680 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002681static void
Owen Taylor3473f882001-02-23 17:55:21 +00002682htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002683 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002684 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002685 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002686
2687 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002688 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002689 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002690 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002691 /*
2692 * One should break here, the specification is clear:
2693 * Authors should therefore escape "</" within the content.
2694 * Escape mechanisms are specific to each scripting or
2695 * style sheet language.
2696 *
2697 * In recovery mode, only break if end tag match the
2698 * current tag, effectively ignoring all tags inside the
2699 * script/style block and treating the entire block as
2700 * CDATA.
2701 */
2702 if (ctxt->recovery) {
2703 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2704 xmlStrlen(ctxt->name)) == 0)
2705 {
2706 break; /* while */
2707 } else {
2708 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002709 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002710 ctxt->name, NULL);
2711 }
2712 } else {
2713 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2714 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2715 {
2716 break; /* while */
2717 }
2718 }
Owen Taylor3473f882001-02-23 17:55:21 +00002719 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002720 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002721 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2722 if (ctxt->sax->cdataBlock!= NULL) {
2723 /*
2724 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2725 */
2726 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002727 } else if (ctxt->sax->characters != NULL) {
2728 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002729 }
2730 nbchar = 0;
2731 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002732 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002733 NEXTL(l);
2734 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002735 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002736
Daniel Veillard68716a72006-10-16 09:32:17 +00002737 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002738 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2739 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002740 NEXT;
2741 }
2742
2743 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2744 if (ctxt->sax->cdataBlock!= NULL) {
2745 /*
2746 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2747 */
2748 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002749 } else if (ctxt->sax->characters != NULL) {
2750 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002751 }
2752 }
2753}
2754
2755
2756/**
2757 * htmlParseCharData:
2758 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002759 *
2760 * parse a CharData section.
2761 * if we are within a CDATA section ']]>' marks an end of section.
2762 *
2763 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2764 */
2765
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002766static void
2767htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002768 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2769 int nbchar = 0;
2770 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002771 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002772
2773 SHRINK;
2774 cur = CUR_CHAR(l);
2775 while (((cur != '<') || (ctxt->token == '<')) &&
2776 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002777 (cur != 0)) {
2778 if (!(IS_CHAR(cur))) {
2779 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2780 "Invalid char in CDATA 0x%X\n", cur);
2781 } else {
2782 COPY_BUF(l,buf,nbchar,cur);
2783 }
Owen Taylor3473f882001-02-23 17:55:21 +00002784 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2785 /*
2786 * Ok the segment is to be consumed as chars.
2787 */
2788 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2789 if (areBlanks(ctxt, buf, nbchar)) {
2790 if (ctxt->sax->ignorableWhitespace != NULL)
2791 ctxt->sax->ignorableWhitespace(ctxt->userData,
2792 buf, nbchar);
2793 } else {
2794 htmlCheckParagraph(ctxt);
2795 if (ctxt->sax->characters != NULL)
2796 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2797 }
2798 }
2799 nbchar = 0;
2800 }
2801 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002802 chunk++;
2803 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2804 chunk = 0;
2805 SHRINK;
2806 GROW;
2807 }
Owen Taylor3473f882001-02-23 17:55:21 +00002808 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002809 if (cur == 0) {
2810 SHRINK;
2811 GROW;
2812 cur = CUR_CHAR(l);
2813 }
Owen Taylor3473f882001-02-23 17:55:21 +00002814 }
2815 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002816 buf[nbchar] = 0;
2817
Owen Taylor3473f882001-02-23 17:55:21 +00002818 /*
2819 * Ok the segment is to be consumed as chars.
2820 */
2821 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2822 if (areBlanks(ctxt, buf, nbchar)) {
2823 if (ctxt->sax->ignorableWhitespace != NULL)
2824 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2825 } else {
2826 htmlCheckParagraph(ctxt);
2827 if (ctxt->sax->characters != NULL)
2828 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2829 }
2830 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002831 } else {
2832 /*
2833 * Loop detection
2834 */
2835 if (cur == 0)
2836 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002837 }
2838}
2839
2840/**
2841 * htmlParseExternalID:
2842 * @ctxt: an HTML parser context
2843 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002844 *
2845 * Parse an External ID or a Public ID
2846 *
Owen Taylor3473f882001-02-23 17:55:21 +00002847 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2848 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2849 *
2850 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2851 *
2852 * Returns the function returns SystemLiteral and in the second
2853 * case publicID receives PubidLiteral, is strict is off
2854 * it is possible to return NULL and have publicID set.
2855 */
2856
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002857static xmlChar *
2858htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002859 xmlChar *URI = NULL;
2860
2861 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2862 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2863 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2864 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002865 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002866 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2867 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002868 }
2869 SKIP_BLANKS;
2870 URI = htmlParseSystemLiteral(ctxt);
2871 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002872 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2873 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002874 }
2875 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2876 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2877 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2878 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002879 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002880 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2881 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002882 }
2883 SKIP_BLANKS;
2884 *publicID = htmlParsePubidLiteral(ctxt);
2885 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002886 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2887 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2888 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002889 }
2890 SKIP_BLANKS;
2891 if ((CUR == '"') || (CUR == '\'')) {
2892 URI = htmlParseSystemLiteral(ctxt);
2893 }
2894 }
2895 return(URI);
2896}
2897
2898/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002899 * xmlParsePI:
2900 * @ctxt: an XML parser context
2901 *
2902 * parse an XML Processing Instruction.
2903 *
2904 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2905 */
2906static void
2907htmlParsePI(htmlParserCtxtPtr ctxt) {
2908 xmlChar *buf = NULL;
2909 int len = 0;
2910 int size = HTML_PARSER_BUFFER_SIZE;
2911 int cur, l;
2912 const xmlChar *target;
2913 xmlParserInputState state;
2914 int count = 0;
2915
2916 if ((RAW == '<') && (NXT(1) == '?')) {
2917 state = ctxt->instate;
2918 ctxt->instate = XML_PARSER_PI;
2919 /*
2920 * this is a Processing Instruction.
2921 */
2922 SKIP(2);
2923 SHRINK;
2924
2925 /*
2926 * Parse the target name and check for special support like
2927 * namespace.
2928 */
2929 target = htmlParseName(ctxt);
2930 if (target != NULL) {
2931 if (RAW == '>') {
2932 SKIP(1);
2933
2934 /*
2935 * SAX: PI detected.
2936 */
2937 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2938 (ctxt->sax->processingInstruction != NULL))
2939 ctxt->sax->processingInstruction(ctxt->userData,
2940 target, NULL);
2941 ctxt->instate = state;
2942 return;
2943 }
2944 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2945 if (buf == NULL) {
2946 htmlErrMemory(ctxt, NULL);
2947 ctxt->instate = state;
2948 return;
2949 }
2950 cur = CUR;
2951 if (!IS_BLANK(cur)) {
2952 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2953 "ParsePI: PI %s space expected\n", target, NULL);
2954 }
2955 SKIP_BLANKS;
2956 cur = CUR_CHAR(l);
2957 while (IS_CHAR(cur) && (cur != '>')) {
2958 if (len + 5 >= size) {
2959 xmlChar *tmp;
2960
2961 size *= 2;
2962 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2963 if (tmp == NULL) {
2964 htmlErrMemory(ctxt, NULL);
2965 xmlFree(buf);
2966 ctxt->instate = state;
2967 return;
2968 }
2969 buf = tmp;
2970 }
2971 count++;
2972 if (count > 50) {
2973 GROW;
2974 count = 0;
2975 }
2976 COPY_BUF(l,buf,len,cur);
2977 NEXTL(l);
2978 cur = CUR_CHAR(l);
2979 if (cur == 0) {
2980 SHRINK;
2981 GROW;
2982 cur = CUR_CHAR(l);
2983 }
2984 }
2985 buf[len] = 0;
2986 if (cur != '>') {
2987 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2988 "ParsePI: PI %s never end ...\n", target, NULL);
2989 } else {
2990 SKIP(1);
2991
2992 /*
2993 * SAX: PI detected.
2994 */
2995 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2996 (ctxt->sax->processingInstruction != NULL))
2997 ctxt->sax->processingInstruction(ctxt->userData,
2998 target, buf);
2999 }
3000 xmlFree(buf);
3001 } else {
3002 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3003 "PI is not started correctly", NULL, NULL);
3004 }
3005 ctxt->instate = state;
3006 }
3007}
3008
3009/**
Owen Taylor3473f882001-02-23 17:55:21 +00003010 * htmlParseComment:
3011 * @ctxt: an HTML parser context
3012 *
3013 * Parse an XML (SGML) comment <!-- .... -->
3014 *
3015 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3016 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003017static void
Owen Taylor3473f882001-02-23 17:55:21 +00003018htmlParseComment(htmlParserCtxtPtr ctxt) {
3019 xmlChar *buf = NULL;
3020 int len;
3021 int size = HTML_PARSER_BUFFER_SIZE;
3022 int q, ql;
3023 int r, rl;
3024 int cur, l;
3025 xmlParserInputState state;
3026
3027 /*
3028 * Check that there is a comment right here.
3029 */
3030 if ((RAW != '<') || (NXT(1) != '!') ||
3031 (NXT(2) != '-') || (NXT(3) != '-')) return;
3032
3033 state = ctxt->instate;
3034 ctxt->instate = XML_PARSER_COMMENT;
3035 SHRINK;
3036 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003037 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003038 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003039 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003040 ctxt->instate = state;
3041 return;
3042 }
3043 q = CUR_CHAR(ql);
3044 NEXTL(ql);
3045 r = CUR_CHAR(rl);
3046 NEXTL(rl);
3047 cur = CUR_CHAR(l);
3048 len = 0;
3049 while (IS_CHAR(cur) &&
3050 ((cur != '>') ||
3051 (r != '-') || (q != '-'))) {
3052 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003053 xmlChar *tmp;
3054
Owen Taylor3473f882001-02-23 17:55:21 +00003055 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003056 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3057 if (tmp == NULL) {
3058 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003059 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003060 ctxt->instate = state;
3061 return;
3062 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003063 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003064 }
3065 COPY_BUF(ql,buf,len,q);
3066 q = r;
3067 ql = rl;
3068 r = cur;
3069 rl = l;
3070 NEXTL(l);
3071 cur = CUR_CHAR(l);
3072 if (cur == 0) {
3073 SHRINK;
3074 GROW;
3075 cur = CUR_CHAR(l);
3076 }
3077 }
3078 buf[len] = 0;
3079 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003080 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3081 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003082 xmlFree(buf);
3083 } else {
3084 NEXT;
3085 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3086 (!ctxt->disableSAX))
3087 ctxt->sax->comment(ctxt->userData, buf);
3088 xmlFree(buf);
3089 }
3090 ctxt->instate = state;
3091}
3092
3093/**
3094 * htmlParseCharRef:
3095 * @ctxt: an HTML parser context
3096 *
3097 * parse Reference declarations
3098 *
3099 * [66] CharRef ::= '&#' [0-9]+ ';' |
3100 * '&#x' [0-9a-fA-F]+ ';'
3101 *
3102 * Returns the value parsed (as an int)
3103 */
3104int
3105htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3106 int val = 0;
3107
Daniel Veillarda03e3652004-11-02 18:45:30 +00003108 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3109 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3110 "htmlParseCharRef: context error\n",
3111 NULL, NULL);
3112 return(0);
3113 }
Owen Taylor3473f882001-02-23 17:55:21 +00003114 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003115 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003116 SKIP(3);
3117 while (CUR != ';') {
3118 if ((CUR >= '0') && (CUR <= '9'))
3119 val = val * 16 + (CUR - '0');
3120 else if ((CUR >= 'a') && (CUR <= 'f'))
3121 val = val * 16 + (CUR - 'a') + 10;
3122 else if ((CUR >= 'A') && (CUR <= 'F'))
3123 val = val * 16 + (CUR - 'A') + 10;
3124 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003125 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003126 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003127 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003128 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003129 }
3130 NEXT;
3131 }
3132 if (CUR == ';')
3133 NEXT;
3134 } else if ((CUR == '&') && (NXT(1) == '#')) {
3135 SKIP(2);
3136 while (CUR != ';') {
3137 if ((CUR >= '0') && (CUR <= '9'))
3138 val = val * 10 + (CUR - '0');
3139 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003140 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003141 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003142 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003143 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003144 }
3145 NEXT;
3146 }
3147 if (CUR == ';')
3148 NEXT;
3149 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003150 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3151 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003152 }
3153 /*
3154 * Check the value IS_CHAR ...
3155 */
3156 if (IS_CHAR(val)) {
3157 return(val);
3158 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003159 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3160 "htmlParseCharRef: invalid xmlChar value %d\n",
3161 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003162 }
3163 return(0);
3164}
3165
3166
3167/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003168 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003169 * @ctxt: an HTML parser context
3170 *
3171 * parse a DOCTYPE declaration
3172 *
3173 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3174 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3175 */
3176
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003177static void
Owen Taylor3473f882001-02-23 17:55:21 +00003178htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003179 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003180 xmlChar *ExternalID = NULL;
3181 xmlChar *URI = NULL;
3182
3183 /*
3184 * We know that '<!DOCTYPE' has been detected.
3185 */
3186 SKIP(9);
3187
3188 SKIP_BLANKS;
3189
3190 /*
3191 * Parse the DOCTYPE name.
3192 */
3193 name = htmlParseName(ctxt);
3194 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003195 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3196 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3197 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003198 }
3199 /*
3200 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3201 */
3202
3203 SKIP_BLANKS;
3204
3205 /*
3206 * Check for SystemID and ExternalID
3207 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003208 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003209 SKIP_BLANKS;
3210
3211 /*
3212 * We should be at the end of the DOCTYPE declaration.
3213 */
3214 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003215 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3216 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003217 /* We shouldn't try to resynchronize ... */
3218 }
3219 NEXT;
3220
3221 /*
3222 * Create or update the document accordingly to the DOCTYPE
3223 */
3224 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3225 (!ctxt->disableSAX))
3226 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3227
3228 /*
3229 * Cleanup, since we don't use all those identifiers
3230 */
3231 if (URI != NULL) xmlFree(URI);
3232 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003233}
3234
3235/**
3236 * htmlParseAttribute:
3237 * @ctxt: an HTML parser context
3238 * @value: a xmlChar ** used to store the value of the attribute
3239 *
3240 * parse an attribute
3241 *
3242 * [41] Attribute ::= Name Eq AttValue
3243 *
3244 * [25] Eq ::= S? '=' S?
3245 *
3246 * With namespace:
3247 *
3248 * [NS 11] Attribute ::= QName Eq AttValue
3249 *
3250 * Also the case QName == xmlns:??? is handled independently as a namespace
3251 * definition.
3252 *
3253 * Returns the attribute name, and the value in *value.
3254 */
3255
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003256static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003257htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003258 const xmlChar *name;
3259 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003260
3261 *value = NULL;
3262 name = htmlParseHTMLName(ctxt);
3263 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003264 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3265 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003266 return(NULL);
3267 }
3268
3269 /*
3270 * read the value
3271 */
3272 SKIP_BLANKS;
3273 if (CUR == '=') {
3274 NEXT;
3275 SKIP_BLANKS;
3276 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003277 } else if (htmlIsBooleanAttr(name)) {
3278 /*
3279 * assume a minimized attribute
3280 */
3281 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003282 }
3283
3284 *value = val;
3285 return(name);
3286}
3287
3288/**
3289 * htmlCheckEncoding:
3290 * @ctxt: an HTML parser context
3291 * @attvalue: the attribute value
3292 *
3293 * Checks an http-equiv attribute from a Meta tag to detect
3294 * the encoding
3295 * If a new encoding is detected the parser is switched to decode
3296 * it and pass UTF8
3297 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003298static void
Owen Taylor3473f882001-02-23 17:55:21 +00003299htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3300 const xmlChar *encoding;
3301
3302 if ((ctxt == NULL) || (attvalue == NULL))
3303 return;
3304
3305 /* do not change encoding */
3306 if (ctxt->input->encoding != NULL)
3307 return;
3308
3309 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3310 if (encoding != NULL) {
3311 encoding += 8;
3312 } else {
3313 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3314 if (encoding != NULL)
3315 encoding += 9;
3316 }
3317 if (encoding != NULL) {
3318 xmlCharEncoding enc;
3319 xmlCharEncodingHandlerPtr handler;
3320
3321 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3322
3323 if (ctxt->input->encoding != NULL)
3324 xmlFree((xmlChar *) ctxt->input->encoding);
3325 ctxt->input->encoding = xmlStrdup(encoding);
3326
3327 enc = xmlParseCharEncoding((const char *) encoding);
3328 /*
3329 * registered set of known encodings
3330 */
3331 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003332 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3333 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3334 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3335 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3336 (ctxt->input->buf != NULL) &&
3337 (ctxt->input->buf->encoder == NULL)) {
3338 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3339 "htmlCheckEncoding: wrong encoding meta\n",
3340 NULL, NULL);
3341 } else {
3342 xmlSwitchEncoding(ctxt, enc);
3343 }
Owen Taylor3473f882001-02-23 17:55:21 +00003344 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3345 } else {
3346 /*
3347 * fallback for unknown encodings
3348 */
3349 handler = xmlFindCharEncodingHandler((const char *) encoding);
3350 if (handler != NULL) {
3351 xmlSwitchToEncoding(ctxt, handler);
3352 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3353 } else {
3354 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3355 }
3356 }
3357
3358 if ((ctxt->input->buf != NULL) &&
3359 (ctxt->input->buf->encoder != NULL) &&
3360 (ctxt->input->buf->raw != NULL) &&
3361 (ctxt->input->buf->buffer != NULL)) {
3362 int nbchars;
3363 int processed;
3364
3365 /*
3366 * convert as much as possible to the parser reading buffer.
3367 */
3368 processed = ctxt->input->cur - ctxt->input->base;
3369 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3370 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3371 ctxt->input->buf->buffer,
3372 ctxt->input->buf->raw);
3373 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003374 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3375 "htmlCheckEncoding: encoder error\n",
3376 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003377 }
3378 ctxt->input->base =
3379 ctxt->input->cur = ctxt->input->buf->buffer->content;
3380 }
3381 }
3382}
3383
3384/**
3385 * htmlCheckMeta:
3386 * @ctxt: an HTML parser context
3387 * @atts: the attributes values
3388 *
3389 * Checks an attributes from a Meta tag
3390 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003391static void
Owen Taylor3473f882001-02-23 17:55:21 +00003392htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3393 int i;
3394 const xmlChar *att, *value;
3395 int http = 0;
3396 const xmlChar *content = NULL;
3397
3398 if ((ctxt == NULL) || (atts == NULL))
3399 return;
3400
3401 i = 0;
3402 att = atts[i++];
3403 while (att != NULL) {
3404 value = atts[i++];
3405 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3406 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3407 http = 1;
3408 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3409 content = value;
3410 att = atts[i++];
3411 }
3412 if ((http) && (content != NULL))
3413 htmlCheckEncoding(ctxt, content);
3414
3415}
3416
3417/**
3418 * htmlParseStartTag:
3419 * @ctxt: an HTML parser context
3420 *
3421 * parse a start of tag either for rule element or
3422 * EmptyElement. In both case we don't parse the tag closing chars.
3423 *
3424 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3425 *
3426 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3427 *
3428 * With namespace:
3429 *
3430 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3431 *
3432 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3433 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003434 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003435 */
3436
Daniel Veillard597f1c12005-07-03 23:00:18 +00003437static int
Owen Taylor3473f882001-02-23 17:55:21 +00003438htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003439 const xmlChar *name;
3440 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003441 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003442 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003443 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003444 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003445 int meta = 0;
3446 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003447 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003448
Daniel Veillarda03e3652004-11-02 18:45:30 +00003449 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3450 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3451 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003452 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003453 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003454 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003455 NEXT;
3456
Daniel Veillard30e76072006-03-09 14:13:55 +00003457 atts = ctxt->atts;
3458 maxatts = ctxt->maxatts;
3459
Owen Taylor3473f882001-02-23 17:55:21 +00003460 GROW;
3461 name = htmlParseHTMLName(ctxt);
3462 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003463 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3464 "htmlParseStartTag: invalid element name\n",
3465 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003466 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003467 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003468 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003469 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003470 }
3471 if (xmlStrEqual(name, BAD_CAST"meta"))
3472 meta = 1;
3473
3474 /*
3475 * Check for auto-closure of HTML elements.
3476 */
3477 htmlAutoClose(ctxt, name);
3478
3479 /*
3480 * Check for implied HTML elements.
3481 */
3482 htmlCheckImplied(ctxt, name);
3483
3484 /*
3485 * Avoid html at any level > 0, head at any level != 1
3486 * or any attempt to recurse body
3487 */
3488 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003489 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3490 "htmlParseStartTag: misplaced <html> tag\n",
3491 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003492 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003493 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003494 }
3495 if ((ctxt->nameNr != 1) &&
3496 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003497 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3498 "htmlParseStartTag: misplaced <head> tag\n",
3499 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003500 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003501 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003502 }
3503 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003504 int indx;
3505 for (indx = 0;indx < ctxt->nameNr;indx++) {
3506 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003507 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3508 "htmlParseStartTag: misplaced <body> tag\n",
3509 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003510 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003511 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003512 }
3513 }
3514 }
3515
3516 /*
3517 * Now parse the attributes, it ends up with the ending
3518 *
3519 * (S Attribute)* S?
3520 */
3521 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003522 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003523 (CUR != '>') &&
3524 ((CUR != '/') || (NXT(1) != '>'))) {
3525 long cons = ctxt->nbChars;
3526
3527 GROW;
3528 attname = htmlParseAttribute(ctxt, &attvalue);
3529 if (attname != NULL) {
3530
3531 /*
3532 * Well formedness requires at most one declaration of an attribute
3533 */
3534 for (i = 0; i < nbatts;i += 2) {
3535 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003536 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3537 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003538 if (attvalue != NULL)
3539 xmlFree(attvalue);
3540 goto failed;
3541 }
3542 }
3543
3544 /*
3545 * Add the pair to atts
3546 */
3547 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003548 maxatts = 22; /* allow for 10 attrs by default */
3549 atts = (const xmlChar **)
3550 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003551 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003552 htmlErrMemory(ctxt, NULL);
3553 if (attvalue != NULL)
3554 xmlFree(attvalue);
3555 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003556 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003557 ctxt->atts = atts;
3558 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003559 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003560 const xmlChar **n;
3561
Owen Taylor3473f882001-02-23 17:55:21 +00003562 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003563 n = (const xmlChar **) xmlRealloc((void *) atts,
3564 maxatts * sizeof(const xmlChar *));
3565 if (n == NULL) {
3566 htmlErrMemory(ctxt, NULL);
3567 if (attvalue != NULL)
3568 xmlFree(attvalue);
3569 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003570 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003571 atts = n;
3572 ctxt->atts = atts;
3573 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003574 }
3575 atts[nbatts++] = attname;
3576 atts[nbatts++] = attvalue;
3577 atts[nbatts] = NULL;
3578 atts[nbatts + 1] = NULL;
3579 }
3580 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003581 if (attvalue != NULL)
3582 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003583 /* Dump the bogus attribute string up to the next blank or
3584 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003585 while ((IS_CHAR_CH(CUR)) &&
3586 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003587 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003588 NEXT;
3589 }
3590
3591failed:
3592 SKIP_BLANKS;
3593 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003594 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3595 "htmlParseStartTag: problem parsing attributes\n",
3596 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003597 break;
3598 }
3599 }
3600
3601 /*
3602 * Handle specific association to the META tag
3603 */
William M. Bracke978ae22007-03-21 06:16:02 +00003604 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003605 htmlCheckMeta(ctxt, atts);
3606
3607 /*
3608 * SAX: Start of Element !
3609 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003610 if (!discardtag) {
3611 htmlnamePush(ctxt, name);
3612 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3613 if (nbatts != 0)
3614 ctxt->sax->startElement(ctxt->userData, name, atts);
3615 else
3616 ctxt->sax->startElement(ctxt->userData, name, NULL);
3617 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003618 }
Owen Taylor3473f882001-02-23 17:55:21 +00003619
3620 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003621 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003622 if (atts[i] != NULL)
3623 xmlFree((xmlChar *) atts[i]);
3624 }
Owen Taylor3473f882001-02-23 17:55:21 +00003625 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003626
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003627 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003628}
3629
3630/**
3631 * htmlParseEndTag:
3632 * @ctxt: an HTML parser context
3633 *
3634 * parse an end of tag
3635 *
3636 * [42] ETag ::= '</' Name S? '>'
3637 *
3638 * With namespace
3639 *
3640 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003641 *
3642 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003643 */
3644
Daniel Veillardf420ac52001-07-04 16:04:09 +00003645static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003646htmlParseEndTag(htmlParserCtxtPtr ctxt)
3647{
3648 const xmlChar *name;
3649 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003650 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003651
3652 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003653 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3654 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003655 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003656 }
3657 SKIP(2);
3658
3659 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003660 if (name == NULL)
3661 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003662 /*
3663 * We should definitely be at the ending "S? '>'" part
3664 */
3665 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003666 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003667 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3668 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003669 if (ctxt->recovery) {
3670 /*
3671 * We're not at the ending > !!
3672 * Error, unless in recover mode where we search forwards
3673 * until we find a >
3674 */
3675 while (CUR != '\0' && CUR != '>') NEXT;
3676 NEXT;
3677 }
Owen Taylor3473f882001-02-23 17:55:21 +00003678 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003679 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003680
3681 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003682 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3683 * out now.
3684 */
3685 if ((ctxt->depth > 0) &&
3686 (xmlStrEqual(name, BAD_CAST "html") ||
3687 xmlStrEqual(name, BAD_CAST "body") ||
3688 xmlStrEqual(name, BAD_CAST "head"))) {
3689 ctxt->depth--;
3690 return (0);
3691 }
3692
3693 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003694 * If the name read is not one of the element in the parsing stack
3695 * then return, it's just an error.
3696 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003697 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3698 if (xmlStrEqual(name, ctxt->nameTab[i]))
3699 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003700 }
3701 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003702 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3703 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003704 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003705 }
3706
3707
3708 /*
3709 * Check for auto-closure of HTML elements.
3710 */
3711
3712 htmlAutoCloseOnClose(ctxt, name);
3713
3714 /*
3715 * Well formedness constraints, opening and closing must match.
3716 * With the exception that the autoclose may have popped stuff out
3717 * of the stack.
3718 */
3719 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003720 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003721 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3722 "Opening and ending tag mismatch: %s and %s\n",
3723 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003724 }
3725 }
3726
3727 /*
3728 * SAX: End of Tag
3729 */
3730 oldname = ctxt->name;
3731 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003732 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3733 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003734 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003735 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003736 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003737 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003738 }
3739
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003740 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003741}
3742
3743
3744/**
3745 * htmlParseReference:
3746 * @ctxt: an HTML parser context
3747 *
3748 * parse and handle entity references in content,
3749 * this will end-up in a call to character() since this is either a
3750 * CharRef, or a predefined entity.
3751 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003752static void
Owen Taylor3473f882001-02-23 17:55:21 +00003753htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003754 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003755 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003756 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003757 if (CUR != '&') return;
3758
3759 if (NXT(1) == '#') {
3760 unsigned int c;
3761 int bits, i = 0;
3762
3763 c = htmlParseCharRef(ctxt);
3764 if (c == 0)
3765 return;
3766
3767 if (c < 0x80) { out[i++]= c; bits= -6; }
3768 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3769 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3770 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3771
3772 for ( ; bits >= 0; bits-= 6) {
3773 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3774 }
3775 out[i] = 0;
3776
3777 htmlCheckParagraph(ctxt);
3778 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3779 ctxt->sax->characters(ctxt->userData, out, i);
3780 } else {
3781 ent = htmlParseEntityRef(ctxt, &name);
3782 if (name == NULL) {
3783 htmlCheckParagraph(ctxt);
3784 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3785 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3786 return;
3787 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003788 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003789 htmlCheckParagraph(ctxt);
3790 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3791 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3792 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3793 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3794 }
3795 } else {
3796 unsigned int c;
3797 int bits, i = 0;
3798
3799 c = ent->value;
3800 if (c < 0x80)
3801 { out[i++]= c; bits= -6; }
3802 else if (c < 0x800)
3803 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3804 else if (c < 0x10000)
3805 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3806 else
3807 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3808
3809 for ( ; bits >= 0; bits-= 6) {
3810 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3811 }
3812 out[i] = 0;
3813
3814 htmlCheckParagraph(ctxt);
3815 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3816 ctxt->sax->characters(ctxt->userData, out, i);
3817 }
Owen Taylor3473f882001-02-23 17:55:21 +00003818 }
3819}
3820
3821/**
3822 * htmlParseContent:
3823 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003824 *
3825 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003826 */
3827
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003828static void
Owen Taylor3473f882001-02-23 17:55:21 +00003829htmlParseContent(htmlParserCtxtPtr ctxt) {
3830 xmlChar *currentNode;
3831 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003832 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003833
3834 currentNode = xmlStrdup(ctxt->name);
3835 depth = ctxt->nameNr;
3836 while (1) {
3837 long cons = ctxt->nbChars;
3838
3839 GROW;
3840 /*
3841 * Our tag or one of it's parent or children is ending.
3842 */
3843 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003844 if (htmlParseEndTag(ctxt) &&
3845 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3846 if (currentNode != NULL)
3847 xmlFree(currentNode);
3848 return;
3849 }
3850 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003851 }
3852
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003853 else if ((CUR == '<') &&
3854 ((IS_ASCII_LETTER(NXT(1))) ||
3855 (NXT(1) == '_') || (NXT(1) == ':'))) {
3856 name = htmlParseHTMLName_nonInvasive(ctxt);
3857 if (name == NULL) {
3858 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3859 "htmlParseStartTag: invalid element name\n",
3860 NULL, NULL);
3861 /* Dump the bogus tag like browsers do */
3862 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3863 NEXT;
3864
3865 if (currentNode != NULL)
3866 xmlFree(currentNode);
3867 return;
3868 }
3869
3870 if (ctxt->name != NULL) {
3871 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3872 htmlAutoClose(ctxt, name);
3873 continue;
3874 }
3875 }
3876 }
3877
Owen Taylor3473f882001-02-23 17:55:21 +00003878 /*
3879 * Has this node been popped out during parsing of
3880 * the next element
3881 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003882 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3883 (!xmlStrEqual(currentNode, ctxt->name)))
3884 {
Owen Taylor3473f882001-02-23 17:55:21 +00003885 if (currentNode != NULL) xmlFree(currentNode);
3886 return;
3887 }
3888
Daniel Veillardf9533d12001-03-03 10:04:57 +00003889 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3890 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003891 /*
3892 * Handle SCRIPT/STYLE separately
3893 */
3894 htmlParseScript(ctxt);
3895 } else {
3896 /*
3897 * Sometimes DOCTYPE arrives in the middle of the document
3898 */
3899 if ((CUR == '<') && (NXT(1) == '!') &&
3900 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3901 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3902 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3903 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003904 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3905 "Misplaced DOCTYPE declaration\n",
3906 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003907 htmlParseDocTypeDecl(ctxt);
3908 }
3909
3910 /*
3911 * First case : a comment
3912 */
3913 if ((CUR == '<') && (NXT(1) == '!') &&
3914 (NXT(2) == '-') && (NXT(3) == '-')) {
3915 htmlParseComment(ctxt);
3916 }
3917
3918 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003919 * Second case : a Processing Instruction.
3920 */
3921 else if ((CUR == '<') && (NXT(1) == '?')) {
3922 htmlParsePI(ctxt);
3923 }
3924
3925 /*
3926 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003927 */
3928 else if (CUR == '<') {
3929 htmlParseElement(ctxt);
3930 }
3931
3932 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003933 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003934 * parsing returns it's Name, create the node
3935 */
3936 else if (CUR == '&') {
3937 htmlParseReference(ctxt);
3938 }
3939
3940 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003941 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003942 */
3943 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003944 htmlAutoCloseOnEnd(ctxt);
3945 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003946 }
3947
3948 /*
3949 * Last case, text. Note that References are handled directly.
3950 */
3951 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003952 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003953 }
3954
3955 if (cons == ctxt->nbChars) {
3956 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003957 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3958 "detected an error in element content\n",
3959 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003960 }
3961 break;
3962 }
3963 }
3964 GROW;
3965 }
3966 if (currentNode != NULL) xmlFree(currentNode);
3967}
3968
3969/**
Daniel Veillard499cc922006-01-18 17:22:35 +00003970 * htmlParseContent:
3971 * @ctxt: an HTML parser context
3972 *
3973 * Parse a content: comment, sub-element, reference or text.
3974 */
3975
3976void
3977__htmlParseContent(void *ctxt) {
3978 if (ctxt != NULL)
3979 htmlParseContent((htmlParserCtxtPtr) ctxt);
3980}
3981
3982/**
Owen Taylor3473f882001-02-23 17:55:21 +00003983 * htmlParseElement:
3984 * @ctxt: an HTML parser context
3985 *
3986 * parse an HTML element, this is highly recursive
3987 *
3988 * [39] element ::= EmptyElemTag | STag content ETag
3989 *
3990 * [41] Attribute ::= Name Eq AttValue
3991 */
3992
3993void
3994htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003995 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003996 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003997 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003998 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003999 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004000 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004001 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004002
Daniel Veillarda03e3652004-11-02 18:45:30 +00004003 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4004 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004005 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004006 return;
4007 }
Owen Taylor3473f882001-02-23 17:55:21 +00004008 /* Capture start position */
4009 if (ctxt->record_info) {
4010 node_info.begin_pos = ctxt->input->consumed +
4011 (CUR_PTR - ctxt->input->base);
4012 node_info.begin_line = ctxt->input->line;
4013 }
4014
Daniel Veillard597f1c12005-07-03 23:00:18 +00004015 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004016 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004017 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004018 if (CUR == '>')
4019 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004020 return;
4021 }
Owen Taylor3473f882001-02-23 17:55:21 +00004022
4023 /*
4024 * Lookup the info for that element.
4025 */
4026 info = htmlTagLookup(name);
4027 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004028 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4029 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004030 }
4031
4032 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004033 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004034 */
4035 if ((CUR == '/') && (NXT(1) == '>')) {
4036 SKIP(2);
4037 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4038 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004039 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004040 return;
4041 }
4042
4043 if (CUR == '>') {
4044 NEXT;
4045 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004046 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4047 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004048
4049 /*
4050 * end of parsing of this node.
4051 */
4052 if (xmlStrEqual(name, ctxt->name)) {
4053 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004054 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004055 }
4056
4057 /*
4058 * Capture end position and add node
4059 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004060 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004061 node_info.end_pos = ctxt->input->consumed +
4062 (CUR_PTR - ctxt->input->base);
4063 node_info.end_line = ctxt->input->line;
4064 node_info.node = ctxt->node;
4065 xmlParserAddNodeInfo(ctxt, &node_info);
4066 }
4067 return;
4068 }
4069
4070 /*
4071 * Check for an Empty Element from DTD definition
4072 */
4073 if ((info != NULL) && (info->empty)) {
4074 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4075 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004076 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004077 return;
4078 }
4079
4080 /*
4081 * Parse the content of the element:
4082 */
4083 currentNode = xmlStrdup(ctxt->name);
4084 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004085 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004086 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004087 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004088 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004089 if (ctxt->nameNr < depth) break;
4090 }
4091
Owen Taylor3473f882001-02-23 17:55:21 +00004092 /*
4093 * Capture end position and add node
4094 */
4095 if ( currentNode != NULL && ctxt->record_info ) {
4096 node_info.end_pos = ctxt->input->consumed +
4097 (CUR_PTR - ctxt->input->base);
4098 node_info.end_line = ctxt->input->line;
4099 node_info.node = ctxt->node;
4100 xmlParserAddNodeInfo(ctxt, &node_info);
4101 }
William M. Brack76e95df2003-10-18 16:20:14 +00004102 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004103 htmlAutoCloseOnEnd(ctxt);
4104 }
4105
Owen Taylor3473f882001-02-23 17:55:21 +00004106 if (currentNode != NULL)
4107 xmlFree(currentNode);
4108}
4109
4110/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004111 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004112 * @ctxt: an HTML parser context
4113 *
4114 * parse an HTML document (and build a tree if using the standard SAX
4115 * interface).
4116 *
4117 * Returns 0, -1 in case of error. the parser context is augmented
4118 * as a result of the parsing.
4119 */
4120
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004121int
Owen Taylor3473f882001-02-23 17:55:21 +00004122htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004123 xmlChar start[4];
4124 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004125 xmlDtdPtr dtd;
4126
Daniel Veillardd0463562001-10-13 09:15:48 +00004127 xmlInitParser();
4128
Owen Taylor3473f882001-02-23 17:55:21 +00004129 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004130
Daniel Veillarda03e3652004-11-02 18:45:30 +00004131 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4132 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4133 "htmlParseDocument: context error\n", NULL, NULL);
4134 return(XML_ERR_INTERNAL_ERROR);
4135 }
4136 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004137 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004138 GROW;
4139 /*
4140 * SAX: beginning of the document processing.
4141 */
4142 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4143 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4144
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004145 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4146 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4147 /*
4148 * Get the 4 first bytes and decode the charset
4149 * if enc != XML_CHAR_ENCODING_NONE
4150 * plug some encoding conversion routines.
4151 */
4152 start[0] = RAW;
4153 start[1] = NXT(1);
4154 start[2] = NXT(2);
4155 start[3] = NXT(3);
4156 enc = xmlDetectCharEncoding(&start[0], 4);
4157 if (enc != XML_CHAR_ENCODING_NONE) {
4158 xmlSwitchEncoding(ctxt, enc);
4159 }
4160 }
4161
Owen Taylor3473f882001-02-23 17:55:21 +00004162 /*
4163 * Wipe out everything which is before the first '<'
4164 */
4165 SKIP_BLANKS;
4166 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004167 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4168 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004169 }
4170
4171 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4172 ctxt->sax->startDocument(ctxt->userData);
4173
4174
4175 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004176 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004177 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004178 while (((CUR == '<') && (NXT(1) == '!') &&
4179 (NXT(2) == '-') && (NXT(3) == '-')) ||
4180 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004181 htmlParseComment(ctxt);
4182 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004183 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004184 }
Owen Taylor3473f882001-02-23 17:55:21 +00004185
4186
4187 /*
4188 * Then possibly doc type declaration(s) and more Misc
4189 * (doctypedecl Misc*)?
4190 */
4191 if ((CUR == '<') && (NXT(1) == '!') &&
4192 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4193 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4194 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4195 (UPP(8) == 'E')) {
4196 htmlParseDocTypeDecl(ctxt);
4197 }
4198 SKIP_BLANKS;
4199
4200 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004201 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004202 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004203 while (((CUR == '<') && (NXT(1) == '!') &&
4204 (NXT(2) == '-') && (NXT(3) == '-')) ||
4205 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004206 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004207 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004208 SKIP_BLANKS;
4209 }
4210
4211 /*
4212 * Time to start parsing the tree itself
4213 */
4214 htmlParseContent(ctxt);
4215
4216 /*
4217 * autoclose
4218 */
4219 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004220 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004221
4222
4223 /*
4224 * SAX: end of the document processing.
4225 */
4226 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4227 ctxt->sax->endDocument(ctxt->userData);
4228
4229 if (ctxt->myDoc != NULL) {
4230 dtd = xmlGetIntSubset(ctxt->myDoc);
4231 if (dtd == NULL)
4232 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004233 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004234 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4235 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4236 }
4237 if (! ctxt->wellFormed) return(-1);
4238 return(0);
4239}
4240
4241
4242/************************************************************************
4243 * *
4244 * Parser contexts handling *
4245 * *
4246 ************************************************************************/
4247
4248/**
William M. Brackedb65a72004-02-06 07:36:04 +00004249 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004250 * @ctxt: an HTML parser context
4251 *
4252 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004253 *
4254 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004255 */
4256
Daniel Veillardf403d292003-10-05 13:51:35 +00004257static int
Owen Taylor3473f882001-02-23 17:55:21 +00004258htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4259{
4260 htmlSAXHandler *sax;
4261
Daniel Veillardf403d292003-10-05 13:51:35 +00004262 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004263 memset(ctxt, 0, sizeof(htmlParserCtxt));
4264
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004265 ctxt->dict = xmlDictCreate();
4266 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004267 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4268 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004269 }
Owen Taylor3473f882001-02-23 17:55:21 +00004270 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4271 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004272 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4273 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004274 }
4275 else
4276 memset(sax, 0, sizeof(htmlSAXHandler));
4277
4278 /* Allocate the Input stack */
4279 ctxt->inputTab = (htmlParserInputPtr *)
4280 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4281 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004282 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004283 ctxt->inputNr = 0;
4284 ctxt->inputMax = 0;
4285 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004286 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004287 }
4288 ctxt->inputNr = 0;
4289 ctxt->inputMax = 5;
4290 ctxt->input = NULL;
4291 ctxt->version = NULL;
4292 ctxt->encoding = NULL;
4293 ctxt->standalone = -1;
4294 ctxt->instate = XML_PARSER_START;
4295
4296 /* Allocate the Node stack */
4297 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4298 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004299 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004300 ctxt->nodeNr = 0;
4301 ctxt->nodeMax = 0;
4302 ctxt->node = NULL;
4303 ctxt->inputNr = 0;
4304 ctxt->inputMax = 0;
4305 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004306 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004307 }
4308 ctxt->nodeNr = 0;
4309 ctxt->nodeMax = 10;
4310 ctxt->node = NULL;
4311
4312 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004313 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004314 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004315 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004316 ctxt->nameNr = 0;
4317 ctxt->nameMax = 10;
4318 ctxt->name = NULL;
4319 ctxt->nodeNr = 0;
4320 ctxt->nodeMax = 0;
4321 ctxt->node = NULL;
4322 ctxt->inputNr = 0;
4323 ctxt->inputMax = 0;
4324 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004325 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004326 }
4327 ctxt->nameNr = 0;
4328 ctxt->nameMax = 10;
4329 ctxt->name = NULL;
4330
Daniel Veillard092643b2003-09-25 14:29:29 +00004331 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004332 else {
4333 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004334 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004335 }
4336 ctxt->userData = ctxt;
4337 ctxt->myDoc = NULL;
4338 ctxt->wellFormed = 1;
4339 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004340 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004341 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004342 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004343 ctxt->vctxt.userData = ctxt;
4344 ctxt->vctxt.error = xmlParserValidityError;
4345 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004346 ctxt->record_info = 0;
4347 ctxt->validate = 0;
4348 ctxt->nbChars = 0;
4349 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004350 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004351 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004352 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004353}
4354
4355/**
4356 * htmlFreeParserCtxt:
4357 * @ctxt: an HTML parser context
4358 *
4359 * Free all the memory used by a parser context. However the parsed
4360 * document in ctxt->myDoc is not freed.
4361 */
4362
4363void
4364htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4365{
4366 xmlFreeParserCtxt(ctxt);
4367}
4368
4369/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004370 * htmlNewParserCtxt:
4371 *
4372 * Allocate and initialize a new parser context.
4373 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004374 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004375 */
4376
Daniel Veillard34c647c2006-09-21 06:53:59 +00004377htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004378htmlNewParserCtxt(void)
4379{
4380 xmlParserCtxtPtr ctxt;
4381
4382 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4383 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004384 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004385 return(NULL);
4386 }
4387 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004388 if (htmlInitParserCtxt(ctxt) < 0) {
4389 htmlFreeParserCtxt(ctxt);
4390 return(NULL);
4391 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004392 return(ctxt);
4393}
4394
4395/**
4396 * htmlCreateMemoryParserCtxt:
4397 * @buffer: a pointer to a char array
4398 * @size: the size of the array
4399 *
4400 * Create a parser context for an HTML in-memory document.
4401 *
4402 * Returns the new parser context or NULL
4403 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004404htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004405htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4406 xmlParserCtxtPtr ctxt;
4407 xmlParserInputPtr input;
4408 xmlParserInputBufferPtr buf;
4409
4410 if (buffer == NULL)
4411 return(NULL);
4412 if (size <= 0)
4413 return(NULL);
4414
4415 ctxt = htmlNewParserCtxt();
4416 if (ctxt == NULL)
4417 return(NULL);
4418
4419 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4420 if (buf == NULL) return(NULL);
4421
4422 input = xmlNewInputStream(ctxt);
4423 if (input == NULL) {
4424 xmlFreeParserCtxt(ctxt);
4425 return(NULL);
4426 }
4427
4428 input->filename = NULL;
4429 input->buf = buf;
4430 input->base = input->buf->buffer->content;
4431 input->cur = input->buf->buffer->content;
4432 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4433
4434 inputPush(ctxt, input);
4435 return(ctxt);
4436}
4437
4438/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004439 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004440 * @cur: a pointer to an array of xmlChar
4441 * @encoding: a free form C string describing the HTML document encoding, or NULL
4442 *
4443 * Create a parser context for an HTML document.
4444 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004445 * TODO: check the need to add encoding handling there
4446 *
Owen Taylor3473f882001-02-23 17:55:21 +00004447 * Returns the new parser context or NULL
4448 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004449static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004450htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004451 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004452 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004453
Daniel Veillard1d995272002-07-22 16:43:32 +00004454 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004455 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004456 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004457 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004458 if (ctxt == NULL)
4459 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004460
4461 if (encoding != NULL) {
4462 xmlCharEncoding enc;
4463 xmlCharEncodingHandlerPtr handler;
4464
4465 if (ctxt->input->encoding != NULL)
4466 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004467 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004468
4469 enc = xmlParseCharEncoding(encoding);
4470 /*
4471 * registered set of known encodings
4472 */
4473 if (enc != XML_CHAR_ENCODING_ERROR) {
4474 xmlSwitchEncoding(ctxt, enc);
4475 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004476 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4477 "Unsupported encoding %s\n",
4478 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004479 }
4480 } else {
4481 /*
4482 * fallback for unknown encodings
4483 */
4484 handler = xmlFindCharEncodingHandler((const char *) encoding);
4485 if (handler != NULL) {
4486 xmlSwitchToEncoding(ctxt, handler);
4487 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004488 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4489 "Unsupported encoding %s\n",
4490 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004491 }
4492 }
4493 }
4494 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004495}
4496
Daniel Veillard73b013f2003-09-30 12:36:01 +00004497#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004498/************************************************************************
4499 * *
4500 * Progressive parsing interfaces *
4501 * *
4502 ************************************************************************/
4503
4504/**
4505 * htmlParseLookupSequence:
4506 * @ctxt: an HTML parser context
4507 * @first: the first char to lookup
4508 * @next: the next char to lookup or zero
4509 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004510 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004511 *
4512 * Try to find if a sequence (first, next, third) or just (first next) or
4513 * (first) is available in the input stream.
4514 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4515 * to avoid rescanning sequences of bytes, it DOES change the state of the
4516 * parser, do not use liberally.
4517 * This is basically similar to xmlParseLookupSequence()
4518 *
4519 * Returns the index to the current parsing point if the full sequence
4520 * is available, -1 otherwise.
4521 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004522static int
Owen Taylor3473f882001-02-23 17:55:21 +00004523htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004524 xmlChar next, xmlChar third, int iscomment,
4525 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004526 int base, len;
4527 htmlParserInputPtr in;
4528 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004529 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004530 int invalue = 0;
4531 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004532
4533 in = ctxt->input;
4534 if (in == NULL) return(-1);
4535 base = in->cur - in->base;
4536 if (base < 0) return(-1);
4537 if (ctxt->checkIndex > base)
4538 base = ctxt->checkIndex;
4539 if (in->buf == NULL) {
4540 buf = in->base;
4541 len = in->length;
4542 } else {
4543 buf = in->buf->buffer->content;
4544 len = in->buf->buffer->use;
4545 }
4546 /* take into account the sequence length */
4547 if (third) len -= 2;
4548 else if (next) len --;
4549 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004550 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004551 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4552 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4553 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004554 /* do not increment past <! - some people use <!--> */
4555 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004556 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004557 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004558 if (ignoreattrval) {
4559 if (buf[base] == '"' || buf[base] == '\'') {
4560 if (invalue) {
4561 if (buf[base] == valdellim) {
4562 invalue = 0;
4563 continue;
4564 }
4565 } else {
4566 valdellim = buf[base];
4567 invalue = 1;
4568 continue;
4569 }
4570 } else if (invalue) {
4571 continue;
4572 }
4573 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004574 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004575 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004576 return(-1);
4577 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4578 (buf[base + 2] == '>')) {
4579 incomment = 0;
4580 base += 2;
4581 }
4582 continue;
4583 }
Owen Taylor3473f882001-02-23 17:55:21 +00004584 if (buf[base] == first) {
4585 if (third != 0) {
4586 if ((buf[base + 1] != next) ||
4587 (buf[base + 2] != third)) continue;
4588 } else if (next != 0) {
4589 if (buf[base + 1] != next) continue;
4590 }
4591 ctxt->checkIndex = 0;
4592#ifdef DEBUG_PUSH
4593 if (next == 0)
4594 xmlGenericError(xmlGenericErrorContext,
4595 "HPP: lookup '%c' found at %d\n",
4596 first, base);
4597 else if (third == 0)
4598 xmlGenericError(xmlGenericErrorContext,
4599 "HPP: lookup '%c%c' found at %d\n",
4600 first, next, base);
4601 else
4602 xmlGenericError(xmlGenericErrorContext,
4603 "HPP: lookup '%c%c%c' found at %d\n",
4604 first, next, third, base);
4605#endif
4606 return(base - (in->cur - in->base));
4607 }
4608 }
4609 ctxt->checkIndex = base;
4610#ifdef DEBUG_PUSH
4611 if (next == 0)
4612 xmlGenericError(xmlGenericErrorContext,
4613 "HPP: lookup '%c' failed\n", first);
4614 else if (third == 0)
4615 xmlGenericError(xmlGenericErrorContext,
4616 "HPP: lookup '%c%c' failed\n", first, next);
4617 else
4618 xmlGenericError(xmlGenericErrorContext,
4619 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4620#endif
4621 return(-1);
4622}
4623
4624/**
4625 * htmlParseTryOrFinish:
4626 * @ctxt: an HTML parser context
4627 * @terminate: last chunk indicator
4628 *
4629 * Try to progress on parsing
4630 *
4631 * Returns zero if no parsing was possible
4632 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004633static int
Owen Taylor3473f882001-02-23 17:55:21 +00004634htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4635 int ret = 0;
4636 htmlParserInputPtr in;
4637 int avail = 0;
4638 xmlChar cur, next;
4639
4640#ifdef DEBUG_PUSH
4641 switch (ctxt->instate) {
4642 case XML_PARSER_EOF:
4643 xmlGenericError(xmlGenericErrorContext,
4644 "HPP: try EOF\n"); break;
4645 case XML_PARSER_START:
4646 xmlGenericError(xmlGenericErrorContext,
4647 "HPP: try START\n"); break;
4648 case XML_PARSER_MISC:
4649 xmlGenericError(xmlGenericErrorContext,
4650 "HPP: try MISC\n");break;
4651 case XML_PARSER_COMMENT:
4652 xmlGenericError(xmlGenericErrorContext,
4653 "HPP: try COMMENT\n");break;
4654 case XML_PARSER_PROLOG:
4655 xmlGenericError(xmlGenericErrorContext,
4656 "HPP: try PROLOG\n");break;
4657 case XML_PARSER_START_TAG:
4658 xmlGenericError(xmlGenericErrorContext,
4659 "HPP: try START_TAG\n");break;
4660 case XML_PARSER_CONTENT:
4661 xmlGenericError(xmlGenericErrorContext,
4662 "HPP: try CONTENT\n");break;
4663 case XML_PARSER_CDATA_SECTION:
4664 xmlGenericError(xmlGenericErrorContext,
4665 "HPP: try CDATA_SECTION\n");break;
4666 case XML_PARSER_END_TAG:
4667 xmlGenericError(xmlGenericErrorContext,
4668 "HPP: try END_TAG\n");break;
4669 case XML_PARSER_ENTITY_DECL:
4670 xmlGenericError(xmlGenericErrorContext,
4671 "HPP: try ENTITY_DECL\n");break;
4672 case XML_PARSER_ENTITY_VALUE:
4673 xmlGenericError(xmlGenericErrorContext,
4674 "HPP: try ENTITY_VALUE\n");break;
4675 case XML_PARSER_ATTRIBUTE_VALUE:
4676 xmlGenericError(xmlGenericErrorContext,
4677 "HPP: try ATTRIBUTE_VALUE\n");break;
4678 case XML_PARSER_DTD:
4679 xmlGenericError(xmlGenericErrorContext,
4680 "HPP: try DTD\n");break;
4681 case XML_PARSER_EPILOG:
4682 xmlGenericError(xmlGenericErrorContext,
4683 "HPP: try EPILOG\n");break;
4684 case XML_PARSER_PI:
4685 xmlGenericError(xmlGenericErrorContext,
4686 "HPP: try PI\n");break;
4687 case XML_PARSER_SYSTEM_LITERAL:
4688 xmlGenericError(xmlGenericErrorContext,
4689 "HPP: try SYSTEM_LITERAL\n");break;
4690 }
4691#endif
4692
4693 while (1) {
4694
4695 in = ctxt->input;
4696 if (in == NULL) break;
4697 if (in->buf == NULL)
4698 avail = in->length - (in->cur - in->base);
4699 else
4700 avail = in->buf->buffer->use - (in->cur - in->base);
4701 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004702 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004703 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4704 /*
4705 * SAX: end of the document processing.
4706 */
4707 ctxt->instate = XML_PARSER_EOF;
4708 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4709 ctxt->sax->endDocument(ctxt->userData);
4710 }
4711 }
4712 if (avail < 1)
4713 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004714 cur = in->cur[0];
4715 if (cur == 0) {
4716 SKIP(1);
4717 continue;
4718 }
4719
Owen Taylor3473f882001-02-23 17:55:21 +00004720 switch (ctxt->instate) {
4721 case XML_PARSER_EOF:
4722 /*
4723 * Document parsing is done !
4724 */
4725 goto done;
4726 case XML_PARSER_START:
4727 /*
4728 * Very first chars read from the document flow.
4729 */
4730 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004731 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004732 SKIP_BLANKS;
4733 if (in->buf == NULL)
4734 avail = in->length - (in->cur - in->base);
4735 else
4736 avail = in->buf->buffer->use - (in->cur - in->base);
4737 }
4738 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4739 ctxt->sax->setDocumentLocator(ctxt->userData,
4740 &xmlDefaultSAXLocator);
4741 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4742 (!ctxt->disableSAX))
4743 ctxt->sax->startDocument(ctxt->userData);
4744
4745 cur = in->cur[0];
4746 next = in->cur[1];
4747 if ((cur == '<') && (next == '!') &&
4748 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4749 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4750 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4751 (UPP(8) == 'E')) {
4752 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004753 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004754 goto done;
4755#ifdef DEBUG_PUSH
4756 xmlGenericError(xmlGenericErrorContext,
4757 "HPP: Parsing internal subset\n");
4758#endif
4759 htmlParseDocTypeDecl(ctxt);
4760 ctxt->instate = XML_PARSER_PROLOG;
4761#ifdef DEBUG_PUSH
4762 xmlGenericError(xmlGenericErrorContext,
4763 "HPP: entering PROLOG\n");
4764#endif
4765 } else {
4766 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004767#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004768 xmlGenericError(xmlGenericErrorContext,
4769 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004770#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004771 }
Owen Taylor3473f882001-02-23 17:55:21 +00004772 break;
4773 case XML_PARSER_MISC:
4774 SKIP_BLANKS;
4775 if (in->buf == NULL)
4776 avail = in->length - (in->cur - in->base);
4777 else
4778 avail = in->buf->buffer->use - (in->cur - in->base);
4779 if (avail < 2)
4780 goto done;
4781 cur = in->cur[0];
4782 next = in->cur[1];
4783 if ((cur == '<') && (next == '!') &&
4784 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4785 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004786 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004787 goto done;
4788#ifdef DEBUG_PUSH
4789 xmlGenericError(xmlGenericErrorContext,
4790 "HPP: Parsing Comment\n");
4791#endif
4792 htmlParseComment(ctxt);
4793 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004794 } else if ((cur == '<') && (next == '?')) {
4795 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004796 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004797 goto done;
4798#ifdef DEBUG_PUSH
4799 xmlGenericError(xmlGenericErrorContext,
4800 "HPP: Parsing PI\n");
4801#endif
4802 htmlParsePI(ctxt);
4803 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004804 } else if ((cur == '<') && (next == '!') &&
4805 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4806 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4807 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4808 (UPP(8) == 'E')) {
4809 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004810 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004811 goto done;
4812#ifdef DEBUG_PUSH
4813 xmlGenericError(xmlGenericErrorContext,
4814 "HPP: Parsing internal subset\n");
4815#endif
4816 htmlParseDocTypeDecl(ctxt);
4817 ctxt->instate = XML_PARSER_PROLOG;
4818#ifdef DEBUG_PUSH
4819 xmlGenericError(xmlGenericErrorContext,
4820 "HPP: entering PROLOG\n");
4821#endif
4822 } else if ((cur == '<') && (next == '!') &&
4823 (avail < 9)) {
4824 goto done;
4825 } else {
4826 ctxt->instate = XML_PARSER_START_TAG;
4827#ifdef DEBUG_PUSH
4828 xmlGenericError(xmlGenericErrorContext,
4829 "HPP: entering START_TAG\n");
4830#endif
4831 }
4832 break;
4833 case XML_PARSER_PROLOG:
4834 SKIP_BLANKS;
4835 if (in->buf == NULL)
4836 avail = in->length - (in->cur - in->base);
4837 else
4838 avail = in->buf->buffer->use - (in->cur - in->base);
4839 if (avail < 2)
4840 goto done;
4841 cur = in->cur[0];
4842 next = in->cur[1];
4843 if ((cur == '<') && (next == '!') &&
4844 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4845 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004846 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004847 goto done;
4848#ifdef DEBUG_PUSH
4849 xmlGenericError(xmlGenericErrorContext,
4850 "HPP: Parsing Comment\n");
4851#endif
4852 htmlParseComment(ctxt);
4853 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004854 } else if ((cur == '<') && (next == '?')) {
4855 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004856 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004857 goto done;
4858#ifdef DEBUG_PUSH
4859 xmlGenericError(xmlGenericErrorContext,
4860 "HPP: Parsing PI\n");
4861#endif
4862 htmlParsePI(ctxt);
4863 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004864 } else if ((cur == '<') && (next == '!') &&
4865 (avail < 4)) {
4866 goto done;
4867 } else {
4868 ctxt->instate = XML_PARSER_START_TAG;
4869#ifdef DEBUG_PUSH
4870 xmlGenericError(xmlGenericErrorContext,
4871 "HPP: entering START_TAG\n");
4872#endif
4873 }
4874 break;
4875 case XML_PARSER_EPILOG:
4876 if (in->buf == NULL)
4877 avail = in->length - (in->cur - in->base);
4878 else
4879 avail = in->buf->buffer->use - (in->cur - in->base);
4880 if (avail < 1)
4881 goto done;
4882 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004883 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004884 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004885 goto done;
4886 }
4887 if (avail < 2)
4888 goto done;
4889 next = in->cur[1];
4890 if ((cur == '<') && (next == '!') &&
4891 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4892 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004893 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004894 goto done;
4895#ifdef DEBUG_PUSH
4896 xmlGenericError(xmlGenericErrorContext,
4897 "HPP: Parsing Comment\n");
4898#endif
4899 htmlParseComment(ctxt);
4900 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004901 } else if ((cur == '<') && (next == '?')) {
4902 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004903 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004904 goto done;
4905#ifdef DEBUG_PUSH
4906 xmlGenericError(xmlGenericErrorContext,
4907 "HPP: Parsing PI\n");
4908#endif
4909 htmlParsePI(ctxt);
4910 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004911 } else if ((cur == '<') && (next == '!') &&
4912 (avail < 4)) {
4913 goto done;
4914 } else {
4915 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004916 ctxt->wellFormed = 0;
4917 ctxt->instate = XML_PARSER_EOF;
4918#ifdef DEBUG_PUSH
4919 xmlGenericError(xmlGenericErrorContext,
4920 "HPP: entering EOF\n");
4921#endif
4922 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4923 ctxt->sax->endDocument(ctxt->userData);
4924 goto done;
4925 }
4926 break;
4927 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004928 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004929 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00004930 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004931
4932 if (avail < 2)
4933 goto done;
4934 cur = in->cur[0];
4935 if (cur != '<') {
4936 ctxt->instate = XML_PARSER_CONTENT;
4937#ifdef DEBUG_PUSH
4938 xmlGenericError(xmlGenericErrorContext,
4939 "HPP: entering CONTENT\n");
4940#endif
4941 break;
4942 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004943 if (in->cur[1] == '/') {
4944 ctxt->instate = XML_PARSER_END_TAG;
4945 ctxt->checkIndex = 0;
4946#ifdef DEBUG_PUSH
4947 xmlGenericError(xmlGenericErrorContext,
4948 "HPP: entering END_TAG\n");
4949#endif
4950 break;
4951 }
Owen Taylor3473f882001-02-23 17:55:21 +00004952 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004953 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004954 goto done;
4955
Daniel Veillard597f1c12005-07-03 23:00:18 +00004956 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004957 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004958 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00004959 (name == NULL)) {
4960 if (CUR == '>')
4961 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004962 break;
4963 }
Owen Taylor3473f882001-02-23 17:55:21 +00004964
4965 /*
4966 * Lookup the info for that element.
4967 */
4968 info = htmlTagLookup(name);
4969 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004970 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4971 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004972 }
4973
4974 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004975 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004976 */
4977 if ((CUR == '/') && (NXT(1) == '>')) {
4978 SKIP(2);
4979 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4980 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004981 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004982 ctxt->instate = XML_PARSER_CONTENT;
4983#ifdef DEBUG_PUSH
4984 xmlGenericError(xmlGenericErrorContext,
4985 "HPP: entering CONTENT\n");
4986#endif
4987 break;
4988 }
4989
4990 if (CUR == '>') {
4991 NEXT;
4992 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004993 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4994 "Couldn't find end of Start Tag %s\n",
4995 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004996
4997 /*
4998 * end of parsing of this node.
4999 */
5000 if (xmlStrEqual(name, ctxt->name)) {
5001 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005002 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005003 }
5004
5005 ctxt->instate = XML_PARSER_CONTENT;
5006#ifdef DEBUG_PUSH
5007 xmlGenericError(xmlGenericErrorContext,
5008 "HPP: entering CONTENT\n");
5009#endif
5010 break;
5011 }
5012
5013 /*
5014 * Check for an Empty Element from DTD definition
5015 */
5016 if ((info != NULL) && (info->empty)) {
5017 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5018 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005019 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005020 }
5021 ctxt->instate = XML_PARSER_CONTENT;
5022#ifdef DEBUG_PUSH
5023 xmlGenericError(xmlGenericErrorContext,
5024 "HPP: entering CONTENT\n");
5025#endif
5026 break;
5027 }
5028 case XML_PARSER_CONTENT: {
5029 long cons;
5030 /*
5031 * Handle preparsed entities and charRef
5032 */
5033 if (ctxt->token != 0) {
5034 xmlChar chr[2] = { 0 , 0 } ;
5035
5036 chr[0] = (xmlChar) ctxt->token;
5037 htmlCheckParagraph(ctxt);
5038 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5039 ctxt->sax->characters(ctxt->userData, chr, 1);
5040 ctxt->token = 0;
5041 ctxt->checkIndex = 0;
5042 }
5043 if ((avail == 1) && (terminate)) {
5044 cur = in->cur[0];
5045 if ((cur != '<') && (cur != '&')) {
5046 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005047 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005048 if (ctxt->sax->ignorableWhitespace != NULL)
5049 ctxt->sax->ignorableWhitespace(
5050 ctxt->userData, &cur, 1);
5051 } else {
5052 htmlCheckParagraph(ctxt);
5053 if (ctxt->sax->characters != NULL)
5054 ctxt->sax->characters(
5055 ctxt->userData, &cur, 1);
5056 }
5057 }
5058 ctxt->token = 0;
5059 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005060 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005061 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005062 }
Owen Taylor3473f882001-02-23 17:55:21 +00005063 }
5064 if (avail < 2)
5065 goto done;
5066 cur = in->cur[0];
5067 next = in->cur[1];
5068 cons = ctxt->nbChars;
5069 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5070 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5071 /*
5072 * Handle SCRIPT/STYLE separately
5073 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005074 if (!terminate) {
5075 int idx;
5076 xmlChar val;
5077
Jiri Netolicky446e1262009-08-07 17:05:36 +02005078 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005079 if (idx < 0)
5080 goto done;
5081 val = in->cur[idx + 2];
5082 if (val == 0) /* bad cut of input */
5083 goto done;
5084 }
Owen Taylor3473f882001-02-23 17:55:21 +00005085 htmlParseScript(ctxt);
5086 if ((cur == '<') && (next == '/')) {
5087 ctxt->instate = XML_PARSER_END_TAG;
5088 ctxt->checkIndex = 0;
5089#ifdef DEBUG_PUSH
5090 xmlGenericError(xmlGenericErrorContext,
5091 "HPP: entering END_TAG\n");
5092#endif
5093 break;
5094 }
5095 } else {
5096 /*
5097 * Sometimes DOCTYPE arrives in the middle of the document
5098 */
5099 if ((cur == '<') && (next == '!') &&
5100 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5101 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5102 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5103 (UPP(8) == 'E')) {
5104 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005105 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005106 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005107 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5108 "Misplaced DOCTYPE declaration\n",
5109 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005110 htmlParseDocTypeDecl(ctxt);
5111 } else if ((cur == '<') && (next == '!') &&
5112 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5113 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005114 (htmlParseLookupSequence(
Jiri Netolicky446e1262009-08-07 17:05:36 +02005115 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005116 goto done;
5117#ifdef DEBUG_PUSH
5118 xmlGenericError(xmlGenericErrorContext,
5119 "HPP: Parsing Comment\n");
5120#endif
5121 htmlParseComment(ctxt);
5122 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005123 } else if ((cur == '<') && (next == '?')) {
5124 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005125 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005126 goto done;
5127#ifdef DEBUG_PUSH
5128 xmlGenericError(xmlGenericErrorContext,
5129 "HPP: Parsing PI\n");
5130#endif
5131 htmlParsePI(ctxt);
5132 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005133 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5134 goto done;
5135 } else if ((cur == '<') && (next == '/')) {
5136 ctxt->instate = XML_PARSER_END_TAG;
5137 ctxt->checkIndex = 0;
5138#ifdef DEBUG_PUSH
5139 xmlGenericError(xmlGenericErrorContext,
5140 "HPP: entering END_TAG\n");
5141#endif
5142 break;
5143 } else if (cur == '<') {
5144 ctxt->instate = XML_PARSER_START_TAG;
5145 ctxt->checkIndex = 0;
5146#ifdef DEBUG_PUSH
5147 xmlGenericError(xmlGenericErrorContext,
5148 "HPP: entering START_TAG\n");
5149#endif
5150 break;
5151 } else if (cur == '&') {
5152 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005153 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005154 goto done;
5155#ifdef DEBUG_PUSH
5156 xmlGenericError(xmlGenericErrorContext,
5157 "HPP: Parsing Reference\n");
5158#endif
5159 /* TODO: check generation of subtrees if noent !!! */
5160 htmlParseReference(ctxt);
5161 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005162 /*
5163 * check that the text sequence is complete
5164 * before handing out the data to the parser
5165 * to avoid problems with erroneous end of
5166 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005167 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005168 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005169 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005170 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005171 ctxt->checkIndex = 0;
5172#ifdef DEBUG_PUSH
5173 xmlGenericError(xmlGenericErrorContext,
5174 "HPP: Parsing char data\n");
5175#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005176 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005177 }
5178 }
5179 if (cons == ctxt->nbChars) {
5180 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005181 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5182 "detected an error in element content\n",
5183 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005184 }
5185 NEXT;
5186 break;
5187 }
5188
5189 break;
5190 }
5191 case XML_PARSER_END_TAG:
5192 if (avail < 2)
5193 goto done;
5194 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005195 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005196 goto done;
5197 htmlParseEndTag(ctxt);
5198 if (ctxt->nameNr == 0) {
5199 ctxt->instate = XML_PARSER_EPILOG;
5200 } else {
5201 ctxt->instate = XML_PARSER_CONTENT;
5202 }
5203 ctxt->checkIndex = 0;
5204#ifdef DEBUG_PUSH
5205 xmlGenericError(xmlGenericErrorContext,
5206 "HPP: entering CONTENT\n");
5207#endif
5208 break;
5209 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005210 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5211 "HPP: internal error, state == CDATA\n",
5212 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005213 ctxt->instate = XML_PARSER_CONTENT;
5214 ctxt->checkIndex = 0;
5215#ifdef DEBUG_PUSH
5216 xmlGenericError(xmlGenericErrorContext,
5217 "HPP: entering CONTENT\n");
5218#endif
5219 break;
5220 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005221 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5222 "HPP: internal error, state == DTD\n",
5223 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005224 ctxt->instate = XML_PARSER_CONTENT;
5225 ctxt->checkIndex = 0;
5226#ifdef DEBUG_PUSH
5227 xmlGenericError(xmlGenericErrorContext,
5228 "HPP: entering CONTENT\n");
5229#endif
5230 break;
5231 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005232 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5233 "HPP: internal error, state == COMMENT\n",
5234 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005235 ctxt->instate = XML_PARSER_CONTENT;
5236 ctxt->checkIndex = 0;
5237#ifdef DEBUG_PUSH
5238 xmlGenericError(xmlGenericErrorContext,
5239 "HPP: entering CONTENT\n");
5240#endif
5241 break;
5242 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005243 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5244 "HPP: internal error, state == PI\n",
5245 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005246 ctxt->instate = XML_PARSER_CONTENT;
5247 ctxt->checkIndex = 0;
5248#ifdef DEBUG_PUSH
5249 xmlGenericError(xmlGenericErrorContext,
5250 "HPP: entering CONTENT\n");
5251#endif
5252 break;
5253 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005254 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5255 "HPP: internal error, state == ENTITY_DECL\n",
5256 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005257 ctxt->instate = XML_PARSER_CONTENT;
5258 ctxt->checkIndex = 0;
5259#ifdef DEBUG_PUSH
5260 xmlGenericError(xmlGenericErrorContext,
5261 "HPP: entering CONTENT\n");
5262#endif
5263 break;
5264 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005265 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5266 "HPP: internal error, state == ENTITY_VALUE\n",
5267 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005268 ctxt->instate = XML_PARSER_CONTENT;
5269 ctxt->checkIndex = 0;
5270#ifdef DEBUG_PUSH
5271 xmlGenericError(xmlGenericErrorContext,
5272 "HPP: entering DTD\n");
5273#endif
5274 break;
5275 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005276 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5277 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5278 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005279 ctxt->instate = XML_PARSER_START_TAG;
5280 ctxt->checkIndex = 0;
5281#ifdef DEBUG_PUSH
5282 xmlGenericError(xmlGenericErrorContext,
5283 "HPP: entering START_TAG\n");
5284#endif
5285 break;
5286 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005287 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5288 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5289 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005290 ctxt->instate = XML_PARSER_CONTENT;
5291 ctxt->checkIndex = 0;
5292#ifdef DEBUG_PUSH
5293 xmlGenericError(xmlGenericErrorContext,
5294 "HPP: entering CONTENT\n");
5295#endif
5296 break;
5297 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005298 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5299 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5300 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005301 ctxt->instate = XML_PARSER_CONTENT;
5302 ctxt->checkIndex = 0;
5303#ifdef DEBUG_PUSH
5304 xmlGenericError(xmlGenericErrorContext,
5305 "HPP: entering CONTENT\n");
5306#endif
5307 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005308 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005309 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5310 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5311 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005312 ctxt->instate = XML_PARSER_CONTENT;
5313 ctxt->checkIndex = 0;
5314#ifdef DEBUG_PUSH
5315 xmlGenericError(xmlGenericErrorContext,
5316 "HPP: entering CONTENT\n");
5317#endif
5318 break;
5319
Owen Taylor3473f882001-02-23 17:55:21 +00005320 }
5321 }
5322done:
5323 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005324 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005325 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5326 /*
5327 * SAX: end of the document processing.
5328 */
5329 ctxt->instate = XML_PARSER_EOF;
5330 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5331 ctxt->sax->endDocument(ctxt->userData);
5332 }
5333 }
5334 if ((ctxt->myDoc != NULL) &&
5335 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5336 (ctxt->instate == XML_PARSER_EPILOG))) {
5337 xmlDtdPtr dtd;
5338 dtd = xmlGetIntSubset(ctxt->myDoc);
5339 if (dtd == NULL)
5340 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005341 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005342 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5343 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5344 }
5345#ifdef DEBUG_PUSH
5346 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5347#endif
5348 return(ret);
5349}
5350
5351/**
Owen Taylor3473f882001-02-23 17:55:21 +00005352 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005353 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005354 * @chunk: an char array
5355 * @size: the size in byte of the chunk
5356 * @terminate: last chunk indicator
5357 *
5358 * Parse a Chunk of memory
5359 *
5360 * Returns zero if no error, the xmlParserErrors otherwise.
5361 */
5362int
5363htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5364 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005365 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5366 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5367 "htmlParseChunk: context error\n", NULL, NULL);
5368 return(XML_ERR_INTERNAL_ERROR);
5369 }
Owen Taylor3473f882001-02-23 17:55:21 +00005370 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5371 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5372 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5373 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005374 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005375
Daniel Veillardd2755a82005-08-07 23:42:39 +00005376 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5377 if (res < 0) {
5378 ctxt->errNo = XML_PARSER_EOF;
5379 ctxt->disableSAX = 1;
5380 return (XML_PARSER_EOF);
5381 }
Owen Taylor3473f882001-02-23 17:55:21 +00005382 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5383 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005384 ctxt->input->end =
5385 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005386#ifdef DEBUG_PUSH
5387 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5388#endif
5389
Daniel Veillard14f752c2003-08-09 11:44:50 +00005390#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005391 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5392 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005393#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005394 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005395 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5396 xmlParserInputBufferPtr in = ctxt->input->buf;
5397 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5398 (in->raw != NULL)) {
5399 int nbchars;
5400
5401 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5402 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005403 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5404 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005405 return(XML_ERR_INVALID_ENCODING);
5406 }
5407 }
5408 }
Owen Taylor3473f882001-02-23 17:55:21 +00005409 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005410 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005411 if (terminate) {
5412 if ((ctxt->instate != XML_PARSER_EOF) &&
5413 (ctxt->instate != XML_PARSER_EPILOG) &&
5414 (ctxt->instate != XML_PARSER_MISC)) {
5415 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005416 ctxt->wellFormed = 0;
5417 }
5418 if (ctxt->instate != XML_PARSER_EOF) {
5419 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5420 ctxt->sax->endDocument(ctxt->userData);
5421 }
5422 ctxt->instate = XML_PARSER_EOF;
5423 }
5424 return((xmlParserErrors) ctxt->errNo);
5425}
5426
5427/************************************************************************
5428 * *
5429 * User entry points *
5430 * *
5431 ************************************************************************/
5432
5433/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005434 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005435 * @sax: a SAX handler
5436 * @user_data: The user data returned on SAX callbacks
5437 * @chunk: a pointer to an array of chars
5438 * @size: number of chars in the array
5439 * @filename: an optional file name or URI
5440 * @enc: an optional encoding
5441 *
5442 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005443 * The value of @filename is used for fetching external entities
5444 * and error/warning reports.
5445 *
5446 * Returns the new parser context or NULL
5447 */
5448htmlParserCtxtPtr
5449htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5450 const char *chunk, int size, const char *filename,
5451 xmlCharEncoding enc) {
5452 htmlParserCtxtPtr ctxt;
5453 htmlParserInputPtr inputStream;
5454 xmlParserInputBufferPtr buf;
5455
Daniel Veillardd0463562001-10-13 09:15:48 +00005456 xmlInitParser();
5457
Owen Taylor3473f882001-02-23 17:55:21 +00005458 buf = xmlAllocParserInputBuffer(enc);
5459 if (buf == NULL) return(NULL);
5460
Daniel Veillardf403d292003-10-05 13:51:35 +00005461 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005462 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005463 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005464 return(NULL);
5465 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005466 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5467 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005468 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005469 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005470 xmlFree(ctxt->sax);
5471 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5472 if (ctxt->sax == NULL) {
5473 xmlFree(buf);
5474 xmlFree(ctxt);
5475 return(NULL);
5476 }
5477 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5478 if (user_data != NULL)
5479 ctxt->userData = user_data;
5480 }
5481 if (filename == NULL) {
5482 ctxt->directory = NULL;
5483 } else {
5484 ctxt->directory = xmlParserGetDirectory(filename);
5485 }
5486
5487 inputStream = htmlNewInputStream(ctxt);
5488 if (inputStream == NULL) {
5489 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005490 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005491 return(NULL);
5492 }
5493
5494 if (filename == NULL)
5495 inputStream->filename = NULL;
5496 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005497 inputStream->filename = (char *)
5498 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005499 inputStream->buf = buf;
5500 inputStream->base = inputStream->buf->buffer->content;
5501 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005502 inputStream->end =
5503 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005504
5505 inputPush(ctxt, inputStream);
5506
5507 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5508 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005509 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5510 int cur = ctxt->input->cur - ctxt->input->base;
5511
Owen Taylor3473f882001-02-23 17:55:21 +00005512 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005513
5514 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5515 ctxt->input->cur = ctxt->input->base + cur;
5516 ctxt->input->end =
5517 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005518#ifdef DEBUG_PUSH
5519 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5520#endif
5521 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005522 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005523
5524 return(ctxt);
5525}
William M. Brack21e4ef22005-01-02 09:53:13 +00005526#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005527
5528/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005529 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005530 * @cur: a pointer to an array of xmlChar
5531 * @encoding: a free form C string describing the HTML document encoding, or NULL
5532 * @sax: the SAX handler block
5533 * @userData: if using SAX, this pointer will be provided on callbacks.
5534 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005535 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5536 * to handle parse events. If sax is NULL, fallback to the default DOM
5537 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005538 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005539 * Returns the resulting document tree unless SAX is NULL or the document is
5540 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005541 */
5542
5543htmlDocPtr
5544htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5545 htmlDocPtr ret;
5546 htmlParserCtxtPtr ctxt;
5547
Daniel Veillardd0463562001-10-13 09:15:48 +00005548 xmlInitParser();
5549
Owen Taylor3473f882001-02-23 17:55:21 +00005550 if (cur == NULL) return(NULL);
5551
5552
5553 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5554 if (ctxt == NULL) return(NULL);
5555 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005556 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005557 ctxt->sax = sax;
5558 ctxt->userData = userData;
5559 }
5560
5561 htmlParseDocument(ctxt);
5562 ret = ctxt->myDoc;
5563 if (sax != NULL) {
5564 ctxt->sax = NULL;
5565 ctxt->userData = NULL;
5566 }
5567 htmlFreeParserCtxt(ctxt);
5568
5569 return(ret);
5570}
5571
5572/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005573 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005574 * @cur: a pointer to an array of xmlChar
5575 * @encoding: a free form C string describing the HTML document encoding, or NULL
5576 *
5577 * parse an HTML in-memory document and build a tree.
5578 *
5579 * Returns the resulting document tree
5580 */
5581
5582htmlDocPtr
5583htmlParseDoc(xmlChar *cur, const char *encoding) {
5584 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5585}
5586
5587
5588/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005589 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005590 * @filename: the filename
5591 * @encoding: a free form C string describing the HTML document encoding, or NULL
5592 *
5593 * Create a parser context for a file content.
5594 * Automatic support for ZLIB/Compress compressed document is provided
5595 * by default if found at compile-time.
5596 *
5597 * Returns the new parser context or NULL
5598 */
5599htmlParserCtxtPtr
5600htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5601{
5602 htmlParserCtxtPtr ctxt;
5603 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005604 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005605 /* htmlCharEncoding enc; */
5606 xmlChar *content, *content_line = (xmlChar *) "charset=";
5607
Daniel Veillarda03e3652004-11-02 18:45:30 +00005608 if (filename == NULL)
5609 return(NULL);
5610
Daniel Veillardf403d292003-10-05 13:51:35 +00005611 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005612 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005613 return(NULL);
5614 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005615 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5616 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005617#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005618 if (xmlDefaultSAXHandler.error != NULL) {
5619 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5620 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005621#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005622 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005623 return(NULL);
5624 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005625
5626 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5627 xmlFree(canonicFilename);
5628 if (inputStream == NULL) {
5629 xmlFreeParserCtxt(ctxt);
5630 return(NULL);
5631 }
Owen Taylor3473f882001-02-23 17:55:21 +00005632
5633 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005634
Owen Taylor3473f882001-02-23 17:55:21 +00005635 /* set encoding */
5636 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005637 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005638 if (content) {
5639 strcpy ((char *)content, (char *)content_line);
5640 strcat ((char *)content, (char *)encoding);
5641 htmlCheckEncoding (ctxt, content);
5642 xmlFree (content);
5643 }
5644 }
5645
5646 return(ctxt);
5647}
5648
5649/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005650 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005651 * @filename: the filename
5652 * @encoding: a free form C string describing the HTML document encoding, or NULL
5653 * @sax: the SAX handler block
5654 * @userData: if using SAX, this pointer will be provided on callbacks.
5655 *
5656 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5657 * compressed document is provided by default if found at compile-time.
5658 * It use the given SAX function block to handle the parsing callback.
5659 * If sax is NULL, fallback to the default DOM tree building routines.
5660 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005661 * Returns the resulting document tree unless SAX is NULL or the document is
5662 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005663 */
5664
5665htmlDocPtr
5666htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5667 void *userData) {
5668 htmlDocPtr ret;
5669 htmlParserCtxtPtr ctxt;
5670 htmlSAXHandlerPtr oldsax = NULL;
5671
Daniel Veillardd0463562001-10-13 09:15:48 +00005672 xmlInitParser();
5673
Owen Taylor3473f882001-02-23 17:55:21 +00005674 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5675 if (ctxt == NULL) return(NULL);
5676 if (sax != NULL) {
5677 oldsax = ctxt->sax;
5678 ctxt->sax = sax;
5679 ctxt->userData = userData;
5680 }
5681
5682 htmlParseDocument(ctxt);
5683
5684 ret = ctxt->myDoc;
5685 if (sax != NULL) {
5686 ctxt->sax = oldsax;
5687 ctxt->userData = NULL;
5688 }
5689 htmlFreeParserCtxt(ctxt);
5690
5691 return(ret);
5692}
5693
5694/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005695 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005696 * @filename: the filename
5697 * @encoding: a free form C string describing the HTML document encoding, or NULL
5698 *
5699 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5700 * compressed document is provided by default if found at compile-time.
5701 *
5702 * Returns the resulting document tree
5703 */
5704
5705htmlDocPtr
5706htmlParseFile(const char *filename, const char *encoding) {
5707 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5708}
5709
5710/**
5711 * htmlHandleOmittedElem:
5712 * @val: int 0 or 1
5713 *
5714 * Set and return the previous value for handling HTML omitted tags.
5715 *
5716 * Returns the last value for 0 for no handling, 1 for auto insertion.
5717 */
5718
5719int
5720htmlHandleOmittedElem(int val) {
5721 int old = htmlOmittedDefaultValue;
5722
5723 htmlOmittedDefaultValue = val;
5724 return(old);
5725}
5726
Daniel Veillard930dfb62003-02-05 10:17:38 +00005727/**
5728 * htmlElementAllowedHere:
5729 * @parent: HTML parent element
5730 * @elt: HTML element
5731 *
5732 * Checks whether an HTML element may be a direct child of a parent element.
5733 * Note - doesn't check for deprecated elements
5734 *
5735 * Returns 1 if allowed; 0 otherwise.
5736 */
5737int
5738htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5739 const char** p ;
5740
5741 if ( ! elt || ! parent || ! parent->subelts )
5742 return 0 ;
5743
5744 for ( p = parent->subelts; *p; ++p )
5745 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5746 return 1 ;
5747
5748 return 0 ;
5749}
5750/**
5751 * htmlElementStatusHere:
5752 * @parent: HTML parent element
5753 * @elt: HTML element
5754 *
5755 * Checks whether an HTML element may be a direct child of a parent element.
5756 * and if so whether it is valid or deprecated.
5757 *
5758 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5759 */
5760htmlStatus
5761htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5762 if ( ! parent || ! elt )
5763 return HTML_INVALID ;
5764 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5765 return HTML_INVALID ;
5766
5767 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5768}
5769/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005770 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005771 * @elt: HTML element
5772 * @attr: HTML attribute
5773 * @legacy: whether to allow deprecated attributes
5774 *
5775 * Checks whether an attribute is valid for an element
5776 * Has full knowledge of Required and Deprecated attributes
5777 *
5778 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5779 */
5780htmlStatus
5781htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5782 const char** p ;
5783
5784 if ( !elt || ! attr )
5785 return HTML_INVALID ;
5786
5787 if ( elt->attrs_req )
5788 for ( p = elt->attrs_req; *p; ++p)
5789 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5790 return HTML_REQUIRED ;
5791
5792 if ( elt->attrs_opt )
5793 for ( p = elt->attrs_opt; *p; ++p)
5794 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5795 return HTML_VALID ;
5796
5797 if ( legacy && elt->attrs_depr )
5798 for ( p = elt->attrs_depr; *p; ++p)
5799 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5800 return HTML_DEPRECATED ;
5801
5802 return HTML_INVALID ;
5803}
5804/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005805 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005806 * @node: an htmlNodePtr in a tree
5807 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005808 * for Element nodes)
5809 *
5810 * Checks whether the tree node is valid. Experimental (the author
5811 * only uses the HTML enhancements in a SAX parser)
5812 *
5813 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5814 * legacy allowed) or htmlElementStatusHere (otherwise).
5815 * for Attribute nodes, a return from htmlAttrAllowed
5816 * for other nodes, HTML_NA (no checks performed)
5817 */
5818htmlStatus
5819htmlNodeStatus(const htmlNodePtr node, int legacy) {
5820 if ( ! node )
5821 return HTML_INVALID ;
5822
5823 switch ( node->type ) {
5824 case XML_ELEMENT_NODE:
5825 return legacy
5826 ? ( htmlElementAllowedHere (
5827 htmlTagLookup(node->parent->name) , node->name
5828 ) ? HTML_VALID : HTML_INVALID )
5829 : htmlElementStatusHere(
5830 htmlTagLookup(node->parent->name) ,
5831 htmlTagLookup(node->name) )
5832 ;
5833 case XML_ATTRIBUTE_NODE:
5834 return htmlAttrAllowed(
5835 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5836 default: return HTML_NA ;
5837 }
5838}
Daniel Veillard9475a352003-09-26 12:47:50 +00005839/************************************************************************
5840 * *
5841 * New set (2.6.0) of simpler and more flexible APIs *
5842 * *
5843 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. The expansion refers to a variable named dict, which
 * must be visible wherever the macro is used.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe inside unbraced if/else); callers supply the
 * terminating semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5855
5856/**
5857 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005858 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005859 *
5860 * Reset a parser context
5861 */
5862void
5863htmlCtxtReset(htmlParserCtxtPtr ctxt)
5864{
5865 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005866 xmlDictPtr dict;
5867
5868 if (ctxt == NULL)
5869 return;
5870
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005871 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005872 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005873
5874 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5875 xmlFreeInputStream(input);
5876 }
5877 ctxt->inputNr = 0;
5878 ctxt->input = NULL;
5879
5880 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005881 if (ctxt->spaceTab != NULL) {
5882 ctxt->spaceTab[0] = -1;
5883 ctxt->space = &ctxt->spaceTab[0];
5884 } else {
5885 ctxt->space = NULL;
5886 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005887
5888
5889 ctxt->nodeNr = 0;
5890 ctxt->node = NULL;
5891
5892 ctxt->nameNr = 0;
5893 ctxt->name = NULL;
5894
5895 DICT_FREE(ctxt->version);
5896 ctxt->version = NULL;
5897 DICT_FREE(ctxt->encoding);
5898 ctxt->encoding = NULL;
5899 DICT_FREE(ctxt->directory);
5900 ctxt->directory = NULL;
5901 DICT_FREE(ctxt->extSubURI);
5902 ctxt->extSubURI = NULL;
5903 DICT_FREE(ctxt->extSubSystem);
5904 ctxt->extSubSystem = NULL;
5905 if (ctxt->myDoc != NULL)
5906 xmlFreeDoc(ctxt->myDoc);
5907 ctxt->myDoc = NULL;
5908
5909 ctxt->standalone = -1;
5910 ctxt->hasExternalSubset = 0;
5911 ctxt->hasPErefs = 0;
5912 ctxt->html = 1;
5913 ctxt->external = 0;
5914 ctxt->instate = XML_PARSER_START;
5915 ctxt->token = 0;
5916
5917 ctxt->wellFormed = 1;
5918 ctxt->nsWellFormed = 1;
5919 ctxt->valid = 1;
5920 ctxt->vctxt.userData = ctxt;
5921 ctxt->vctxt.error = xmlParserValidityError;
5922 ctxt->vctxt.warning = xmlParserValidityWarning;
5923 ctxt->record_info = 0;
5924 ctxt->nbChars = 0;
5925 ctxt->checkIndex = 0;
5926 ctxt->inSubset = 0;
5927 ctxt->errNo = XML_ERR_OK;
5928 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00005929 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00005930 ctxt->catalogs = NULL;
5931 xmlInitNodeInfoSeq(&ctxt->node_seq);
5932
5933 if (ctxt->attsDefault != NULL) {
5934 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5935 ctxt->attsDefault = NULL;
5936 }
5937 if (ctxt->attsSpecial != NULL) {
5938 xmlHashFree(ctxt->attsSpecial, NULL);
5939 ctxt->attsSpecial = NULL;
5940 }
5941}
5942
5943/**
5944 * htmlCtxtUseOptions:
5945 * @ctxt: an HTML parser context
5946 * @options: a combination of htmlParserOption(s)
5947 *
5948 * Applies the options to the parser context
5949 *
5950 * Returns 0 in case of success, the set of unknown or unimplemented options
5951 * in case of error.
5952 */
5953int
5954htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5955{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005956 if (ctxt == NULL)
5957 return(-1);
5958
Daniel Veillard9475a352003-09-26 12:47:50 +00005959 if (options & HTML_PARSE_NOWARNING) {
5960 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005961 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005962 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005963 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005964 }
5965 if (options & HTML_PARSE_NOERROR) {
5966 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005967 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005968 ctxt->sax->fatalError = NULL;
5969 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005970 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005971 }
5972 if (options & HTML_PARSE_PEDANTIC) {
5973 ctxt->pedantic = 1;
5974 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005975 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005976 } else
5977 ctxt->pedantic = 0;
5978 if (options & XML_PARSE_NOBLANKS) {
5979 ctxt->keepBlanks = 0;
5980 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5981 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005982 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005983 } else
5984 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005985 if (options & HTML_PARSE_RECOVER) {
5986 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00005987 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005988 } else
5989 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00005990 if (options & HTML_PARSE_COMPACT) {
5991 ctxt->options |= HTML_PARSE_COMPACT;
5992 options -= HTML_PARSE_COMPACT;
5993 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005994 ctxt->dictNames = 0;
5995 return (options);
5996}
5997
5998/**
5999 * htmlDoRead:
6000 * @ctxt: an HTML parser context
6001 * @URL: the base URL to use for the document
6002 * @encoding: the document encoding, or NULL
6003 * @options: a combination of htmlParserOption(s)
6004 * @reuse: keep the context for reuse
6005 *
6006 * Common front-end for the htmlRead functions
6007 *
6008 * Returns the resulting document tree or NULL
6009 */
6010static htmlDocPtr
6011htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6012 int options, int reuse)
6013{
6014 htmlDocPtr ret;
6015
6016 htmlCtxtUseOptions(ctxt, options);
6017 ctxt->html = 1;
6018 if (encoding != NULL) {
6019 xmlCharEncodingHandlerPtr hdlr;
6020
6021 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006022 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006023 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006024 if (ctxt->input->encoding != NULL)
6025 xmlFree((xmlChar *) ctxt->input->encoding);
6026 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6027 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006028 }
6029 if ((URL != NULL) && (ctxt->input != NULL) &&
6030 (ctxt->input->filename == NULL))
6031 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6032 htmlParseDocument(ctxt);
6033 ret = ctxt->myDoc;
6034 ctxt->myDoc = NULL;
6035 if (!reuse) {
6036 if ((ctxt->dictNames) &&
6037 (ret != NULL) &&
6038 (ret->dict == ctxt->dict))
6039 ctxt->dict = NULL;
6040 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006041 }
6042 return (ret);
6043}
6044
6045/**
6046 * htmlReadDoc:
6047 * @cur: a pointer to a zero terminated string
6048 * @URL: the base URL to use for the document
6049 * @encoding: the document encoding, or NULL
6050 * @options: a combination of htmlParserOption(s)
6051 *
6052 * parse an XML in-memory document and build a tree.
6053 *
6054 * Returns the resulting document tree
6055 */
6056htmlDocPtr
6057htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6058{
6059 htmlParserCtxtPtr ctxt;
6060
6061 if (cur == NULL)
6062 return (NULL);
6063
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006064 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006065 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006066 if (ctxt == NULL)
6067 return (NULL);
6068 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6069}
6070
6071/**
6072 * htmlReadFile:
6073 * @filename: a file or URL
6074 * @encoding: the document encoding, or NULL
6075 * @options: a combination of htmlParserOption(s)
6076 *
6077 * parse an XML file from the filesystem or the network.
6078 *
6079 * Returns the resulting document tree
6080 */
6081htmlDocPtr
6082htmlReadFile(const char *filename, const char *encoding, int options)
6083{
6084 htmlParserCtxtPtr ctxt;
6085
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006086 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006087 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6088 if (ctxt == NULL)
6089 return (NULL);
6090 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6091}
6092
6093/**
6094 * htmlReadMemory:
6095 * @buffer: a pointer to a char array
6096 * @size: the size of the array
6097 * @URL: the base URL to use for the document
6098 * @encoding: the document encoding, or NULL
6099 * @options: a combination of htmlParserOption(s)
6100 *
6101 * parse an XML in-memory document and build a tree.
6102 *
6103 * Returns the resulting document tree
6104 */
6105htmlDocPtr
6106htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6107{
6108 htmlParserCtxtPtr ctxt;
6109
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006110 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006111 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6112 if (ctxt == NULL)
6113 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006114 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006115 if (ctxt->sax != NULL)
6116 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006117 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6118}
6119
6120/**
6121 * htmlReadFd:
6122 * @fd: an open file descriptor
6123 * @URL: the base URL to use for the document
6124 * @encoding: the document encoding, or NULL
6125 * @options: a combination of htmlParserOption(s)
6126 *
6127 * parse an XML from a file descriptor and build a tree.
6128 *
6129 * Returns the resulting document tree
6130 */
6131htmlDocPtr
6132htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6133{
6134 htmlParserCtxtPtr ctxt;
6135 xmlParserInputBufferPtr input;
6136 xmlParserInputPtr stream;
6137
6138 if (fd < 0)
6139 return (NULL);
6140
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006141 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006142 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6143 if (input == NULL)
6144 return (NULL);
6145 ctxt = xmlNewParserCtxt();
6146 if (ctxt == NULL) {
6147 xmlFreeParserInputBuffer(input);
6148 return (NULL);
6149 }
6150 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6151 if (stream == NULL) {
6152 xmlFreeParserInputBuffer(input);
6153 xmlFreeParserCtxt(ctxt);
6154 return (NULL);
6155 }
6156 inputPush(ctxt, stream);
6157 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6158}
6159
6160/**
6161 * htmlReadIO:
6162 * @ioread: an I/O read function
6163 * @ioclose: an I/O close function
6164 * @ioctx: an I/O handler
6165 * @URL: the base URL to use for the document
6166 * @encoding: the document encoding, or NULL
6167 * @options: a combination of htmlParserOption(s)
6168 *
6169 * parse an HTML document from I/O functions and source and build a tree.
6170 *
6171 * Returns the resulting document tree
6172 */
6173htmlDocPtr
6174htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6175 void *ioctx, const char *URL, const char *encoding, int options)
6176{
6177 htmlParserCtxtPtr ctxt;
6178 xmlParserInputBufferPtr input;
6179 xmlParserInputPtr stream;
6180
6181 if (ioread == NULL)
6182 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006183 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006184
6185 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6186 XML_CHAR_ENCODING_NONE);
6187 if (input == NULL)
6188 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006189 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006190 if (ctxt == NULL) {
6191 xmlFreeParserInputBuffer(input);
6192 return (NULL);
6193 }
6194 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6195 if (stream == NULL) {
6196 xmlFreeParserInputBuffer(input);
6197 xmlFreeParserCtxt(ctxt);
6198 return (NULL);
6199 }
6200 inputPush(ctxt, stream);
6201 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6202}
6203
6204/**
6205 * htmlCtxtReadDoc:
6206 * @ctxt: an HTML parser context
6207 * @cur: a pointer to a zero terminated string
6208 * @URL: the base URL to use for the document
6209 * @encoding: the document encoding, or NULL
6210 * @options: a combination of htmlParserOption(s)
6211 *
6212 * parse an XML in-memory document and build a tree.
6213 * This reuses the existing @ctxt parser context
6214 *
6215 * Returns the resulting document tree
6216 */
6217htmlDocPtr
6218htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6219 const char *URL, const char *encoding, int options)
6220{
6221 xmlParserInputPtr stream;
6222
6223 if (cur == NULL)
6224 return (NULL);
6225 if (ctxt == NULL)
6226 return (NULL);
6227
6228 htmlCtxtReset(ctxt);
6229
6230 stream = xmlNewStringInputStream(ctxt, cur);
6231 if (stream == NULL) {
6232 return (NULL);
6233 }
6234 inputPush(ctxt, stream);
6235 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6236}
6237
6238/**
6239 * htmlCtxtReadFile:
6240 * @ctxt: an HTML parser context
6241 * @filename: a file or URL
6242 * @encoding: the document encoding, or NULL
6243 * @options: a combination of htmlParserOption(s)
6244 *
6245 * parse an XML file from the filesystem or the network.
6246 * This reuses the existing @ctxt parser context
6247 *
6248 * Returns the resulting document tree
6249 */
6250htmlDocPtr
6251htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6252 const char *encoding, int options)
6253{
6254 xmlParserInputPtr stream;
6255
6256 if (filename == NULL)
6257 return (NULL);
6258 if (ctxt == NULL)
6259 return (NULL);
6260
6261 htmlCtxtReset(ctxt);
6262
Daniel Veillard29614c72004-11-26 10:47:26 +00006263 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006264 if (stream == NULL) {
6265 return (NULL);
6266 }
6267 inputPush(ctxt, stream);
6268 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6269}
6270
6271/**
6272 * htmlCtxtReadMemory:
6273 * @ctxt: an HTML parser context
6274 * @buffer: a pointer to a char array
6275 * @size: the size of the array
6276 * @URL: the base URL to use for the document
6277 * @encoding: the document encoding, or NULL
6278 * @options: a combination of htmlParserOption(s)
6279 *
6280 * parse an XML in-memory document and build a tree.
6281 * This reuses the existing @ctxt parser context
6282 *
6283 * Returns the resulting document tree
6284 */
6285htmlDocPtr
6286htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6287 const char *URL, const char *encoding, int options)
6288{
6289 xmlParserInputBufferPtr input;
6290 xmlParserInputPtr stream;
6291
6292 if (ctxt == NULL)
6293 return (NULL);
6294 if (buffer == NULL)
6295 return (NULL);
6296
6297 htmlCtxtReset(ctxt);
6298
6299 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6300 if (input == NULL) {
6301 return(NULL);
6302 }
6303
6304 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6305 if (stream == NULL) {
6306 xmlFreeParserInputBuffer(input);
6307 return(NULL);
6308 }
6309
6310 inputPush(ctxt, stream);
6311 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6312}
6313
6314/**
6315 * htmlCtxtReadFd:
6316 * @ctxt: an HTML parser context
6317 * @fd: an open file descriptor
6318 * @URL: the base URL to use for the document
6319 * @encoding: the document encoding, or NULL
6320 * @options: a combination of htmlParserOption(s)
6321 *
6322 * parse an XML from a file descriptor and build a tree.
6323 * This reuses the existing @ctxt parser context
6324 *
6325 * Returns the resulting document tree
6326 */
6327htmlDocPtr
6328htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6329 const char *URL, const char *encoding, int options)
6330{
6331 xmlParserInputBufferPtr input;
6332 xmlParserInputPtr stream;
6333
6334 if (fd < 0)
6335 return (NULL);
6336 if (ctxt == NULL)
6337 return (NULL);
6338
6339 htmlCtxtReset(ctxt);
6340
6341
6342 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6343 if (input == NULL)
6344 return (NULL);
6345 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6346 if (stream == NULL) {
6347 xmlFreeParserInputBuffer(input);
6348 return (NULL);
6349 }
6350 inputPush(ctxt, stream);
6351 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6352}
6353
6354/**
6355 * htmlCtxtReadIO:
6356 * @ctxt: an HTML parser context
6357 * @ioread: an I/O read function
6358 * @ioclose: an I/O close function
6359 * @ioctx: an I/O handler
6360 * @URL: the base URL to use for the document
6361 * @encoding: the document encoding, or NULL
6362 * @options: a combination of htmlParserOption(s)
6363 *
6364 * parse an HTML document from I/O functions and source and build a tree.
6365 * This reuses the existing @ctxt parser context
6366 *
6367 * Returns the resulting document tree
6368 */
6369htmlDocPtr
6370htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6371 xmlInputCloseCallback ioclose, void *ioctx,
6372 const char *URL,
6373 const char *encoding, int options)
6374{
6375 xmlParserInputBufferPtr input;
6376 xmlParserInputPtr stream;
6377
6378 if (ioread == NULL)
6379 return (NULL);
6380 if (ctxt == NULL)
6381 return (NULL);
6382
6383 htmlCtxtReset(ctxt);
6384
6385 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6386 XML_CHAR_ENCODING_NONE);
6387 if (input == NULL)
6388 return (NULL);
6389 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6390 if (stream == NULL) {
6391 xmlFreeParserInputBuffer(input);
6392 return (NULL);
6393 }
6394 inputPush(ctxt, stream);
6395 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6396}
6397
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006398#define bottom_HTMLparser
6399#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006400#endif /* LIBXML_HTML_ENABLED */