blob: 92503a1a4d92a3568c3a1b01b53f7568cda1f031 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the input.  They are private to the parser and
 * every one of them expects a variable named `ctxt` (the parser context)
 * to be in scope.
 *
 * Byte-level macros: no UTF-8 decoding is done, so use them only to
 * compare against ASCII values.
 *
 *   CUR_PTR   pointer to the current xmlChar of the input
 *   CUR       the current xmlChar, as an int
 *   CURRENT   same value as CUR
 *   RAW       the current xmlChar, or -1 while ctxt->token is set
 *   NXT(n)    the xmlChar n positions after the current one
 *   UPPER     the current xmlChar passed through toupper()
 *   UPP(n)    NXT(n) passed through toupper()
 *   SKIP(n)   advance by n xmlChars; only nbChars and the column counter
 *             are updated, so it must not be used to skip a newline
 *
 * Buffer management:
 *
 *   SHRINK    call xmlParserInputShrink() once more than 2 * INPUT_CHUNK
 *             bytes lie before the cursor and fewer than 2 * INPUT_CHUNK
 *             remain after it
 *   GROW      call xmlParserInputGrow() when the parser is not
 *             progressive and fewer than INPUT_CHUNK bytes remain
 *
 * Character-level macros:
 *
 *   NEXT            xmlNextChar(ctxt)
 *   NEXTL(l)        step over the current character of l xmlChars,
 *                   updating line/column and clearing ctxt->token
 *   CUR_CHAR(l)     htmlCurrentChar(ctxt, &l)
 *   CUR_SCHAR(s,l)  xmlStringCurrentChar(ctxt, s, &l)
 *   COPY_BUF(l,b,i,v)  append char v (l bytes long) to buffer b at index i
 *   SKIP_BLANKS     htmlSkipBlankChars(ctxt)
 *
 * SHRINK, GROW and COPY_BUF expand to bare if statements: follow them
 * with a semicolon and do not use them as the unbraced body of an
 * if/else.
 */

#define CUR_PTR ctxt->input->cur

#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from the XML parser */

#define NEXT xmlNextChar(ctxt)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)

290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
Daniel Veillard861101d2007-06-12 08:38:57 +0000404 if (ctxt->input->end - ctxt->input->cur >= 4) {
405 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406 ctxt->input->cur[0], ctxt->input->cur[1],
407 ctxt->input->cur[2], ctxt->input->cur[3]);
408 } else {
409 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412 "Input is not proper UTF-8, indicate encoding !\n",
413 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000414 }
415
416 ctxt->charset = XML_CHAR_ENCODING_8859_1;
417 *len = 1;
418 return((int) *ctxt->input->cur);
419}
420
421/**
Owen Taylor3473f882001-02-23 17:55:21 +0000422 * htmlSkipBlankChars:
423 * @ctxt: the HTML parser context
424 *
425 * skip all blanks character found at that point in the input streams.
426 *
427 * Returns the number of space chars skipped
428 */
429
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000430static int
Owen Taylor3473f882001-02-23 17:55:21 +0000431htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
432 int res = 0;
433
William M. Brack76e95df2003-10-18 16:20:14 +0000434 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000435 if ((*ctxt->input->cur == 0) &&
436 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
437 xmlPopInput(ctxt);
438 } else {
439 if (*(ctxt->input->cur) == '\n') {
440 ctxt->input->line++; ctxt->input->col = 1;
441 } else ctxt->input->col++;
442 ctxt->input->cur++;
443 ctxt->nbChars++;
444 if (*ctxt->input->cur == 0)
445 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
446 }
447 res++;
448 }
449 return(res);
450}
451
452
453
454/************************************************************************
455 * *
456 * The list of HTML elements and their properties *
457 * *
458 ************************************************************************/
459
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000472
473/* Definitions and a couple of vars for HTML Elements */
474
/*
 * Element groups.  Each group macro expands to a comma-separated list of
 * string literals with no trailing comma; the matching NB_* macro is the
 * number of entries.  Groups must be joined with explicit commas:
 * adjacent string literals would otherwise be concatenated by the
 * compiler into a single bogus name (e.g. "small" "em" -> "smallem").
 * PCDATA and MODIFIER are intentionally empty.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of element names for the FLOW and INLINE groups */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
506
507
508/* ... and for HTML Attributes */
509
/*
 * Attribute groups.  Each group macro expands to a comma-separated list of
 * string literals; the matching NB_* macro is the number of entries.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000527
528
529/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000530static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000531 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
532 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000533static const char* const target_attr[] = { "target", NULL } ;
534static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
535static const char* const alt_attr[] = { "alt", NULL } ;
536static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
537static const char* const href_attrs[] = { "href", NULL } ;
538static const char* const clear_attrs[] = { "clear", NULL } ;
539static const char* const inline_p[] = { INLINE, "p", NULL } ;
540
541static const char* const flow_param[] = { FLOW, "param", NULL } ;
542static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000543 "archive", "alt", "name", "height", "width", "align",
544 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000545static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000546 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000547static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000548 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000549static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
550static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
551static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
552static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000553 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000554static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000555 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
556
557
Daniel Veillard065abe82006-07-03 08:55:04 +0000558static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
559static const char* const col_elt[] = { "col", NULL } ;
560static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
561static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
562static const char* const dl_contents[] = { "dt", "dd", NULL } ;
563static const char* const compact_attr[] = { "compact", NULL } ;
564static const char* const label_attr[] = { "label", NULL } ;
565static const char* const fieldset_contents[] = { FLOW, "legend" } ;
566static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
567static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
568static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
569static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
570static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
571static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
572static const char* const head_attrs[] = { I18N, "profile", NULL } ;
573static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
574static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
575static const char* const version_attr[] = { "version", NULL } ;
576static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
577static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
578static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000579static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000580static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
581static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
582static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
583static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
584static const char* const align_attr[] = { "align", NULL } ;
585static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
586static const char* const map_contents[] = { BLOCK, "area", NULL } ;
587static const char* const name_attr[] = { "name", NULL } ;
588static const char* const action_attr[] = { "action", NULL } ;
589static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
590static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
591static const char* const content_attr[] = { "content", NULL } ;
592static const char* const type_attr[] = { "type", NULL } ;
593static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
594static const char* const object_contents[] = { FLOW, "param", NULL } ;
595static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
596static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
597static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
598static const char* const option_elt[] = { "option", NULL } ;
599static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
601static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
602static const char* const width_attr[] = { "width", NULL } ;
603static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605static const char* const language_attr[] = { "language", NULL } ;
606static const char* const select_content[] = { "optgroup", "option", NULL } ;
607static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
609static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
610static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
612static const char* const tr_elt[] = { "tr", NULL } ;
613static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
615static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617static const char* const tr_contents[] = { "th", "td", NULL } ;
618static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619static const char* const li_elt[] = { "li", NULL } ;
620static const char* const ul_depr[] = { "type", "compact", NULL} ;
621static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000622
623#define DECL (const char**)
624
Daniel Veillard22090732001-07-16 00:06:07 +0000625static const htmlElemDesc
626html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000627{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
628 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
629},
630{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632},
633{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
634 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635},
636{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
637 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
638},
639{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
640 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
641},
642{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
644},
645{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
646 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647},
648{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
649 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
650},
651{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
652 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
653},
654{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
656},
657{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
658 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
659},
660{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
661 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
662},
663{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
664 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
665},
666{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
667 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
668},
669{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
670 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
671},
672{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
677},
678{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
679 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
680},
681{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
683},
684{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
685 EMPTY , NULL , DECL col_attrs , NULL, NULL
686},
687{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
688 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
689},
690{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
691 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
692},
693{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
694 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
695},
696{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
697 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
698},
699{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
700 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
701},
702{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
704},
705{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000706 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000707},
708{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
709 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
710},
711{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
712 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
713},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000714{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000715 EMPTY, NULL, DECL embed_attrs, NULL, NULL
716},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000717{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
718 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
719},
720{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
721 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
722},
723{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
724 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
725},
726{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727 EMPTY, NULL, NULL, DECL frame_attrs, NULL
728},
729{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
731},
732{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
742 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
743},
744{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
745 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
746},
747{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
748 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
749},
750{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
751 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
752},
753{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
755},
756{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
757 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
758},
759{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
760 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
761},
762{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
764},
765{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000766 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000767},
768{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
769 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
770},
771{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
772 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
773},
774{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
776},
777{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
779},
780{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
781 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
782},
783{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
785},
786{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
787 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
788},
789{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
791},
792{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000793 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000794},
795{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
796 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
797},
798{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
800},
801{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
803},
804{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805 DECL html_flow, "div", DECL html_attrs, NULL, NULL
806},
807{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
809},
810{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
811 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
812},
813{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000814 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000815},
816{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
818},
819{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000823 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000824},
825{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
827},
828{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
830},
831{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
833},
834{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
838 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
839},
840{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
841 DECL select_content, NULL, DECL select_attrs, NULL, NULL
842},
843{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848},
849{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
850 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851},
852{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
856 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
857},
858{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
859 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
860},
861{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
862 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
863},
864{ "table", 0, 0, 0, 0, 0, 0, 0, "",
865 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
866},
867{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
875},
876{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
877 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
878},
879{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
880 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
881},
882{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
883 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
884},
885{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
886 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
887},
888{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
889 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
890},
891{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893},
894{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
895 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
896},
897{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
898 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
899},
900{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902}
Owen Taylor3473f882001-02-23 17:55:21 +0000903};
904
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups.  The first name of
 * a group is the tag being opened; the names that follow are the open
 * elements it closes.  An extra NULL marks the end of the whole list.
 */
static const char * const htmlStartClose[] = {
    "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head", "p", NULL,
    "title", "p", NULL,
    "body", "head", "style", "link", "title", "p", NULL,
    "frameset", "head", "style", "link", "title", "p", NULL,
    "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr", "p", "head", NULL,
    "h1", "p", "head", NULL,
    "h2", "p", "head", NULL,
    "h3", "p", "head", NULL,
    "h4", "p", "head", NULL,
    "h5", "p", "head", NULL,
    "h6", "p", "head", NULL,
    "dir", "p", "head", NULL,
    "address", "p", "head", "ul", NULL,
    "pre", "p", "head", "ul", NULL,
    "listing", "p", "head", NULL,
    "xmp", "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol", "p", "head", "ul", NULL,
    "menu", "p", "head", "ul", NULL,
    "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div", "p", "head", NULL,
    "noscript", "p", "head", NULL,
    "center", "font", "b", "i", "p", "head", NULL,
    "a", "a", NULL,
    "caption", "p", NULL,
    "colgroup", "caption", "colgroup", "col", "p", NULL,
    "col", "caption", "col", "p", NULL,
    "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead", "caption", "col", "colgroup", NULL,
    "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup", "option", NULL,
    "option", "option", NULL,
    "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL
};
965
/*
 * HTML elements which are not supposed to hold character data directly;
 * htmlCheckParagraph() implies a "p" element when text shows up in one
 * of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
978
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator; htmlIsScriptAttribute() derives the
 * number of entries with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
1004
/*
 * This table is used by the HTML parser to decide what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags:
 * an end tag is only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001032
/* Pointers to the first entry of each htmlStartClose group. */
static const char **htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above is built. */
static int htmlStartCloseIndexinitialized = 0;
1035
1036/************************************************************************
1037 * *
1038 * functions to handle HTML specific data *
1039 * *
1040 ************************************************************************/
1041
1042/**
1043 * htmlInitAutoClose:
1044 *
1045 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046 * This is not reentrant. Call xmlInitParser() once before processing in
1047 * case of use in multithreaded programs.
1048 */
1049void
1050htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001051 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001052
1053 if (htmlStartCloseIndexinitialized) return;
1054
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001055 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1056 indx = 0;
1057 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001058 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001059 while (htmlStartClose[i] != NULL) i++;
1060 i++;
1061 }
1062 htmlStartCloseIndexinitialized = 1;
1063}
1064
1065/**
1066 * htmlTagLookup:
1067 * @tag: The tag name in lowercase
1068 *
1069 * Lookup the HTML tag in the ElementTable
1070 *
1071 * Returns the related htmlElemDescPtr or NULL if not found.
1072 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001073const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001074htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001075 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001076
1077 for (i = 0; i < (sizeof(html40ElementTable) /
1078 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001079 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001080 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001081 }
1082 return(NULL);
1083}
1084
1085/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086 * htmlGetEndPriority:
1087 * @name: The name of the element to look up the priority for.
1088 *
1089 * Return value: The "endtag" priority.
1090 **/
1091static int
1092htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001093 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001094
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 while ((htmlEndPriority[i].name != NULL) &&
1096 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001098
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001099 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001100}
1101
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001102
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001103/**
Owen Taylor3473f882001-02-23 17:55:21 +00001104 * htmlCheckAutoClose:
1105 * @newtag: The new tag name
1106 * @oldtag: The old tag name
1107 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001108 * Checks whether the new tag is one of the registered valid tags for
1109 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001110 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1111 *
1112 * Returns 0 if no, 1 if yes.
1113 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001114static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001115htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1116{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 int i, indx;
1118 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001119
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001120 if (htmlStartCloseIndexinitialized == 0)
1121 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001122
1123 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001124 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001125 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001126 if (closed == NULL)
1127 return (0);
1128 if (xmlStrEqual(BAD_CAST * closed, newtag))
1129 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001130 }
1131
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001132 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001133 i++;
1134 while (htmlStartClose[i] != NULL) {
1135 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001136 return (1);
1137 }
1138 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001139 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001140 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001141}
1142
1143/**
1144 * htmlAutoCloseOnClose:
1145 * @ctxt: an HTML parser context
1146 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001147 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001148 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001149 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001150 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001151static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001152htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153{
1154 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001155 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001156
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001157 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001158
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001159 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001160
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001161 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162 break;
1163 /*
1164 * A missplaced endtag can only close elements with lower
1165 * or equal priority, so if we find an element with higher
1166 * priority before we find an element with
1167 * matching name, we just ignore this endtag
1168 */
1169 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001171 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001172 if (i < 0)
1173 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001174
1175 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001176 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001177 if ((info != NULL) && (info->endTag == 3)) {
1178 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001180 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001181 }
1182 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001184 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001185 }
1186}
1187
1188/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 * htmlAutoCloseOnEnd:
1190 * @ctxt: an HTML parser context
1191 *
1192 * Close all remaining tags at the end of the stream
1193 */
1194static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001195htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1196{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001198
William M. Brack899e64a2003-09-26 18:03:42 +00001199 if (ctxt->nameNr == 0)
1200 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001201 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001202 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001204 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001205 }
1206}
1207
1208/**
Owen Taylor3473f882001-02-23 17:55:21 +00001209 * htmlAutoClose:
1210 * @ctxt: an HTML parser context
1211 * @newtag: The new tag name or NULL
1212 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001213 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001214 * The list is kept in htmlStartClose array. This function is
1215 * called when a new tag has been detected and generates the
1216 * appropriates closes if possible/needed.
1217 * If newtag is NULL this mean we are at the end of the resource
1218 * and we should check
1219 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001220static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001221htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001223 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001224 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001225 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001227 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001228 }
1229 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001230 htmlAutoCloseOnEnd(ctxt);
1231 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001232 }
1233 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001234 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1235 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1236 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001237 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001239 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001240 }
Owen Taylor3473f882001-02-23 17:55:21 +00001241}
1242
1243/**
1244 * htmlAutoCloseTag:
1245 * @doc: the HTML document
1246 * @name: The tag name
1247 * @elem: the HTML element
1248 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001249 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001250 * The list is kept in htmlStartClose array. This function checks
1251 * if the element or one of it's children would autoclose the
1252 * given tag.
1253 *
1254 * Returns 1 if autoclose, 0 otherwise
1255 */
1256int
1257htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1258 htmlNodePtr child;
1259
1260 if (elem == NULL) return(1);
1261 if (xmlStrEqual(name, elem->name)) return(0);
1262 if (htmlCheckAutoClose(elem->name, name)) return(1);
1263 child = elem->children;
1264 while (child != NULL) {
1265 if (htmlAutoCloseTag(doc, name, child)) return(1);
1266 child = child->next;
1267 }
1268 return(0);
1269}
1270
1271/**
1272 * htmlIsAutoClosed:
1273 * @doc: the HTML document
1274 * @elem: the HTML element
1275 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001276 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001277 * The list is kept in htmlStartClose array. This function checks
1278 * if a tag is autoclosed by one of it's child
1279 *
1280 * Returns 1 if autoclosed, 0 otherwise
1281 */
1282int
1283htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1284 htmlNodePtr child;
1285
1286 if (elem == NULL) return(1);
1287 child = elem->children;
1288 while (child != NULL) {
1289 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290 child = child->next;
1291 }
1292 return(0);
1293}
1294
1295/**
1296 * htmlCheckImplied:
1297 * @ctxt: an HTML parser context
1298 * @newtag: The new tag name
1299 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001300 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001301 * called when a new tag has been detected and generates the
1302 * appropriates implicit tags if missing
1303 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001304static void
Owen Taylor3473f882001-02-23 17:55:21 +00001305htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306 if (!htmlOmittedDefaultValue)
1307 return;
1308 if (xmlStrEqual(newtag, BAD_CAST"html"))
1309 return;
1310 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001311 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001312 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314 }
1315 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1316 return;
1317 if ((ctxt->nameNr <= 1) &&
1318 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1319 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1320 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1321 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1322 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1323 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1324 /*
1325 * dropped OBJECT ... i you put it first BODY will be
1326 * assumed !
1327 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001328 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001329 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334 int i;
1335 for (i = 0;i < ctxt->nameNr;i++) {
1336 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337 return;
1338 }
1339 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340 return;
1341 }
1342 }
1343
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001344 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001345 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347 }
1348}
1349
1350/**
1351 * htmlCheckParagraph
1352 * @ctxt: an HTML parser context
1353 *
1354 * Check whether a p element need to be implied before inserting
1355 * characters in the current element.
1356 *
1357 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1358 * in case of error.
1359 */
1360
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001361static int
Owen Taylor3473f882001-02-23 17:55:21 +00001362htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1363 const xmlChar *tag;
1364 int i;
1365
1366 if (ctxt == NULL)
1367 return(-1);
1368 tag = ctxt->name;
1369 if (tag == NULL) {
1370 htmlAutoClose(ctxt, BAD_CAST"p");
1371 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001372 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001373 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1375 return(1);
1376 }
1377 if (!htmlOmittedDefaultValue)
1378 return(0);
1379 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001381 htmlAutoClose(ctxt, BAD_CAST"p");
1382 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1386 return(1);
1387 }
1388 }
1389 return(0);
1390}
1391
1392/**
1393 * htmlIsScriptAttribute:
1394 * @name: an attribute name
1395 *
1396 * Check if an attribute is of content type Script
1397 *
1398 * Returns 1 is the attribute is a script 0 otherwise
1399 */
1400int
1401htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001402 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001403
1404 if (name == NULL)
1405 return(0);
1406 /*
1407 * all script attributes start with 'on'
1408 */
1409 if ((name[0] != 'o') || (name[1] != 'n'))
1410 return(0);
1411 for (i = 0;
1412 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413 i++) {
1414 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415 return(1);
1416 }
1417 return(0);
1418}
1419
1420/************************************************************************
1421 * *
1422 * The list of HTML predefined entities *
1423 * *
1424 ************************************************************************/
1425
1426
Daniel Veillard22090732001-07-16 00:06:07 +00001427static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001428/*
1429 * the 4 absolute ones, plus apostrophe.
1430 */
1431{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1432{ 38, "amp", "ampersand, U+0026 ISOnum" },
1433{ 39, "apos", "single quote" },
1434{ 60, "lt", "less-than sign, U+003C ISOnum" },
1435{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1436
1437/*
1438 * A bunch still in the 128-255 range
1439 * Replacing them depend really on the charset used.
1440 */
1441{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1442{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1443{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1444{ 163, "pound","pound sign, U+00A3 ISOnum" },
1445{ 164, "curren","currency sign, U+00A4 ISOnum" },
1446{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1447{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1448{ 167, "sect", "section sign, U+00A7 ISOnum" },
1449{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1450{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1451{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1452{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1453{ 172, "not", "not sign, U+00AC ISOnum" },
1454{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1455{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1456{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1457{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1458{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1459{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1460{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1461{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1462{ 181, "micro","micro sign, U+00B5 ISOnum" },
1463{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1464{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1465{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1466{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1467{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1468{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1469{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1470{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1471{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1472{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1473{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1474{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1475{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1476{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1477{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1478{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1479{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1480{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1481{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1482{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1483{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1484{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1485{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1486{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1487{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1488{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1489{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1490{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1491{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1492{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1493{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1494{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1495{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1496{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1497{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1498{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1499{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1500{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1501{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1502{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1503{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1504{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1505{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1506{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1507{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1508{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1509{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1510{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1511{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1512{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1513{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1514{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1515{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1516{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1517{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1518{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1519{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1520{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1521{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1522{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1523{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1524{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1525{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1526{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1527{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1528{ 247, "divide","division sign, U+00F7 ISOnum" },
1529{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1530{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1531{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1532{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1533{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1534{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1535{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1536{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1537
1538{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1539{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1540{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1541{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1542{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1543
1544/*
1545 * Anything below should really be kept as entities references
1546 */
1547{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1548
1549{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1550{ 732, "tilde","small tilde, U+02DC ISOdia" },
1551
1552{ 913, "Alpha","greek capital letter alpha, U+0391" },
1553{ 914, "Beta", "greek capital letter beta, U+0392" },
1554{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1555{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1556{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1557{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1558{ 919, "Eta", "greek capital letter eta, U+0397" },
1559{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1560{ 921, "Iota", "greek capital letter iota, U+0399" },
1561{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001562{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001563{ 924, "Mu", "greek capital letter mu, U+039C" },
1564{ 925, "Nu", "greek capital letter nu, U+039D" },
1565{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1566{ 927, "Omicron","greek capital letter omicron, U+039F" },
1567{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1568{ 929, "Rho", "greek capital letter rho, U+03A1" },
1569{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1570{ 932, "Tau", "greek capital letter tau, U+03A4" },
1571{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1572{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1573{ 935, "Chi", "greek capital letter chi, U+03A7" },
1574{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1575{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1576
1577{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1578{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1579{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1580{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1581{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1582{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1583{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1584{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1585{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1586{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1587{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1588{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1589{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1590{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1591{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1592{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1593{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1594{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1595{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1596{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1597{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1598{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1599{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1600{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1601{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1602{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1603{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1604{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1605
1606{ 8194, "ensp", "en space, U+2002 ISOpub" },
1607{ 8195, "emsp", "em space, U+2003 ISOpub" },
1608{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1609{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1610{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1611{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1612{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1613{ 8211, "ndash","en dash, U+2013 ISOpub" },
1614{ 8212, "mdash","em dash, U+2014 ISOpub" },
1615{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1616{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1617{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1618{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1619{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1620{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1621{ 8224, "dagger","dagger, U+2020 ISOpub" },
1622{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1623
1624{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1625{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1626
1627{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1628
1629{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1630{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1631
1632{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1633{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1634
1635{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1636{ 8260, "frasl","fraction slash, U+2044 NEW" },
1637
1638{ 8364, "euro", "euro sign, U+20AC NEW" },
1639
1640{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1641{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1642{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1643{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1644{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1645{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1646{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1647{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1648{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1649{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1650{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1651{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1652{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1653{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1654{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1655{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1656
1657{ 8704, "forall","for all, U+2200 ISOtech" },
1658{ 8706, "part", "partial differential, U+2202 ISOtech" },
1659{ 8707, "exist","there exists, U+2203 ISOtech" },
1660{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1661{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1662{ 8712, "isin", "element of, U+2208 ISOtech" },
1663{ 8713, "notin","not an element of, U+2209 ISOtech" },
1664{ 8715, "ni", "contains as member, U+220B ISOtech" },
1665{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001666{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001667{ 8722, "minus","minus sign, U+2212 ISOtech" },
1668{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1669{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1670{ 8733, "prop", "proportional to, U+221D ISOtech" },
1671{ 8734, "infin","infinity, U+221E ISOtech" },
1672{ 8736, "ang", "angle, U+2220 ISOamso" },
1673{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1674{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1675{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1676{ 8746, "cup", "union = cup, U+222A ISOtech" },
1677{ 8747, "int", "integral, U+222B ISOtech" },
1678{ 8756, "there4","therefore, U+2234 ISOtech" },
1679{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1680{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1681{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1682{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1683{ 8801, "equiv","identical to, U+2261 ISOtech" },
1684{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1685{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1686{ 8834, "sub", "subset of, U+2282 ISOtech" },
1687{ 8835, "sup", "superset of, U+2283 ISOtech" },
1688{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1689{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1690{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1691{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1692{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1693{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1694{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1695{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1696{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1697{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1698{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1699{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1700{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1701{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1702
1703{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1704{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1705{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1706{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1707
1708};
1709
1710/************************************************************************
1711 * *
1712 * Commodity functions to handle entities *
1713 * *
1714 ************************************************************************/
1715
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to that many xmlChar.
 * If the reallocation fails, the error is reported through
 * htmlErrMemory() using the `ctxt` variable visible at the expansion
 * site, the old buffer is freed and the ENCLOSING FUNCTION returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1730
1731/**
1732 * htmlEntityLookup:
1733 * @name: the entity name
1734 *
1735 * Lookup the given entity in EntitiesTable
1736 *
1737 * TODO: the linear scan is really ugly, an hash table is really needed.
1738 *
1739 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1740 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001741const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001742htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001743 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001744
1745 for (i = 0;i < (sizeof(html40EntitiesTable)/
1746 sizeof(html40EntitiesTable[0]));i++) {
1747 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001748 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001749 }
1750 }
1751 return(NULL);
1752}
1753
1754/**
1755 * htmlEntityValueLookup:
1756 * @value: the entity's unicode value
1757 *
1758 * Lookup the given entity in EntitiesTable
1759 *
1760 * TODO: the linear scan is really ugly, an hash table is really needed.
1761 *
1762 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1763 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001764const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001765htmlEntityValueLookup(unsigned int value) {
1766 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001767
1768 for (i = 0;i < (sizeof(html40EntitiesTable)/
1769 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001770 if (html40EntitiesTable[i].value >= value) {
1771 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001772 break;
William M. Brack78637da2003-07-31 14:47:38 +00001773 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001774 }
Owen Taylor3473f882001-02-23 17:55:21 +00001775 }
1776 return(NULL);
1777}
1778
1779/**
1780 * UTF8ToHtml:
1781 * @out: a pointer to an array of bytes to store the result
1782 * @outlen: the length of @out
1783 * @in: a pointer to an array of UTF-8 chars
1784 * @inlen: the length of @in
1785 *
1786 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1787 * plus HTML entities block of chars out.
1788 *
1789 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001791 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001792 * The value of @outlen after return is the number of octets consumed.
1793 */
1794int
1795UTF8ToHtml(unsigned char* out, int *outlen,
1796 const unsigned char* in, int *inlen) {
1797 const unsigned char* processed = in;
1798 const unsigned char* outend;
1799 const unsigned char* outstart = out;
1800 const unsigned char* instart = in;
1801 const unsigned char* inend;
1802 unsigned int c, d;
1803 int trailing;
1804
Daniel Veillardce682bc2004-11-05 17:22:25 +00001805 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001806 if (in == NULL) {
1807 /*
1808 * initialization nothing to do
1809 */
1810 *outlen = 0;
1811 *inlen = 0;
1812 return(0);
1813 }
1814 inend = in + (*inlen);
1815 outend = out + (*outlen);
1816 while (in < inend) {
1817 d = *in++;
1818 if (d < 0x80) { c= d; trailing= 0; }
1819 else if (d < 0xC0) {
1820 /* trailing byte in leading position */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1825 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1826 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1827 else {
1828 /* no chance for this in Ascii */
1829 *outlen = out - outstart;
1830 *inlen = processed - instart;
1831 return(-2);
1832 }
1833
1834 if (inend - in < trailing) {
1835 break;
1836 }
1837
1838 for ( ; trailing; trailing--) {
1839 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1840 break;
1841 c <<= 6;
1842 c |= d & 0x3F;
1843 }
1844
1845 /* assertion: c is a single UTF-4 value */
1846 if (c < 0x80) {
1847 if (out + 1 >= outend)
1848 break;
1849 *out++ = c;
1850 } else {
1851 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001852 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001853 const char *cp;
1854 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001855
1856 /*
1857 * Try to lookup a predefined HTML entity for it
1858 */
1859
1860 ent = htmlEntityValueLookup(c);
1861 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001862 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1863 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001864 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001865 else
1866 cp = ent->name;
1867 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001868 if (out + 2 + len >= outend)
1869 break;
1870 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001871 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001872 out += len;
1873 *out++ = ';';
1874 }
1875 processed = in;
1876 }
1877 *outlen = out - outstart;
1878 *inlen = processed - instart;
1879 return(0);
1880}
1881
1882/**
1883 * htmlEncodeEntities:
1884 * @out: a pointer to an array of bytes to store the result
1885 * @outlen: the length of @out
1886 * @in: a pointer to an array of UTF-8 chars
1887 * @inlen: the length of @in
1888 * @quoteChar: the quote character to escape (' or ") or zero.
1889 *
1890 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1891 * plus HTML entities block of chars out.
1892 *
1893 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001895 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001896 * The value of @outlen after return is the number of octets consumed.
1897 */
1898int
1899htmlEncodeEntities(unsigned char* out, int *outlen,
1900 const unsigned char* in, int *inlen, int quoteChar) {
1901 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001902 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001903 const unsigned char* outstart = out;
1904 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001905 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001906 unsigned int c, d;
1907 int trailing;
1908
Daniel Veillardce682bc2004-11-05 17:22:25 +00001909 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1910 return(-1);
1911 outend = out + (*outlen);
1912 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001913 while (in < inend) {
1914 d = *in++;
1915 if (d < 0x80) { c= d; trailing= 0; }
1916 else if (d < 0xC0) {
1917 /* trailing byte in leading position */
1918 *outlen = out - outstart;
1919 *inlen = processed - instart;
1920 return(-2);
1921 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1922 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1923 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1924 else {
1925 /* no chance for this in Ascii */
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(-2);
1929 }
1930
1931 if (inend - in < trailing)
1932 break;
1933
1934 while (trailing--) {
1935 if (((d= *in++) & 0xC0) != 0x80) {
1936 *outlen = out - outstart;
1937 *inlen = processed - instart;
1938 return(-2);
1939 }
1940 c <<= 6;
1941 c |= d & 0x3F;
1942 }
1943
1944 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001945 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001947 if (out >= outend)
1948 break;
1949 *out++ = c;
1950 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001951 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001952 const char *cp;
1953 char nbuf[16];
1954 int len;
1955
1956 /*
1957 * Try to lookup a predefined HTML entity for it
1958 */
1959 ent = htmlEntityValueLookup(c);
1960 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001961 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001962 cp = nbuf;
1963 }
1964 else
1965 cp = ent->name;
1966 len = strlen(cp);
1967 if (out + 2 + len > outend)
1968 break;
1969 *out++ = '&';
1970 memcpy(out, cp, len);
1971 out += len;
1972 *out++ = ';';
1973 }
1974 processed = in;
1975 }
1976 *outlen = out - outstart;
1977 *inlen = processed - instart;
1978 return(0);
1979}
1980
Owen Taylor3473f882001-02-23 17:55:21 +00001981/************************************************************************
1982 * *
1983 * Commodity functions to handle streams *
1984 * *
1985 ************************************************************************/
1986
1987/**
Owen Taylor3473f882001-02-23 17:55:21 +00001988 * htmlNewInputStream:
1989 * @ctxt: an HTML parser context
1990 *
1991 * Create a new input stream structure
1992 * Returns the new input stream or NULL
1993 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001994static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001995htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996 htmlParserInputPtr input;
1997
1998 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002000 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002001 return(NULL);
2002 }
2003 memset(input, 0, sizeof(htmlParserInput));
2004 input->filename = NULL;
2005 input->directory = NULL;
2006 input->base = NULL;
2007 input->cur = NULL;
2008 input->buf = NULL;
2009 input->line = 1;
2010 input->col = 1;
2011 input->buf = NULL;
2012 input->free = NULL;
2013 input->version = NULL;
2014 input->consumed = 0;
2015 input->length = 0;
2016 return(input);
2017}
2018
2019
2020/************************************************************************
2021 * *
2022 * Commodity functions, cleanup needed ? *
2023 * *
2024 ************************************************************************/
/*
 * All tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that would risk a binary
 * incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002039
2040/**
2041 * areBlanks:
2042 * @ctxt: an HTML parser context
2043 * @str: a xmlChar *
2044 * @len: the size of @str
2045 *
2046 * Is this a sequence of blank chars that one can ignore ?
2047 *
2048 * Returns 1 if ignorable 0 otherwise.
2049 */
2050
2051static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002052 unsigned int i;
2053 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002054 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002055 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002056
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002057 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002058 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002059
2060 if (CUR == 0) return(1);
2061 if (CUR != '<') return(0);
2062 if (ctxt->name == NULL)
2063 return(1);
2064 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2065 return(1);
2066 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2067 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002068
2069 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2070 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071 dtd = xmlGetIntSubset(ctxt->myDoc);
2072 if (dtd != NULL && dtd->ExternalID != NULL) {
2073 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2074 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2075 return(1);
2076 }
2077 }
2078
Owen Taylor3473f882001-02-23 17:55:21 +00002079 if (ctxt->node == NULL) return(0);
2080 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002081 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002083 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002084 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002086 /* keep ws in constructs like ...<b> </b>...
2087 for all tags "b" allowing PCDATA */
2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090 return(0);
2091 }
2092 }
Owen Taylor3473f882001-02-23 17:55:21 +00002093 } else if (xmlNodeIsText(lastChild)) {
2094 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002095 } else {
2096 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097 for all tags "p" allowing PCDATA */
2098 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100 return(0);
2101 }
2102 }
Owen Taylor3473f882001-02-23 17:55:21 +00002103 }
2104 return(1);
2105}
2106
2107/**
Owen Taylor3473f882001-02-23 17:55:21 +00002108 * htmlNewDocNoDtD:
2109 * @URI: URI for the dtd, or NULL
2110 * @ExternalID: the external ID of the DTD, or NULL
2111 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002112 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2113 * are NULL
2114 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002115 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002116 */
2117htmlDocPtr
2118htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2119 xmlDocPtr cur;
2120
2121 /*
2122 * Allocate a new document and fill the fields.
2123 */
2124 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2125 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002126 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002127 return(NULL);
2128 }
2129 memset(cur, 0, sizeof(xmlDoc));
2130
2131 cur->type = XML_HTML_DOCUMENT_NODE;
2132 cur->version = NULL;
2133 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002134 cur->doc = cur;
2135 cur->name = NULL;
2136 cur->children = NULL;
2137 cur->extSubset = NULL;
2138 cur->oldNs = NULL;
2139 cur->encoding = NULL;
2140 cur->standalone = 1;
2141 cur->compression = 0;
2142 cur->ids = NULL;
2143 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002144 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002145 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002146 if ((ExternalID != NULL) ||
2147 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002148 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002149 return(cur);
2150}
2151
2152/**
2153 * htmlNewDoc:
2154 * @URI: URI for the dtd, or NULL
2155 * @ExternalID: the external ID of the DTD, or NULL
2156 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002157 * Creates a new HTML document
2158 *
Owen Taylor3473f882001-02-23 17:55:21 +00002159 * Returns a new document
2160 */
2161htmlDocPtr
2162htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2163 if ((URI == NULL) && (ExternalID == NULL))
2164 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002165 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2166 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002167
2168 return(htmlNewDocNoDtD(URI, ExternalID));
2169}
2170
2171
2172/************************************************************************
2173 * *
2174 * The parser itself *
2175 * Relates to http://www.w3.org/TR/html40 *
2176 * *
2177 ************************************************************************/
2178
2179/************************************************************************
2180 * *
2181 * The parser itself *
2182 * *
2183 ************************************************************************/
2184
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002185static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002186
Owen Taylor3473f882001-02-23 17:55:21 +00002187/**
2188 * htmlParseHTMLName:
2189 * @ctxt: an HTML parser context
2190 *
2191 * parse an HTML tag or attribute name, note that we convert it to lowercase
2192 * since HTML names are not case-sensitive.
2193 *
2194 * Returns the Tag Name parsed or NULL
2195 */
2196
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002197static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002198htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002199 int i = 0;
2200 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2201
William M. Brackd1757ab2004-10-02 22:07:48 +00002202 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002203 (CUR != ':')) return(NULL);
2204
2205 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002206 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002207 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2208 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2209 else loc[i] = CUR;
2210 i++;
2211
2212 NEXT;
2213 }
2214
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002215 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002216}
2217
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002218
2219/**
2220 * htmlParseHTMLName_nonInvasive:
2221 * @ctxt: an HTML parser context
2222 *
2223 * parse an HTML tag or attribute name, note that we convert it to lowercase
2224 * since HTML names are not case-sensitive, this doesn't consume the data
2225 * from the stream, it's a look-ahead
2226 *
2227 * Returns the Tag Name parsed or NULL
2228 */
2229
2230static const xmlChar *
2231htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2232 int i = 0;
2233 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2234
2235 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2236 (NXT(1) != ':')) return(NULL);
2237
2238 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2239 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2240 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2241 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2242 else loc[i] = NXT(1+i);
2243 i++;
2244 }
2245
2246 return(xmlDictLookup(ctxt->dict, loc, i));
2247}
2248
2249
Owen Taylor3473f882001-02-23 17:55:21 +00002250/**
2251 * htmlParseName:
2252 * @ctxt: an HTML parser context
2253 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002254 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002255 *
2256 * Returns the Name parsed or NULL
2257 */
2258
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002259static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002260htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002261 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002262 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002263 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002264
2265 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002266
2267 /*
2268 * Accelerator for simple ASCII names
2269 */
2270 in = ctxt->input->cur;
2271 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2272 ((*in >= 0x41) && (*in <= 0x5A)) ||
2273 (*in == '_') || (*in == ':')) {
2274 in++;
2275 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2276 ((*in >= 0x41) && (*in <= 0x5A)) ||
2277 ((*in >= 0x30) && (*in <= 0x39)) ||
2278 (*in == '_') || (*in == '-') ||
2279 (*in == ':') || (*in == '.'))
2280 in++;
2281 if ((*in > 0) && (*in < 0x80)) {
2282 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002283 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002284 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002285 ctxt->nbChars += count;
2286 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002287 return(ret);
2288 }
2289 }
2290 return(htmlParseNameComplex(ctxt));
2291}
2292
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002293static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002294htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002295 int len = 0, l;
2296 int c;
2297 int count = 0;
2298
2299 /*
2300 * Handler for more complex cases
2301 */
2302 GROW;
2303 c = CUR_CHAR(l);
2304 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2305 (!IS_LETTER(c) && (c != '_') &&
2306 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002307 return(NULL);
2308 }
2309
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002310 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2311 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2312 (c == '.') || (c == '-') ||
2313 (c == '_') || (c == ':') ||
2314 (IS_COMBINING(c)) ||
2315 (IS_EXTENDER(c)))) {
2316 if (count++ > 100) {
2317 count = 0;
2318 GROW;
2319 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002320 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002321 NEXTL(l);
2322 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002323 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002324 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002325}
2326
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002327
Owen Taylor3473f882001-02-23 17:55:21 +00002328/**
2329 * htmlParseHTMLAttribute:
2330 * @ctxt: an HTML parser context
2331 * @stop: a char stop value
2332 *
2333 * parse an HTML attribute value till the stop (quote), if
2334 * stop is 0 then it stops at the first space
2335 *
2336 * Returns the attribute parsed or NULL
2337 */
2338
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002339static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002340htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2341 xmlChar *buffer = NULL;
2342 int buffer_size = 0;
2343 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002344 const xmlChar *name = NULL;
2345 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002346 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002347
2348 /*
2349 * allocate a translation buffer.
2350 */
2351 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002352 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002353 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002354 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002355 return(NULL);
2356 }
2357 out = buffer;
2358
2359 /*
2360 * Ok loop until we reach one of the ending chars
2361 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002362 while ((CUR != 0) && (CUR != stop)) {
2363 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002364 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002365 if (CUR == '&') {
2366 if (NXT(1) == '#') {
2367 unsigned int c;
2368 int bits;
2369
2370 c = htmlParseCharRef(ctxt);
2371 if (c < 0x80)
2372 { *out++ = c; bits= -6; }
2373 else if (c < 0x800)
2374 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2375 else if (c < 0x10000)
2376 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2377 else
2378 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2379
2380 for ( ; bits >= 0; bits-= 6) {
2381 *out++ = ((c >> bits) & 0x3F) | 0x80;
2382 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002383
2384 if (out - buffer > buffer_size - 100) {
2385 int indx = out - buffer;
2386
2387 growBuffer(buffer);
2388 out = &buffer[indx];
2389 }
Owen Taylor3473f882001-02-23 17:55:21 +00002390 } else {
2391 ent = htmlParseEntityRef(ctxt, &name);
2392 if (name == NULL) {
2393 *out++ = '&';
2394 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002395 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002396
2397 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002398 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002399 }
2400 } else if (ent == NULL) {
2401 *out++ = '&';
2402 cur = name;
2403 while (*cur != 0) {
2404 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002405 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002406
2407 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002408 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002409 }
2410 *out++ = *cur++;
2411 }
Owen Taylor3473f882001-02-23 17:55:21 +00002412 } else {
2413 unsigned int c;
2414 int bits;
2415
2416 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002417 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002418
2419 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002420 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002421 }
Daniel Veillard48519092006-10-17 15:56:35 +00002422 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002423 if (c < 0x80)
2424 { *out++ = c; bits= -6; }
2425 else if (c < 0x800)
2426 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2427 else if (c < 0x10000)
2428 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2429 else
2430 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2431
2432 for ( ; bits >= 0; bits-= 6) {
2433 *out++ = ((c >> bits) & 0x3F) | 0x80;
2434 }
Owen Taylor3473f882001-02-23 17:55:21 +00002435 }
2436 }
2437 } else {
2438 unsigned int c;
2439 int bits, l;
2440
2441 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002442 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002443
2444 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002445 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002446 }
2447 c = CUR_CHAR(l);
2448 if (c < 0x80)
2449 { *out++ = c; bits= -6; }
2450 else if (c < 0x800)
2451 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2452 else if (c < 0x10000)
2453 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2454 else
2455 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2456
2457 for ( ; bits >= 0; bits-= 6) {
2458 *out++ = ((c >> bits) & 0x3F) | 0x80;
2459 }
2460 NEXT;
2461 }
2462 }
2463 *out++ = 0;
2464 return(buffer);
2465}
2466
2467/**
Owen Taylor3473f882001-02-23 17:55:21 +00002468 * htmlParseEntityRef:
2469 * @ctxt: an HTML parser context
2470 * @str: location to store the entity name
2471 *
2472 * parse an HTML ENTITY references
2473 *
2474 * [68] EntityRef ::= '&' Name ';'
2475 *
2476 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2477 * if non-NULL *str will have to be freed by the caller.
2478 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002479const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002480htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2481 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002482 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002483
2484 if (str != NULL) *str = NULL;
2485 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002486
2487 if (CUR == '&') {
2488 NEXT;
2489 name = htmlParseName(ctxt);
2490 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002491 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2492 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002493 } else {
2494 GROW;
2495 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002496 if (str != NULL)
2497 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002498
2499 /*
2500 * Lookup the entity in the table.
2501 */
2502 ent = htmlEntityLookup(name);
2503 if (ent != NULL) /* OK that's ugly !!! */
2504 NEXT;
2505 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002506 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2507 "htmlParseEntityRef: expecting ';'\n",
2508 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002509 if (str != NULL)
2510 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002511 }
2512 }
2513 }
2514 return(ent);
2515}
2516
2517/**
2518 * htmlParseAttValue:
2519 * @ctxt: an HTML parser context
2520 *
2521 * parse a value for an attribute
2522 * Note: the parser won't do substitution of entities here, this
2523 * will be handled later in xmlStringGetNodeList, unless it was
2524 * asked for ctxt->replaceEntities != 0
2525 *
2526 * Returns the AttValue parsed or NULL.
2527 */
2528
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002529static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002530htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2531 xmlChar *ret = NULL;
2532
2533 if (CUR == '"') {
2534 NEXT;
2535 ret = htmlParseHTMLAttribute(ctxt, '"');
2536 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002537 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2538 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002539 } else
2540 NEXT;
2541 } else if (CUR == '\'') {
2542 NEXT;
2543 ret = htmlParseHTMLAttribute(ctxt, '\'');
2544 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002545 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2546 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002547 } else
2548 NEXT;
2549 } else {
2550 /*
2551 * That's an HTMLism, the attribute value may not be quoted
2552 */
2553 ret = htmlParseHTMLAttribute(ctxt, 0);
2554 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002555 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2556 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002557 }
2558 }
2559 return(ret);
2560}
2561
2562/**
2563 * htmlParseSystemLiteral:
2564 * @ctxt: an HTML parser context
2565 *
2566 * parse an HTML Literal
2567 *
2568 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2569 *
2570 * Returns the SystemLiteral parsed or NULL
2571 */
2572
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002573static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002574htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2575 const xmlChar *q;
2576 xmlChar *ret = NULL;
2577
2578 if (CUR == '"') {
2579 NEXT;
2580 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002581 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002582 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002583 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002584 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2585 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002586 } else {
2587 ret = xmlStrndup(q, CUR_PTR - q);
2588 NEXT;
2589 }
2590 } else if (CUR == '\'') {
2591 NEXT;
2592 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002593 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002594 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002595 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002596 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2597 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002598 } else {
2599 ret = xmlStrndup(q, CUR_PTR - q);
2600 NEXT;
2601 }
2602 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002603 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2604 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002605 }
2606
2607 return(ret);
2608}
2609
2610/**
2611 * htmlParsePubidLiteral:
2612 * @ctxt: an HTML parser context
2613 *
2614 * parse an HTML public literal
2615 *
2616 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2617 *
2618 * Returns the PubidLiteral parsed or NULL.
2619 */
2620
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002621static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002622htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2623 const xmlChar *q;
2624 xmlChar *ret = NULL;
2625 /*
2626 * Name ::= (Letter | '_') (NameChar)*
2627 */
2628 if (CUR == '"') {
2629 NEXT;
2630 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002631 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002632 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002633 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2634 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002635 } else {
2636 ret = xmlStrndup(q, CUR_PTR - q);
2637 NEXT;
2638 }
2639 } else if (CUR == '\'') {
2640 NEXT;
2641 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002642 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002643 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002644 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002645 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2646 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002647 } else {
2648 ret = xmlStrndup(q, CUR_PTR - q);
2649 NEXT;
2650 }
2651 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002652 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2653 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002654 }
2655
2656 return(ret);
2657}
2658
2659/**
2660 * htmlParseScript:
2661 * @ctxt: an HTML parser context
2662 *
2663 * parse the content of an HTML SCRIPT or STYLE element
2664 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2665 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2666 * http://www.w3.org/TR/html4/types.html#type-script
2667 * http://www.w3.org/TR/html4/types.html#h-6.15
2668 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2669 *
2670 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2671 * element and the value of intrinsic event attributes. User agents must
2672 * not evaluate script data as HTML markup but instead must pass it on as
2673 * data to a script engine.
2674 * NOTES:
2675 * - The content is passed like CDATA
2676 * - the attributes for style and scripting "onXXX" are also described
2677 * as CDATA but SGML allows entities references in attributes so their
2678 * processing is identical as other attributes
2679 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002680static void
Owen Taylor3473f882001-02-23 17:55:21 +00002681htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002682 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002683 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002684 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002685
2686 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002687 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002688 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002689 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002690 /*
2691 * One should break here, the specification is clear:
2692 * Authors should therefore escape "</" within the content.
2693 * Escape mechanisms are specific to each scripting or
2694 * style sheet language.
2695 *
2696 * In recovery mode, only break if end tag match the
2697 * current tag, effectively ignoring all tags inside the
2698 * script/style block and treating the entire block as
2699 * CDATA.
2700 */
2701 if (ctxt->recovery) {
2702 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2703 xmlStrlen(ctxt->name)) == 0)
2704 {
2705 break; /* while */
2706 } else {
2707 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002708 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002709 ctxt->name, NULL);
2710 }
2711 } else {
2712 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2713 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2714 {
2715 break; /* while */
2716 }
2717 }
Owen Taylor3473f882001-02-23 17:55:21 +00002718 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002719 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002720 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2721 if (ctxt->sax->cdataBlock!= NULL) {
2722 /*
2723 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2724 */
2725 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002726 } else if (ctxt->sax->characters != NULL) {
2727 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002728 }
2729 nbchar = 0;
2730 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002731 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002732 NEXTL(l);
2733 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002734 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002735
Daniel Veillard68716a72006-10-16 09:32:17 +00002736 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002737 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2738 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002739 NEXT;
2740 }
2741
2742 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2743 if (ctxt->sax->cdataBlock!= NULL) {
2744 /*
2745 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2746 */
2747 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002748 } else if (ctxt->sax->characters != NULL) {
2749 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002750 }
2751 }
2752}
2753
2754
2755/**
2756 * htmlParseCharData:
2757 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002758 *
2759 * parse a CharData section.
2760 * if we are within a CDATA section ']]>' marks an end of section.
2761 *
2762 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2763 */
2764
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002765static void
2766htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002767 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2768 int nbchar = 0;
2769 int cur, l;
2770
2771 SHRINK;
2772 cur = CUR_CHAR(l);
2773 while (((cur != '<') || (ctxt->token == '<')) &&
2774 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002775 (cur != 0)) {
2776 if (!(IS_CHAR(cur))) {
2777 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2778 "Invalid char in CDATA 0x%X\n", cur);
2779 } else {
2780 COPY_BUF(l,buf,nbchar,cur);
2781 }
Owen Taylor3473f882001-02-23 17:55:21 +00002782 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2783 /*
2784 * Ok the segment is to be consumed as chars.
2785 */
2786 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2787 if (areBlanks(ctxt, buf, nbchar)) {
2788 if (ctxt->sax->ignorableWhitespace != NULL)
2789 ctxt->sax->ignorableWhitespace(ctxt->userData,
2790 buf, nbchar);
2791 } else {
2792 htmlCheckParagraph(ctxt);
2793 if (ctxt->sax->characters != NULL)
2794 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2795 }
2796 }
2797 nbchar = 0;
2798 }
2799 NEXTL(l);
2800 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002801 if (cur == 0) {
2802 SHRINK;
2803 GROW;
2804 cur = CUR_CHAR(l);
2805 }
Owen Taylor3473f882001-02-23 17:55:21 +00002806 }
2807 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002808 buf[nbchar] = 0;
2809
Owen Taylor3473f882001-02-23 17:55:21 +00002810 /*
2811 * Ok the segment is to be consumed as chars.
2812 */
2813 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2814 if (areBlanks(ctxt, buf, nbchar)) {
2815 if (ctxt->sax->ignorableWhitespace != NULL)
2816 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2817 } else {
2818 htmlCheckParagraph(ctxt);
2819 if (ctxt->sax->characters != NULL)
2820 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2821 }
2822 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002823 } else {
2824 /*
2825 * Loop detection
2826 */
2827 if (cur == 0)
2828 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002829 }
2830}
2831
2832/**
2833 * htmlParseExternalID:
2834 * @ctxt: an HTML parser context
2835 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002836 *
2837 * Parse an External ID or a Public ID
2838 *
Owen Taylor3473f882001-02-23 17:55:21 +00002839 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2840 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2841 *
2842 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2843 *
2844 * Returns the function returns SystemLiteral and in the second
2845 * case publicID receives PubidLiteral, is strict is off
2846 * it is possible to return NULL and have publicID set.
2847 */
2848
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002849static xmlChar *
2850htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002851 xmlChar *URI = NULL;
2852
2853 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2854 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2855 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2856 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002857 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002858 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2859 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002860 }
2861 SKIP_BLANKS;
2862 URI = htmlParseSystemLiteral(ctxt);
2863 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002864 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2865 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002866 }
2867 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2868 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2869 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2870 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002871 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002872 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2873 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002874 }
2875 SKIP_BLANKS;
2876 *publicID = htmlParsePubidLiteral(ctxt);
2877 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002878 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2879 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2880 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002881 }
2882 SKIP_BLANKS;
2883 if ((CUR == '"') || (CUR == '\'')) {
2884 URI = htmlParseSystemLiteral(ctxt);
2885 }
2886 }
2887 return(URI);
2888}
2889
2890/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002891 * xmlParsePI:
2892 * @ctxt: an XML parser context
2893 *
2894 * parse an XML Processing Instruction.
2895 *
2896 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2897 */
2898static void
2899htmlParsePI(htmlParserCtxtPtr ctxt) {
2900 xmlChar *buf = NULL;
2901 int len = 0;
2902 int size = HTML_PARSER_BUFFER_SIZE;
2903 int cur, l;
2904 const xmlChar *target;
2905 xmlParserInputState state;
2906 int count = 0;
2907
2908 if ((RAW == '<') && (NXT(1) == '?')) {
2909 state = ctxt->instate;
2910 ctxt->instate = XML_PARSER_PI;
2911 /*
2912 * this is a Processing Instruction.
2913 */
2914 SKIP(2);
2915 SHRINK;
2916
2917 /*
2918 * Parse the target name and check for special support like
2919 * namespace.
2920 */
2921 target = htmlParseName(ctxt);
2922 if (target != NULL) {
2923 if (RAW == '>') {
2924 SKIP(1);
2925
2926 /*
2927 * SAX: PI detected.
2928 */
2929 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2930 (ctxt->sax->processingInstruction != NULL))
2931 ctxt->sax->processingInstruction(ctxt->userData,
2932 target, NULL);
2933 ctxt->instate = state;
2934 return;
2935 }
2936 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2937 if (buf == NULL) {
2938 htmlErrMemory(ctxt, NULL);
2939 ctxt->instate = state;
2940 return;
2941 }
2942 cur = CUR;
2943 if (!IS_BLANK(cur)) {
2944 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2945 "ParsePI: PI %s space expected\n", target, NULL);
2946 }
2947 SKIP_BLANKS;
2948 cur = CUR_CHAR(l);
2949 while (IS_CHAR(cur) && (cur != '>')) {
2950 if (len + 5 >= size) {
2951 xmlChar *tmp;
2952
2953 size *= 2;
2954 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2955 if (tmp == NULL) {
2956 htmlErrMemory(ctxt, NULL);
2957 xmlFree(buf);
2958 ctxt->instate = state;
2959 return;
2960 }
2961 buf = tmp;
2962 }
2963 count++;
2964 if (count > 50) {
2965 GROW;
2966 count = 0;
2967 }
2968 COPY_BUF(l,buf,len,cur);
2969 NEXTL(l);
2970 cur = CUR_CHAR(l);
2971 if (cur == 0) {
2972 SHRINK;
2973 GROW;
2974 cur = CUR_CHAR(l);
2975 }
2976 }
2977 buf[len] = 0;
2978 if (cur != '>') {
2979 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2980 "ParsePI: PI %s never end ...\n", target, NULL);
2981 } else {
2982 SKIP(1);
2983
2984 /*
2985 * SAX: PI detected.
2986 */
2987 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2988 (ctxt->sax->processingInstruction != NULL))
2989 ctxt->sax->processingInstruction(ctxt->userData,
2990 target, buf);
2991 }
2992 xmlFree(buf);
2993 } else {
2994 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2995 "PI is not started correctly", NULL, NULL);
2996 }
2997 ctxt->instate = state;
2998 }
2999}
3000
3001/**
Owen Taylor3473f882001-02-23 17:55:21 +00003002 * htmlParseComment:
3003 * @ctxt: an HTML parser context
3004 *
3005 * Parse an XML (SGML) comment <!-- .... -->
3006 *
3007 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3008 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003009static void
Owen Taylor3473f882001-02-23 17:55:21 +00003010htmlParseComment(htmlParserCtxtPtr ctxt) {
3011 xmlChar *buf = NULL;
3012 int len;
3013 int size = HTML_PARSER_BUFFER_SIZE;
3014 int q, ql;
3015 int r, rl;
3016 int cur, l;
3017 xmlParserInputState state;
3018
3019 /*
3020 * Check that there is a comment right here.
3021 */
3022 if ((RAW != '<') || (NXT(1) != '!') ||
3023 (NXT(2) != '-') || (NXT(3) != '-')) return;
3024
3025 state = ctxt->instate;
3026 ctxt->instate = XML_PARSER_COMMENT;
3027 SHRINK;
3028 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003029 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003030 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003031 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003032 ctxt->instate = state;
3033 return;
3034 }
3035 q = CUR_CHAR(ql);
3036 NEXTL(ql);
3037 r = CUR_CHAR(rl);
3038 NEXTL(rl);
3039 cur = CUR_CHAR(l);
3040 len = 0;
3041 while (IS_CHAR(cur) &&
3042 ((cur != '>') ||
3043 (r != '-') || (q != '-'))) {
3044 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003045 xmlChar *tmp;
3046
Owen Taylor3473f882001-02-23 17:55:21 +00003047 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003048 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3049 if (tmp == NULL) {
3050 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003051 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003052 ctxt->instate = state;
3053 return;
3054 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003055 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003056 }
3057 COPY_BUF(ql,buf,len,q);
3058 q = r;
3059 ql = rl;
3060 r = cur;
3061 rl = l;
3062 NEXTL(l);
3063 cur = CUR_CHAR(l);
3064 if (cur == 0) {
3065 SHRINK;
3066 GROW;
3067 cur = CUR_CHAR(l);
3068 }
3069 }
3070 buf[len] = 0;
3071 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003072 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3073 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003074 xmlFree(buf);
3075 } else {
3076 NEXT;
3077 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3078 (!ctxt->disableSAX))
3079 ctxt->sax->comment(ctxt->userData, buf);
3080 xmlFree(buf);
3081 }
3082 ctxt->instate = state;
3083}
3084
3085/**
3086 * htmlParseCharRef:
3087 * @ctxt: an HTML parser context
3088 *
3089 * parse Reference declarations
3090 *
3091 * [66] CharRef ::= '&#' [0-9]+ ';' |
3092 * '&#x' [0-9a-fA-F]+ ';'
3093 *
3094 * Returns the value parsed (as an int)
3095 */
3096int
3097htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3098 int val = 0;
3099
Daniel Veillarda03e3652004-11-02 18:45:30 +00003100 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3101 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3102 "htmlParseCharRef: context error\n",
3103 NULL, NULL);
3104 return(0);
3105 }
Owen Taylor3473f882001-02-23 17:55:21 +00003106 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003107 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003108 SKIP(3);
3109 while (CUR != ';') {
3110 if ((CUR >= '0') && (CUR <= '9'))
3111 val = val * 16 + (CUR - '0');
3112 else if ((CUR >= 'a') && (CUR <= 'f'))
3113 val = val * 16 + (CUR - 'a') + 10;
3114 else if ((CUR >= 'A') && (CUR <= 'F'))
3115 val = val * 16 + (CUR - 'A') + 10;
3116 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003117 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003118 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003119 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003120 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003121 }
3122 NEXT;
3123 }
3124 if (CUR == ';')
3125 NEXT;
3126 } else if ((CUR == '&') && (NXT(1) == '#')) {
3127 SKIP(2);
3128 while (CUR != ';') {
3129 if ((CUR >= '0') && (CUR <= '9'))
3130 val = val * 10 + (CUR - '0');
3131 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003132 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003133 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003134 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003135 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003136 }
3137 NEXT;
3138 }
3139 if (CUR == ';')
3140 NEXT;
3141 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003142 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3143 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003144 }
3145 /*
3146 * Check the value IS_CHAR ...
3147 */
3148 if (IS_CHAR(val)) {
3149 return(val);
3150 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003151 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3152 "htmlParseCharRef: invalid xmlChar value %d\n",
3153 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003154 }
3155 return(0);
3156}
3157
3158
3159/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003160 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003161 * @ctxt: an HTML parser context
3162 *
3163 * parse a DOCTYPE declaration
3164 *
3165 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3166 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3167 */
3168
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003169static void
Owen Taylor3473f882001-02-23 17:55:21 +00003170htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003171 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003172 xmlChar *ExternalID = NULL;
3173 xmlChar *URI = NULL;
3174
3175 /*
3176 * We know that '<!DOCTYPE' has been detected.
3177 */
3178 SKIP(9);
3179
3180 SKIP_BLANKS;
3181
3182 /*
3183 * Parse the DOCTYPE name.
3184 */
3185 name = htmlParseName(ctxt);
3186 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003187 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3188 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3189 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003190 }
3191 /*
3192 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3193 */
3194
3195 SKIP_BLANKS;
3196
3197 /*
3198 * Check for SystemID and ExternalID
3199 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003200 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003201 SKIP_BLANKS;
3202
3203 /*
3204 * We should be at the end of the DOCTYPE declaration.
3205 */
3206 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003207 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3208 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003209 /* We shouldn't try to resynchronize ... */
3210 }
3211 NEXT;
3212
3213 /*
3214 * Create or update the document accordingly to the DOCTYPE
3215 */
3216 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3217 (!ctxt->disableSAX))
3218 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3219
3220 /*
3221 * Cleanup, since we don't use all those identifiers
3222 */
3223 if (URI != NULL) xmlFree(URI);
3224 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003225}
3226
3227/**
3228 * htmlParseAttribute:
3229 * @ctxt: an HTML parser context
3230 * @value: a xmlChar ** used to store the value of the attribute
3231 *
3232 * parse an attribute
3233 *
3234 * [41] Attribute ::= Name Eq AttValue
3235 *
3236 * [25] Eq ::= S? '=' S?
3237 *
3238 * With namespace:
3239 *
3240 * [NS 11] Attribute ::= QName Eq AttValue
3241 *
3242 * Also the case QName == xmlns:??? is handled independently as a namespace
3243 * definition.
3244 *
3245 * Returns the attribute name, and the value in *value.
3246 */
3247
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003248static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003249htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003250 const xmlChar *name;
3251 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003252
3253 *value = NULL;
3254 name = htmlParseHTMLName(ctxt);
3255 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003256 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3257 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003258 return(NULL);
3259 }
3260
3261 /*
3262 * read the value
3263 */
3264 SKIP_BLANKS;
3265 if (CUR == '=') {
3266 NEXT;
3267 SKIP_BLANKS;
3268 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003269 } else if (htmlIsBooleanAttr(name)) {
3270 /*
3271 * assume a minimized attribute
3272 */
3273 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003274 }
3275
3276 *value = val;
3277 return(name);
3278}
3279
3280/**
3281 * htmlCheckEncoding:
3282 * @ctxt: an HTML parser context
3283 * @attvalue: the attribute value
3284 *
3285 * Checks an http-equiv attribute from a Meta tag to detect
3286 * the encoding
3287 * If a new encoding is detected the parser is switched to decode
3288 * it and pass UTF8
3289 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003290static void
Owen Taylor3473f882001-02-23 17:55:21 +00003291htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3292 const xmlChar *encoding;
3293
3294 if ((ctxt == NULL) || (attvalue == NULL))
3295 return;
3296
3297 /* do not change encoding */
3298 if (ctxt->input->encoding != NULL)
3299 return;
3300
3301 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3302 if (encoding != NULL) {
3303 encoding += 8;
3304 } else {
3305 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3306 if (encoding != NULL)
3307 encoding += 9;
3308 }
3309 if (encoding != NULL) {
3310 xmlCharEncoding enc;
3311 xmlCharEncodingHandlerPtr handler;
3312
3313 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3314
3315 if (ctxt->input->encoding != NULL)
3316 xmlFree((xmlChar *) ctxt->input->encoding);
3317 ctxt->input->encoding = xmlStrdup(encoding);
3318
3319 enc = xmlParseCharEncoding((const char *) encoding);
3320 /*
3321 * registered set of known encodings
3322 */
3323 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003324 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3325 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3326 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3327 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3328 (ctxt->input->buf != NULL) &&
3329 (ctxt->input->buf->encoder == NULL)) {
3330 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3331 "htmlCheckEncoding: wrong encoding meta\n",
3332 NULL, NULL);
3333 } else {
3334 xmlSwitchEncoding(ctxt, enc);
3335 }
Owen Taylor3473f882001-02-23 17:55:21 +00003336 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3337 } else {
3338 /*
3339 * fallback for unknown encodings
3340 */
3341 handler = xmlFindCharEncodingHandler((const char *) encoding);
3342 if (handler != NULL) {
3343 xmlSwitchToEncoding(ctxt, handler);
3344 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3345 } else {
3346 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3347 }
3348 }
3349
3350 if ((ctxt->input->buf != NULL) &&
3351 (ctxt->input->buf->encoder != NULL) &&
3352 (ctxt->input->buf->raw != NULL) &&
3353 (ctxt->input->buf->buffer != NULL)) {
3354 int nbchars;
3355 int processed;
3356
3357 /*
3358 * convert as much as possible to the parser reading buffer.
3359 */
3360 processed = ctxt->input->cur - ctxt->input->base;
3361 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3362 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3363 ctxt->input->buf->buffer,
3364 ctxt->input->buf->raw);
3365 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003366 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3367 "htmlCheckEncoding: encoder error\n",
3368 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003369 }
3370 ctxt->input->base =
3371 ctxt->input->cur = ctxt->input->buf->buffer->content;
3372 }
3373 }
3374}
3375
3376/**
3377 * htmlCheckMeta:
3378 * @ctxt: an HTML parser context
3379 * @atts: the attributes values
3380 *
3381 * Checks an attributes from a Meta tag
3382 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003383static void
Owen Taylor3473f882001-02-23 17:55:21 +00003384htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3385 int i;
3386 const xmlChar *att, *value;
3387 int http = 0;
3388 const xmlChar *content = NULL;
3389
3390 if ((ctxt == NULL) || (atts == NULL))
3391 return;
3392
3393 i = 0;
3394 att = atts[i++];
3395 while (att != NULL) {
3396 value = atts[i++];
3397 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3398 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3399 http = 1;
3400 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3401 content = value;
3402 att = atts[i++];
3403 }
3404 if ((http) && (content != NULL))
3405 htmlCheckEncoding(ctxt, content);
3406
3407}
3408
3409/**
3410 * htmlParseStartTag:
3411 * @ctxt: an HTML parser context
3412 *
3413 * parse a start of tag either for rule element or
3414 * EmptyElement. In both case we don't parse the tag closing chars.
3415 *
3416 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3417 *
3418 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3419 *
3420 * With namespace:
3421 *
3422 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3423 *
3424 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3425 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003426 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003427 */
3428
Daniel Veillard597f1c12005-07-03 23:00:18 +00003429static int
Owen Taylor3473f882001-02-23 17:55:21 +00003430htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003431 const xmlChar *name;
3432 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003433 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003434 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003435 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003436 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003437 int meta = 0;
3438 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003439 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003440
Daniel Veillarda03e3652004-11-02 18:45:30 +00003441 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3442 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3443 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003444 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003445 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003446 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003447 NEXT;
3448
Daniel Veillard30e76072006-03-09 14:13:55 +00003449 atts = ctxt->atts;
3450 maxatts = ctxt->maxatts;
3451
Owen Taylor3473f882001-02-23 17:55:21 +00003452 GROW;
3453 name = htmlParseHTMLName(ctxt);
3454 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003455 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3456 "htmlParseStartTag: invalid element name\n",
3457 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003458 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003459 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003460 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003461 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003462 }
3463 if (xmlStrEqual(name, BAD_CAST"meta"))
3464 meta = 1;
3465
3466 /*
3467 * Check for auto-closure of HTML elements.
3468 */
3469 htmlAutoClose(ctxt, name);
3470
3471 /*
3472 * Check for implied HTML elements.
3473 */
3474 htmlCheckImplied(ctxt, name);
3475
3476 /*
3477 * Avoid html at any level > 0, head at any level != 1
3478 * or any attempt to recurse body
3479 */
3480 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003481 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3482 "htmlParseStartTag: misplaced <html> tag\n",
3483 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003484 discardtag = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00003485 }
3486 if ((ctxt->nameNr != 1) &&
3487 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003488 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3489 "htmlParseStartTag: misplaced <head> tag\n",
3490 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003491 discardtag = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00003492 }
3493 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003494 int indx;
3495 for (indx = 0;indx < ctxt->nameNr;indx++) {
3496 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003497 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3498 "htmlParseStartTag: misplaced <body> tag\n",
3499 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003500 discardtag = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00003501 }
3502 }
3503 }
3504
3505 /*
3506 * Now parse the attributes, it ends up with the ending
3507 *
3508 * (S Attribute)* S?
3509 */
3510 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003511 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003512 (CUR != '>') &&
3513 ((CUR != '/') || (NXT(1) != '>'))) {
3514 long cons = ctxt->nbChars;
3515
3516 GROW;
3517 attname = htmlParseAttribute(ctxt, &attvalue);
3518 if (attname != NULL) {
3519
3520 /*
3521 * Well formedness requires at most one declaration of an attribute
3522 */
3523 for (i = 0; i < nbatts;i += 2) {
3524 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003525 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3526 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003527 if (attvalue != NULL)
3528 xmlFree(attvalue);
3529 goto failed;
3530 }
3531 }
3532
3533 /*
3534 * Add the pair to atts
3535 */
3536 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003537 maxatts = 22; /* allow for 10 attrs by default */
3538 atts = (const xmlChar **)
3539 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003540 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003541 htmlErrMemory(ctxt, NULL);
3542 if (attvalue != NULL)
3543 xmlFree(attvalue);
3544 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003545 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003546 ctxt->atts = atts;
3547 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003548 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003549 const xmlChar **n;
3550
Owen Taylor3473f882001-02-23 17:55:21 +00003551 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003552 n = (const xmlChar **) xmlRealloc((void *) atts,
3553 maxatts * sizeof(const xmlChar *));
3554 if (n == NULL) {
3555 htmlErrMemory(ctxt, NULL);
3556 if (attvalue != NULL)
3557 xmlFree(attvalue);
3558 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003559 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003560 atts = n;
3561 ctxt->atts = atts;
3562 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003563 }
3564 atts[nbatts++] = attname;
3565 atts[nbatts++] = attvalue;
3566 atts[nbatts] = NULL;
3567 atts[nbatts + 1] = NULL;
3568 }
3569 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003570 if (attvalue != NULL)
3571 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003572 /* Dump the bogus attribute string up to the next blank or
3573 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003574 while ((IS_CHAR_CH(CUR)) &&
3575 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003576 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003577 NEXT;
3578 }
3579
3580failed:
3581 SKIP_BLANKS;
3582 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003583 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3584 "htmlParseStartTag: problem parsing attributes\n",
3585 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003586 break;
3587 }
3588 }
3589
3590 /*
3591 * Handle specific association to the META tag
3592 */
William M. Bracke978ae22007-03-21 06:16:02 +00003593 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003594 htmlCheckMeta(ctxt, atts);
3595
3596 /*
3597 * SAX: Start of Element !
3598 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003599 if (!discardtag) {
3600 htmlnamePush(ctxt, name);
3601 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3602 if (nbatts != 0)
3603 ctxt->sax->startElement(ctxt->userData, name, atts);
3604 else
3605 ctxt->sax->startElement(ctxt->userData, name, NULL);
3606 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003607 }
Owen Taylor3473f882001-02-23 17:55:21 +00003608
3609 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003610 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003611 if (atts[i] != NULL)
3612 xmlFree((xmlChar *) atts[i]);
3613 }
Owen Taylor3473f882001-02-23 17:55:21 +00003614 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003615
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003616 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003617}
3618
3619/**
3620 * htmlParseEndTag:
3621 * @ctxt: an HTML parser context
3622 *
3623 * parse an end of tag
3624 *
3625 * [42] ETag ::= '</' Name S? '>'
3626 *
3627 * With namespace
3628 *
3629 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003630 *
3631 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003632 */
3633
Daniel Veillardf420ac52001-07-04 16:04:09 +00003634static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003635htmlParseEndTag(htmlParserCtxtPtr ctxt)
3636{
3637 const xmlChar *name;
3638 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003639 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003640
3641 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003642 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3643 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003644 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003645 }
3646 SKIP(2);
3647
3648 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003649 if (name == NULL)
3650 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003651
3652 /*
3653 * We should definitely be at the ending "S? '>'" part
3654 */
3655 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003656 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003657 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3658 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003659 if (ctxt->recovery) {
3660 /*
3661 * We're not at the ending > !!
3662 * Error, unless in recover mode where we search forwards
3663 * until we find a >
3664 */
3665 while (CUR != '\0' && CUR != '>') NEXT;
3666 NEXT;
3667 }
Owen Taylor3473f882001-02-23 17:55:21 +00003668 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003669 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003670
3671 /*
3672 * If the name read is not one of the element in the parsing stack
3673 * then return, it's just an error.
3674 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003675 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3676 if (xmlStrEqual(name, ctxt->nameTab[i]))
3677 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003678 }
3679 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003680 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3681 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003682 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003683 }
3684
3685
3686 /*
3687 * Check for auto-closure of HTML elements.
3688 */
3689
3690 htmlAutoCloseOnClose(ctxt, name);
3691
3692 /*
3693 * Well formedness constraints, opening and closing must match.
3694 * With the exception that the autoclose may have popped stuff out
3695 * of the stack.
3696 */
3697 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003698 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003699 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3700 "Opening and ending tag mismatch: %s and %s\n",
3701 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003702 }
3703 }
3704
3705 /*
3706 * SAX: End of Tag
3707 */
3708 oldname = ctxt->name;
3709 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003710 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3711 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003712 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003713 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003714 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003715 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003716 }
3717
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003718 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003719}
3720
3721
3722/**
3723 * htmlParseReference:
3724 * @ctxt: an HTML parser context
3725 *
3726 * parse and handle entity references in content,
3727 * this will end-up in a call to character() since this is either a
3728 * CharRef, or a predefined entity.
3729 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003730static void
Owen Taylor3473f882001-02-23 17:55:21 +00003731htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003732 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003733 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003734 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003735 if (CUR != '&') return;
3736
3737 if (NXT(1) == '#') {
3738 unsigned int c;
3739 int bits, i = 0;
3740
3741 c = htmlParseCharRef(ctxt);
3742 if (c == 0)
3743 return;
3744
3745 if (c < 0x80) { out[i++]= c; bits= -6; }
3746 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3747 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3748 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3749
3750 for ( ; bits >= 0; bits-= 6) {
3751 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3752 }
3753 out[i] = 0;
3754
3755 htmlCheckParagraph(ctxt);
3756 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3757 ctxt->sax->characters(ctxt->userData, out, i);
3758 } else {
3759 ent = htmlParseEntityRef(ctxt, &name);
3760 if (name == NULL) {
3761 htmlCheckParagraph(ctxt);
3762 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3763 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3764 return;
3765 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003766 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003767 htmlCheckParagraph(ctxt);
3768 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3769 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3770 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3771 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3772 }
3773 } else {
3774 unsigned int c;
3775 int bits, i = 0;
3776
3777 c = ent->value;
3778 if (c < 0x80)
3779 { out[i++]= c; bits= -6; }
3780 else if (c < 0x800)
3781 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3782 else if (c < 0x10000)
3783 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3784 else
3785 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3786
3787 for ( ; bits >= 0; bits-= 6) {
3788 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3789 }
3790 out[i] = 0;
3791
3792 htmlCheckParagraph(ctxt);
3793 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3794 ctxt->sax->characters(ctxt->userData, out, i);
3795 }
Owen Taylor3473f882001-02-23 17:55:21 +00003796 }
3797}
3798
3799/**
3800 * htmlParseContent:
3801 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003802 *
3803 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003804 */
3805
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003806static void
Owen Taylor3473f882001-02-23 17:55:21 +00003807htmlParseContent(htmlParserCtxtPtr ctxt) {
3808 xmlChar *currentNode;
3809 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003810 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003811
3812 currentNode = xmlStrdup(ctxt->name);
3813 depth = ctxt->nameNr;
3814 while (1) {
3815 long cons = ctxt->nbChars;
3816
3817 GROW;
3818 /*
3819 * Our tag or one of it's parent or children is ending.
3820 */
3821 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003822 if (htmlParseEndTag(ctxt) &&
3823 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3824 if (currentNode != NULL)
3825 xmlFree(currentNode);
3826 return;
3827 }
3828 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003829 }
3830
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003831 else if ((CUR == '<') &&
3832 ((IS_ASCII_LETTER(NXT(1))) ||
3833 (NXT(1) == '_') || (NXT(1) == ':'))) {
3834 name = htmlParseHTMLName_nonInvasive(ctxt);
3835 if (name == NULL) {
3836 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3837 "htmlParseStartTag: invalid element name\n",
3838 NULL, NULL);
3839 /* Dump the bogus tag like browsers do */
3840 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3841 NEXT;
3842
3843 if (currentNode != NULL)
3844 xmlFree(currentNode);
3845 return;
3846 }
3847
3848 if (ctxt->name != NULL) {
3849 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3850 htmlAutoClose(ctxt, name);
3851 continue;
3852 }
3853 }
3854 }
3855
Owen Taylor3473f882001-02-23 17:55:21 +00003856 /*
3857 * Has this node been popped out during parsing of
3858 * the next element
3859 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003860 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3861 (!xmlStrEqual(currentNode, ctxt->name)))
3862 {
Owen Taylor3473f882001-02-23 17:55:21 +00003863 if (currentNode != NULL) xmlFree(currentNode);
3864 return;
3865 }
3866
Daniel Veillardf9533d12001-03-03 10:04:57 +00003867 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3868 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003869 /*
3870 * Handle SCRIPT/STYLE separately
3871 */
3872 htmlParseScript(ctxt);
3873 } else {
3874 /*
3875 * Sometimes DOCTYPE arrives in the middle of the document
3876 */
3877 if ((CUR == '<') && (NXT(1) == '!') &&
3878 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3879 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3880 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3881 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003882 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3883 "Misplaced DOCTYPE declaration\n",
3884 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003885 htmlParseDocTypeDecl(ctxt);
3886 }
3887
3888 /*
3889 * First case : a comment
3890 */
3891 if ((CUR == '<') && (NXT(1) == '!') &&
3892 (NXT(2) == '-') && (NXT(3) == '-')) {
3893 htmlParseComment(ctxt);
3894 }
3895
3896 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003897 * Second case : a Processing Instruction.
3898 */
3899 else if ((CUR == '<') && (NXT(1) == '?')) {
3900 htmlParsePI(ctxt);
3901 }
3902
3903 /*
3904 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003905 */
3906 else if (CUR == '<') {
3907 htmlParseElement(ctxt);
3908 }
3909
3910 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003911 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003912 * parsing returns it's Name, create the node
3913 */
3914 else if (CUR == '&') {
3915 htmlParseReference(ctxt);
3916 }
3917
3918 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003919 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003920 */
3921 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003922 htmlAutoCloseOnEnd(ctxt);
3923 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003924 }
3925
3926 /*
3927 * Last case, text. Note that References are handled directly.
3928 */
3929 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003930 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003931 }
3932
3933 if (cons == ctxt->nbChars) {
3934 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003935 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3936 "detected an error in element content\n",
3937 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003938 }
3939 break;
3940 }
3941 }
3942 GROW;
3943 }
3944 if (currentNode != NULL) xmlFree(currentNode);
3945}
3946
3947/**
Daniel Veillard499cc922006-01-18 17:22:35 +00003948 * htmlParseContent:
3949 * @ctxt: an HTML parser context
3950 *
3951 * Parse a content: comment, sub-element, reference or text.
3952 */
3953
3954void
3955__htmlParseContent(void *ctxt) {
3956 if (ctxt != NULL)
3957 htmlParseContent((htmlParserCtxtPtr) ctxt);
3958}
3959
3960/**
Owen Taylor3473f882001-02-23 17:55:21 +00003961 * htmlParseElement:
3962 * @ctxt: an HTML parser context
3963 *
3964 * parse an HTML element, this is highly recursive
3965 *
3966 * [39] element ::= EmptyElemTag | STag content ETag
3967 *
3968 * [41] Attribute ::= Name Eq AttValue
3969 */
3970
3971void
3972htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003973 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003974 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003975 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003976 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003977 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003978 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003979 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003980
Daniel Veillarda03e3652004-11-02 18:45:30 +00003981 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3982 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00003983 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003984 return;
3985 }
Owen Taylor3473f882001-02-23 17:55:21 +00003986 /* Capture start position */
3987 if (ctxt->record_info) {
3988 node_info.begin_pos = ctxt->input->consumed +
3989 (CUR_PTR - ctxt->input->base);
3990 node_info.begin_line = ctxt->input->line;
3991 }
3992
Daniel Veillard597f1c12005-07-03 23:00:18 +00003993 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003994 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003995 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003996 if (CUR == '>')
3997 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003998 return;
3999 }
Owen Taylor3473f882001-02-23 17:55:21 +00004000
4001 /*
4002 * Lookup the info for that element.
4003 */
4004 info = htmlTagLookup(name);
4005 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004006 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4007 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004008 }
4009
4010 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004011 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004012 */
4013 if ((CUR == '/') && (NXT(1) == '>')) {
4014 SKIP(2);
4015 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4016 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004017 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004018 return;
4019 }
4020
4021 if (CUR == '>') {
4022 NEXT;
4023 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004024 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4025 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004026
4027 /*
4028 * end of parsing of this node.
4029 */
4030 if (xmlStrEqual(name, ctxt->name)) {
4031 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004032 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004033 }
4034
4035 /*
4036 * Capture end position and add node
4037 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004038 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004039 node_info.end_pos = ctxt->input->consumed +
4040 (CUR_PTR - ctxt->input->base);
4041 node_info.end_line = ctxt->input->line;
4042 node_info.node = ctxt->node;
4043 xmlParserAddNodeInfo(ctxt, &node_info);
4044 }
4045 return;
4046 }
4047
4048 /*
4049 * Check for an Empty Element from DTD definition
4050 */
4051 if ((info != NULL) && (info->empty)) {
4052 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4053 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004054 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004055 return;
4056 }
4057
4058 /*
4059 * Parse the content of the element:
4060 */
4061 currentNode = xmlStrdup(ctxt->name);
4062 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004063 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004064 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004065 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004066 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004067 if (ctxt->nameNr < depth) break;
4068 }
4069
Owen Taylor3473f882001-02-23 17:55:21 +00004070 /*
4071 * Capture end position and add node
4072 */
4073 if ( currentNode != NULL && ctxt->record_info ) {
4074 node_info.end_pos = ctxt->input->consumed +
4075 (CUR_PTR - ctxt->input->base);
4076 node_info.end_line = ctxt->input->line;
4077 node_info.node = ctxt->node;
4078 xmlParserAddNodeInfo(ctxt, &node_info);
4079 }
William M. Brack76e95df2003-10-18 16:20:14 +00004080 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004081 htmlAutoCloseOnEnd(ctxt);
4082 }
4083
Owen Taylor3473f882001-02-23 17:55:21 +00004084 if (currentNode != NULL)
4085 xmlFree(currentNode);
4086}
4087
4088/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004089 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004090 * @ctxt: an HTML parser context
4091 *
4092 * parse an HTML document (and build a tree if using the standard SAX
4093 * interface).
4094 *
4095 * Returns 0, -1 in case of error. the parser context is augmented
4096 * as a result of the parsing.
4097 */
4098
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004099int
Owen Taylor3473f882001-02-23 17:55:21 +00004100htmlParseDocument(htmlParserCtxtPtr ctxt) {
4101 xmlDtdPtr dtd;
4102
Daniel Veillardd0463562001-10-13 09:15:48 +00004103 xmlInitParser();
4104
Owen Taylor3473f882001-02-23 17:55:21 +00004105 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004106
Daniel Veillarda03e3652004-11-02 18:45:30 +00004107 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4108 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4109 "htmlParseDocument: context error\n", NULL, NULL);
4110 return(XML_ERR_INTERNAL_ERROR);
4111 }
4112 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004113 GROW;
4114 /*
4115 * SAX: beginning of the document processing.
4116 */
4117 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4118 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4119
4120 /*
4121 * Wipe out everything which is before the first '<'
4122 */
4123 SKIP_BLANKS;
4124 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004125 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4126 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004127 }
4128
4129 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4130 ctxt->sax->startDocument(ctxt->userData);
4131
4132
4133 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004134 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004135 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004136 while (((CUR == '<') && (NXT(1) == '!') &&
4137 (NXT(2) == '-') && (NXT(3) == '-')) ||
4138 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004139 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004140 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004141 SKIP_BLANKS;
4142 }
4143
4144
4145 /*
4146 * Then possibly doc type declaration(s) and more Misc
4147 * (doctypedecl Misc*)?
4148 */
4149 if ((CUR == '<') && (NXT(1) == '!') &&
4150 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4151 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4152 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4153 (UPP(8) == 'E')) {
4154 htmlParseDocTypeDecl(ctxt);
4155 }
4156 SKIP_BLANKS;
4157
4158 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004159 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004160 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004161 while (((CUR == '<') && (NXT(1) == '!') &&
4162 (NXT(2) == '-') && (NXT(3) == '-')) ||
4163 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004164 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004165 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004166 SKIP_BLANKS;
4167 }
4168
4169 /*
4170 * Time to start parsing the tree itself
4171 */
4172 htmlParseContent(ctxt);
4173
4174 /*
4175 * autoclose
4176 */
4177 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004178 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004179
4180
4181 /*
4182 * SAX: end of the document processing.
4183 */
4184 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4185 ctxt->sax->endDocument(ctxt->userData);
4186
4187 if (ctxt->myDoc != NULL) {
4188 dtd = xmlGetIntSubset(ctxt->myDoc);
4189 if (dtd == NULL)
4190 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004191 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004192 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4193 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4194 }
4195 if (! ctxt->wellFormed) return(-1);
4196 return(0);
4197}
4198
4199
4200/************************************************************************
4201 * *
4202 * Parser contexts handling *
4203 * *
4204 ************************************************************************/
4205
4206/**
William M. Brackedb65a72004-02-06 07:36:04 +00004207 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004208 * @ctxt: an HTML parser context
4209 *
4210 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004211 *
4212 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004213 */
4214
Daniel Veillardf403d292003-10-05 13:51:35 +00004215static int
Owen Taylor3473f882001-02-23 17:55:21 +00004216htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4217{
4218 htmlSAXHandler *sax;
4219
Daniel Veillardf403d292003-10-05 13:51:35 +00004220 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004221 memset(ctxt, 0, sizeof(htmlParserCtxt));
4222
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004223 ctxt->dict = xmlDictCreate();
4224 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004225 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4226 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004227 }
Owen Taylor3473f882001-02-23 17:55:21 +00004228 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4229 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004230 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4231 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004232 }
4233 else
4234 memset(sax, 0, sizeof(htmlSAXHandler));
4235
4236 /* Allocate the Input stack */
4237 ctxt->inputTab = (htmlParserInputPtr *)
4238 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4239 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004240 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004241 ctxt->inputNr = 0;
4242 ctxt->inputMax = 0;
4243 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004244 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004245 }
4246 ctxt->inputNr = 0;
4247 ctxt->inputMax = 5;
4248 ctxt->input = NULL;
4249 ctxt->version = NULL;
4250 ctxt->encoding = NULL;
4251 ctxt->standalone = -1;
4252 ctxt->instate = XML_PARSER_START;
4253
4254 /* Allocate the Node stack */
4255 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4256 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004257 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004258 ctxt->nodeNr = 0;
4259 ctxt->nodeMax = 0;
4260 ctxt->node = NULL;
4261 ctxt->inputNr = 0;
4262 ctxt->inputMax = 0;
4263 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004264 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004265 }
4266 ctxt->nodeNr = 0;
4267 ctxt->nodeMax = 10;
4268 ctxt->node = NULL;
4269
4270 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004271 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004272 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004273 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004274 ctxt->nameNr = 0;
4275 ctxt->nameMax = 10;
4276 ctxt->name = NULL;
4277 ctxt->nodeNr = 0;
4278 ctxt->nodeMax = 0;
4279 ctxt->node = NULL;
4280 ctxt->inputNr = 0;
4281 ctxt->inputMax = 0;
4282 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004283 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004284 }
4285 ctxt->nameNr = 0;
4286 ctxt->nameMax = 10;
4287 ctxt->name = NULL;
4288
Daniel Veillard092643b2003-09-25 14:29:29 +00004289 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004290 else {
4291 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004292 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004293 }
4294 ctxt->userData = ctxt;
4295 ctxt->myDoc = NULL;
4296 ctxt->wellFormed = 1;
4297 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004298 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004299 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004300 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004301 ctxt->vctxt.userData = ctxt;
4302 ctxt->vctxt.error = xmlParserValidityError;
4303 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004304 ctxt->record_info = 0;
4305 ctxt->validate = 0;
4306 ctxt->nbChars = 0;
4307 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004308 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004309 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004310 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004311}
4312
4313/**
4314 * htmlFreeParserCtxt:
4315 * @ctxt: an HTML parser context
4316 *
4317 * Free all the memory used by a parser context. However the parsed
4318 * document in ctxt->myDoc is not freed.
4319 */
4320
4321void
4322htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4323{
4324 xmlFreeParserCtxt(ctxt);
4325}
4326
4327/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004328 * htmlNewParserCtxt:
4329 *
4330 * Allocate and initialize a new parser context.
4331 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004332 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004333 */
4334
Daniel Veillard34c647c2006-09-21 06:53:59 +00004335htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004336htmlNewParserCtxt(void)
4337{
4338 xmlParserCtxtPtr ctxt;
4339
4340 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4341 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004342 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004343 return(NULL);
4344 }
4345 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004346 if (htmlInitParserCtxt(ctxt) < 0) {
4347 htmlFreeParserCtxt(ctxt);
4348 return(NULL);
4349 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004350 return(ctxt);
4351}
4352
4353/**
4354 * htmlCreateMemoryParserCtxt:
4355 * @buffer: a pointer to a char array
4356 * @size: the size of the array
4357 *
4358 * Create a parser context for an HTML in-memory document.
4359 *
4360 * Returns the new parser context or NULL
4361 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004362htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004363htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4364 xmlParserCtxtPtr ctxt;
4365 xmlParserInputPtr input;
4366 xmlParserInputBufferPtr buf;
4367
4368 if (buffer == NULL)
4369 return(NULL);
4370 if (size <= 0)
4371 return(NULL);
4372
4373 ctxt = htmlNewParserCtxt();
4374 if (ctxt == NULL)
4375 return(NULL);
4376
4377 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4378 if (buf == NULL) return(NULL);
4379
4380 input = xmlNewInputStream(ctxt);
4381 if (input == NULL) {
4382 xmlFreeParserCtxt(ctxt);
4383 return(NULL);
4384 }
4385
4386 input->filename = NULL;
4387 input->buf = buf;
4388 input->base = input->buf->buffer->content;
4389 input->cur = input->buf->buffer->content;
4390 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4391
4392 inputPush(ctxt, input);
4393 return(ctxt);
4394}
4395
4396/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004397 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004398 * @cur: a pointer to an array of xmlChar
4399 * @encoding: a free form C string describing the HTML document encoding, or NULL
4400 *
4401 * Create a parser context for an HTML document.
4402 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004403 * TODO: check the need to add encoding handling there
4404 *
Owen Taylor3473f882001-02-23 17:55:21 +00004405 * Returns the new parser context or NULL
4406 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004407static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004408htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004409 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004410 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004411
Daniel Veillard1d995272002-07-22 16:43:32 +00004412 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004413 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004414 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004415 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004416 if (ctxt == NULL)
4417 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004418
4419 if (encoding != NULL) {
4420 xmlCharEncoding enc;
4421 xmlCharEncodingHandlerPtr handler;
4422
4423 if (ctxt->input->encoding != NULL)
4424 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004425 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004426
4427 enc = xmlParseCharEncoding(encoding);
4428 /*
4429 * registered set of known encodings
4430 */
4431 if (enc != XML_CHAR_ENCODING_ERROR) {
4432 xmlSwitchEncoding(ctxt, enc);
4433 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004434 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4435 "Unsupported encoding %s\n",
4436 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004437 }
4438 } else {
4439 /*
4440 * fallback for unknown encodings
4441 */
4442 handler = xmlFindCharEncodingHandler((const char *) encoding);
4443 if (handler != NULL) {
4444 xmlSwitchToEncoding(ctxt, handler);
4445 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004446 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4447 "Unsupported encoding %s\n",
4448 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004449 }
4450 }
4451 }
4452 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004453}
4454
Daniel Veillard73b013f2003-09-30 12:36:01 +00004455#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004456/************************************************************************
4457 * *
4458 * Progressive parsing interfaces *
4459 * *
4460 ************************************************************************/
4461
4462/**
4463 * htmlParseLookupSequence:
4464 * @ctxt: an HTML parser context
4465 * @first: the first char to lookup
4466 * @next: the next char to lookup or zero
4467 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004468 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004469 *
4470 * Try to find if a sequence (first, next, third) or just (first next) or
4471 * (first) is available in the input stream.
4472 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4473 * to avoid rescanning sequences of bytes, it DOES change the state of the
4474 * parser, do not use liberally.
4475 * This is basically similar to xmlParseLookupSequence()
4476 *
4477 * Returns the index to the current parsing point if the full sequence
4478 * is available, -1 otherwise.
4479 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004480static int
Owen Taylor3473f882001-02-23 17:55:21 +00004481htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004482 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004483 int base, len;
4484 htmlParserInputPtr in;
4485 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004486 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004487
4488 in = ctxt->input;
4489 if (in == NULL) return(-1);
4490 base = in->cur - in->base;
4491 if (base < 0) return(-1);
4492 if (ctxt->checkIndex > base)
4493 base = ctxt->checkIndex;
4494 if (in->buf == NULL) {
4495 buf = in->base;
4496 len = in->length;
4497 } else {
4498 buf = in->buf->buffer->content;
4499 len = in->buf->buffer->use;
4500 }
4501 /* take into account the sequence length */
4502 if (third) len -= 2;
4503 else if (next) len --;
4504 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004505 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004506 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4507 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4508 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004509 /* do not increment past <! - some people use <!--> */
4510 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004511 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004512 }
4513 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004514 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004515 return(-1);
4516 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4517 (buf[base + 2] == '>')) {
4518 incomment = 0;
4519 base += 2;
4520 }
4521 continue;
4522 }
Owen Taylor3473f882001-02-23 17:55:21 +00004523 if (buf[base] == first) {
4524 if (third != 0) {
4525 if ((buf[base + 1] != next) ||
4526 (buf[base + 2] != third)) continue;
4527 } else if (next != 0) {
4528 if (buf[base + 1] != next) continue;
4529 }
4530 ctxt->checkIndex = 0;
4531#ifdef DEBUG_PUSH
4532 if (next == 0)
4533 xmlGenericError(xmlGenericErrorContext,
4534 "HPP: lookup '%c' found at %d\n",
4535 first, base);
4536 else if (third == 0)
4537 xmlGenericError(xmlGenericErrorContext,
4538 "HPP: lookup '%c%c' found at %d\n",
4539 first, next, base);
4540 else
4541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: lookup '%c%c%c' found at %d\n",
4543 first, next, third, base);
4544#endif
4545 return(base - (in->cur - in->base));
4546 }
4547 }
4548 ctxt->checkIndex = base;
4549#ifdef DEBUG_PUSH
4550 if (next == 0)
4551 xmlGenericError(xmlGenericErrorContext,
4552 "HPP: lookup '%c' failed\n", first);
4553 else if (third == 0)
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: lookup '%c%c' failed\n", first, next);
4556 else
4557 xmlGenericError(xmlGenericErrorContext,
4558 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4559#endif
4560 return(-1);
4561}
4562
4563/**
4564 * htmlParseTryOrFinish:
4565 * @ctxt: an HTML parser context
4566 * @terminate: last chunk indicator
4567 *
4568 * Try to progress on parsing
4569 *
4570 * Returns zero if no parsing was possible
4571 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004572static int
Owen Taylor3473f882001-02-23 17:55:21 +00004573htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4574 int ret = 0;
4575 htmlParserInputPtr in;
4576 int avail = 0;
4577 xmlChar cur, next;
4578
4579#ifdef DEBUG_PUSH
4580 switch (ctxt->instate) {
4581 case XML_PARSER_EOF:
4582 xmlGenericError(xmlGenericErrorContext,
4583 "HPP: try EOF\n"); break;
4584 case XML_PARSER_START:
4585 xmlGenericError(xmlGenericErrorContext,
4586 "HPP: try START\n"); break;
4587 case XML_PARSER_MISC:
4588 xmlGenericError(xmlGenericErrorContext,
4589 "HPP: try MISC\n");break;
4590 case XML_PARSER_COMMENT:
4591 xmlGenericError(xmlGenericErrorContext,
4592 "HPP: try COMMENT\n");break;
4593 case XML_PARSER_PROLOG:
4594 xmlGenericError(xmlGenericErrorContext,
4595 "HPP: try PROLOG\n");break;
4596 case XML_PARSER_START_TAG:
4597 xmlGenericError(xmlGenericErrorContext,
4598 "HPP: try START_TAG\n");break;
4599 case XML_PARSER_CONTENT:
4600 xmlGenericError(xmlGenericErrorContext,
4601 "HPP: try CONTENT\n");break;
4602 case XML_PARSER_CDATA_SECTION:
4603 xmlGenericError(xmlGenericErrorContext,
4604 "HPP: try CDATA_SECTION\n");break;
4605 case XML_PARSER_END_TAG:
4606 xmlGenericError(xmlGenericErrorContext,
4607 "HPP: try END_TAG\n");break;
4608 case XML_PARSER_ENTITY_DECL:
4609 xmlGenericError(xmlGenericErrorContext,
4610 "HPP: try ENTITY_DECL\n");break;
4611 case XML_PARSER_ENTITY_VALUE:
4612 xmlGenericError(xmlGenericErrorContext,
4613 "HPP: try ENTITY_VALUE\n");break;
4614 case XML_PARSER_ATTRIBUTE_VALUE:
4615 xmlGenericError(xmlGenericErrorContext,
4616 "HPP: try ATTRIBUTE_VALUE\n");break;
4617 case XML_PARSER_DTD:
4618 xmlGenericError(xmlGenericErrorContext,
4619 "HPP: try DTD\n");break;
4620 case XML_PARSER_EPILOG:
4621 xmlGenericError(xmlGenericErrorContext,
4622 "HPP: try EPILOG\n");break;
4623 case XML_PARSER_PI:
4624 xmlGenericError(xmlGenericErrorContext,
4625 "HPP: try PI\n");break;
4626 case XML_PARSER_SYSTEM_LITERAL:
4627 xmlGenericError(xmlGenericErrorContext,
4628 "HPP: try SYSTEM_LITERAL\n");break;
4629 }
4630#endif
4631
4632 while (1) {
4633
4634 in = ctxt->input;
4635 if (in == NULL) break;
4636 if (in->buf == NULL)
4637 avail = in->length - (in->cur - in->base);
4638 else
4639 avail = in->buf->buffer->use - (in->cur - in->base);
4640 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004641 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004642 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4643 /*
4644 * SAX: end of the document processing.
4645 */
4646 ctxt->instate = XML_PARSER_EOF;
4647 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4648 ctxt->sax->endDocument(ctxt->userData);
4649 }
4650 }
4651 if (avail < 1)
4652 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004653 cur = in->cur[0];
4654 if (cur == 0) {
4655 SKIP(1);
4656 continue;
4657 }
4658
Owen Taylor3473f882001-02-23 17:55:21 +00004659 switch (ctxt->instate) {
4660 case XML_PARSER_EOF:
4661 /*
4662 * Document parsing is done !
4663 */
4664 goto done;
4665 case XML_PARSER_START:
4666 /*
4667 * Very first chars read from the document flow.
4668 */
4669 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004670 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004671 SKIP_BLANKS;
4672 if (in->buf == NULL)
4673 avail = in->length - (in->cur - in->base);
4674 else
4675 avail = in->buf->buffer->use - (in->cur - in->base);
4676 }
4677 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4678 ctxt->sax->setDocumentLocator(ctxt->userData,
4679 &xmlDefaultSAXLocator);
4680 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4681 (!ctxt->disableSAX))
4682 ctxt->sax->startDocument(ctxt->userData);
4683
4684 cur = in->cur[0];
4685 next = in->cur[1];
4686 if ((cur == '<') && (next == '!') &&
4687 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4688 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4689 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4690 (UPP(8) == 'E')) {
4691 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004692 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004693 goto done;
4694#ifdef DEBUG_PUSH
4695 xmlGenericError(xmlGenericErrorContext,
4696 "HPP: Parsing internal subset\n");
4697#endif
4698 htmlParseDocTypeDecl(ctxt);
4699 ctxt->instate = XML_PARSER_PROLOG;
4700#ifdef DEBUG_PUSH
4701 xmlGenericError(xmlGenericErrorContext,
4702 "HPP: entering PROLOG\n");
4703#endif
4704 } else {
4705 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004706#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004707 xmlGenericError(xmlGenericErrorContext,
4708 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004709#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004710 }
Owen Taylor3473f882001-02-23 17:55:21 +00004711 break;
4712 case XML_PARSER_MISC:
4713 SKIP_BLANKS;
4714 if (in->buf == NULL)
4715 avail = in->length - (in->cur - in->base);
4716 else
4717 avail = in->buf->buffer->use - (in->cur - in->base);
4718 if (avail < 2)
4719 goto done;
4720 cur = in->cur[0];
4721 next = in->cur[1];
4722 if ((cur == '<') && (next == '!') &&
4723 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4724 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004725 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004726 goto done;
4727#ifdef DEBUG_PUSH
4728 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: Parsing Comment\n");
4730#endif
4731 htmlParseComment(ctxt);
4732 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004733 } else if ((cur == '<') && (next == '?')) {
4734 if ((!terminate) &&
4735 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4736 goto done;
4737#ifdef DEBUG_PUSH
4738 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: Parsing PI\n");
4740#endif
4741 htmlParsePI(ctxt);
4742 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004743 } else if ((cur == '<') && (next == '!') &&
4744 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4745 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4746 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4747 (UPP(8) == 'E')) {
4748 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004749 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004750 goto done;
4751#ifdef DEBUG_PUSH
4752 xmlGenericError(xmlGenericErrorContext,
4753 "HPP: Parsing internal subset\n");
4754#endif
4755 htmlParseDocTypeDecl(ctxt);
4756 ctxt->instate = XML_PARSER_PROLOG;
4757#ifdef DEBUG_PUSH
4758 xmlGenericError(xmlGenericErrorContext,
4759 "HPP: entering PROLOG\n");
4760#endif
4761 } else if ((cur == '<') && (next == '!') &&
4762 (avail < 9)) {
4763 goto done;
4764 } else {
4765 ctxt->instate = XML_PARSER_START_TAG;
4766#ifdef DEBUG_PUSH
4767 xmlGenericError(xmlGenericErrorContext,
4768 "HPP: entering START_TAG\n");
4769#endif
4770 }
4771 break;
4772 case XML_PARSER_PROLOG:
4773 SKIP_BLANKS;
4774 if (in->buf == NULL)
4775 avail = in->length - (in->cur - in->base);
4776 else
4777 avail = in->buf->buffer->use - (in->cur - in->base);
4778 if (avail < 2)
4779 goto done;
4780 cur = in->cur[0];
4781 next = in->cur[1];
4782 if ((cur == '<') && (next == '!') &&
4783 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4784 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004785 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004786 goto done;
4787#ifdef DEBUG_PUSH
4788 xmlGenericError(xmlGenericErrorContext,
4789 "HPP: Parsing Comment\n");
4790#endif
4791 htmlParseComment(ctxt);
4792 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004793 } else if ((cur == '<') && (next == '?')) {
4794 if ((!terminate) &&
4795 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4796 goto done;
4797#ifdef DEBUG_PUSH
4798 xmlGenericError(xmlGenericErrorContext,
4799 "HPP: Parsing PI\n");
4800#endif
4801 htmlParsePI(ctxt);
4802 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004803 } else if ((cur == '<') && (next == '!') &&
4804 (avail < 4)) {
4805 goto done;
4806 } else {
4807 ctxt->instate = XML_PARSER_START_TAG;
4808#ifdef DEBUG_PUSH
4809 xmlGenericError(xmlGenericErrorContext,
4810 "HPP: entering START_TAG\n");
4811#endif
4812 }
4813 break;
4814 case XML_PARSER_EPILOG:
4815 if (in->buf == NULL)
4816 avail = in->length - (in->cur - in->base);
4817 else
4818 avail = in->buf->buffer->use - (in->cur - in->base);
4819 if (avail < 1)
4820 goto done;
4821 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004822 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004823 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004824 goto done;
4825 }
4826 if (avail < 2)
4827 goto done;
4828 next = in->cur[1];
4829 if ((cur == '<') && (next == '!') &&
4830 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4831 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004832 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004833 goto done;
4834#ifdef DEBUG_PUSH
4835 xmlGenericError(xmlGenericErrorContext,
4836 "HPP: Parsing Comment\n");
4837#endif
4838 htmlParseComment(ctxt);
4839 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004840 } else if ((cur == '<') && (next == '?')) {
4841 if ((!terminate) &&
4842 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4843 goto done;
4844#ifdef DEBUG_PUSH
4845 xmlGenericError(xmlGenericErrorContext,
4846 "HPP: Parsing PI\n");
4847#endif
4848 htmlParsePI(ctxt);
4849 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004850 } else if ((cur == '<') && (next == '!') &&
4851 (avail < 4)) {
4852 goto done;
4853 } else {
4854 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004855 ctxt->wellFormed = 0;
4856 ctxt->instate = XML_PARSER_EOF;
4857#ifdef DEBUG_PUSH
4858 xmlGenericError(xmlGenericErrorContext,
4859 "HPP: entering EOF\n");
4860#endif
4861 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4862 ctxt->sax->endDocument(ctxt->userData);
4863 goto done;
4864 }
4865 break;
4866 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004867 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004868 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00004869 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004870
4871 if (avail < 2)
4872 goto done;
4873 cur = in->cur[0];
4874 if (cur != '<') {
4875 ctxt->instate = XML_PARSER_CONTENT;
4876#ifdef DEBUG_PUSH
4877 xmlGenericError(xmlGenericErrorContext,
4878 "HPP: entering CONTENT\n");
4879#endif
4880 break;
4881 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004882 if (in->cur[1] == '/') {
4883 ctxt->instate = XML_PARSER_END_TAG;
4884 ctxt->checkIndex = 0;
4885#ifdef DEBUG_PUSH
4886 xmlGenericError(xmlGenericErrorContext,
4887 "HPP: entering END_TAG\n");
4888#endif
4889 break;
4890 }
Owen Taylor3473f882001-02-23 17:55:21 +00004891 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004892 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004893 goto done;
4894
Daniel Veillard597f1c12005-07-03 23:00:18 +00004895 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004896 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004897 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00004898 (name == NULL)) {
4899 if (CUR == '>')
4900 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004901 break;
4902 }
Owen Taylor3473f882001-02-23 17:55:21 +00004903
4904 /*
4905 * Lookup the info for that element.
4906 */
4907 info = htmlTagLookup(name);
4908 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004909 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4910 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004911 }
4912
4913 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004914 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004915 */
4916 if ((CUR == '/') && (NXT(1) == '>')) {
4917 SKIP(2);
4918 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4919 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004920 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004921 ctxt->instate = XML_PARSER_CONTENT;
4922#ifdef DEBUG_PUSH
4923 xmlGenericError(xmlGenericErrorContext,
4924 "HPP: entering CONTENT\n");
4925#endif
4926 break;
4927 }
4928
4929 if (CUR == '>') {
4930 NEXT;
4931 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004932 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4933 "Couldn't find end of Start Tag %s\n",
4934 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004935
4936 /*
4937 * end of parsing of this node.
4938 */
4939 if (xmlStrEqual(name, ctxt->name)) {
4940 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004941 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004942 }
4943
4944 ctxt->instate = XML_PARSER_CONTENT;
4945#ifdef DEBUG_PUSH
4946 xmlGenericError(xmlGenericErrorContext,
4947 "HPP: entering CONTENT\n");
4948#endif
4949 break;
4950 }
4951
4952 /*
4953 * Check for an Empty Element from DTD definition
4954 */
4955 if ((info != NULL) && (info->empty)) {
4956 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4957 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004958 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004959 }
4960 ctxt->instate = XML_PARSER_CONTENT;
4961#ifdef DEBUG_PUSH
4962 xmlGenericError(xmlGenericErrorContext,
4963 "HPP: entering CONTENT\n");
4964#endif
4965 break;
4966 }
4967 case XML_PARSER_CONTENT: {
4968 long cons;
4969 /*
4970 * Handle preparsed entities and charRef
4971 */
4972 if (ctxt->token != 0) {
4973 xmlChar chr[2] = { 0 , 0 } ;
4974
4975 chr[0] = (xmlChar) ctxt->token;
4976 htmlCheckParagraph(ctxt);
4977 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4978 ctxt->sax->characters(ctxt->userData, chr, 1);
4979 ctxt->token = 0;
4980 ctxt->checkIndex = 0;
4981 }
4982 if ((avail == 1) && (terminate)) {
4983 cur = in->cur[0];
4984 if ((cur != '<') && (cur != '&')) {
4985 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004986 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004987 if (ctxt->sax->ignorableWhitespace != NULL)
4988 ctxt->sax->ignorableWhitespace(
4989 ctxt->userData, &cur, 1);
4990 } else {
4991 htmlCheckParagraph(ctxt);
4992 if (ctxt->sax->characters != NULL)
4993 ctxt->sax->characters(
4994 ctxt->userData, &cur, 1);
4995 }
4996 }
4997 ctxt->token = 0;
4998 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004999 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005000 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005001 }
Owen Taylor3473f882001-02-23 17:55:21 +00005002 }
5003 if (avail < 2)
5004 goto done;
5005 cur = in->cur[0];
5006 next = in->cur[1];
5007 cons = ctxt->nbChars;
5008 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5009 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5010 /*
5011 * Handle SCRIPT/STYLE separately
5012 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005013 if (!terminate) {
5014 int idx;
5015 xmlChar val;
5016
5017 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5018 if (idx < 0)
5019 goto done;
5020 val = in->cur[idx + 2];
5021 if (val == 0) /* bad cut of input */
5022 goto done;
5023 }
Owen Taylor3473f882001-02-23 17:55:21 +00005024 htmlParseScript(ctxt);
5025 if ((cur == '<') && (next == '/')) {
5026 ctxt->instate = XML_PARSER_END_TAG;
5027 ctxt->checkIndex = 0;
5028#ifdef DEBUG_PUSH
5029 xmlGenericError(xmlGenericErrorContext,
5030 "HPP: entering END_TAG\n");
5031#endif
5032 break;
5033 }
5034 } else {
5035 /*
5036 * Sometimes DOCTYPE arrives in the middle of the document
5037 */
5038 if ((cur == '<') && (next == '!') &&
5039 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5040 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5041 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5042 (UPP(8) == 'E')) {
5043 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005044 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005045 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005046 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5047 "Misplaced DOCTYPE declaration\n",
5048 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005049 htmlParseDocTypeDecl(ctxt);
5050 } else if ((cur == '<') && (next == '!') &&
5051 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5052 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005053 (htmlParseLookupSequence(
5054 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005055 goto done;
5056#ifdef DEBUG_PUSH
5057 xmlGenericError(xmlGenericErrorContext,
5058 "HPP: Parsing Comment\n");
5059#endif
5060 htmlParseComment(ctxt);
5061 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005062 } else if ((cur == '<') && (next == '?')) {
5063 if ((!terminate) &&
5064 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5065 goto done;
5066#ifdef DEBUG_PUSH
5067 xmlGenericError(xmlGenericErrorContext,
5068 "HPP: Parsing PI\n");
5069#endif
5070 htmlParsePI(ctxt);
5071 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005072 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5073 goto done;
5074 } else if ((cur == '<') && (next == '/')) {
5075 ctxt->instate = XML_PARSER_END_TAG;
5076 ctxt->checkIndex = 0;
5077#ifdef DEBUG_PUSH
5078 xmlGenericError(xmlGenericErrorContext,
5079 "HPP: entering END_TAG\n");
5080#endif
5081 break;
5082 } else if (cur == '<') {
5083 ctxt->instate = XML_PARSER_START_TAG;
5084 ctxt->checkIndex = 0;
5085#ifdef DEBUG_PUSH
5086 xmlGenericError(xmlGenericErrorContext,
5087 "HPP: entering START_TAG\n");
5088#endif
5089 break;
5090 } else if (cur == '&') {
5091 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005092 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005093 goto done;
5094#ifdef DEBUG_PUSH
5095 xmlGenericError(xmlGenericErrorContext,
5096 "HPP: Parsing Reference\n");
5097#endif
5098 /* TODO: check generation of subtrees if noent !!! */
5099 htmlParseReference(ctxt);
5100 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005101 /*
5102 * check that the text sequence is complete
5103 * before handing out the data to the parser
5104 * to avoid problems with erroneous end of
5105 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005106 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005107 if ((!terminate) &&
5108 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5109 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005110 ctxt->checkIndex = 0;
5111#ifdef DEBUG_PUSH
5112 xmlGenericError(xmlGenericErrorContext,
5113 "HPP: Parsing char data\n");
5114#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005115 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005116 }
5117 }
5118 if (cons == ctxt->nbChars) {
5119 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005120 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5121 "detected an error in element content\n",
5122 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005123 }
5124 NEXT;
5125 break;
5126 }
5127
5128 break;
5129 }
5130 case XML_PARSER_END_TAG:
5131 if (avail < 2)
5132 goto done;
5133 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005134 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005135 goto done;
5136 htmlParseEndTag(ctxt);
5137 if (ctxt->nameNr == 0) {
5138 ctxt->instate = XML_PARSER_EPILOG;
5139 } else {
5140 ctxt->instate = XML_PARSER_CONTENT;
5141 }
5142 ctxt->checkIndex = 0;
5143#ifdef DEBUG_PUSH
5144 xmlGenericError(xmlGenericErrorContext,
5145 "HPP: entering CONTENT\n");
5146#endif
5147 break;
5148 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005149 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5150 "HPP: internal error, state == CDATA\n",
5151 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005152 ctxt->instate = XML_PARSER_CONTENT;
5153 ctxt->checkIndex = 0;
5154#ifdef DEBUG_PUSH
5155 xmlGenericError(xmlGenericErrorContext,
5156 "HPP: entering CONTENT\n");
5157#endif
5158 break;
5159 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005160 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5161 "HPP: internal error, state == DTD\n",
5162 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005163 ctxt->instate = XML_PARSER_CONTENT;
5164 ctxt->checkIndex = 0;
5165#ifdef DEBUG_PUSH
5166 xmlGenericError(xmlGenericErrorContext,
5167 "HPP: entering CONTENT\n");
5168#endif
5169 break;
5170 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005171 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5172 "HPP: internal error, state == COMMENT\n",
5173 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005174 ctxt->instate = XML_PARSER_CONTENT;
5175 ctxt->checkIndex = 0;
5176#ifdef DEBUG_PUSH
5177 xmlGenericError(xmlGenericErrorContext,
5178 "HPP: entering CONTENT\n");
5179#endif
5180 break;
5181 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005182 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5183 "HPP: internal error, state == PI\n",
5184 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005185 ctxt->instate = XML_PARSER_CONTENT;
5186 ctxt->checkIndex = 0;
5187#ifdef DEBUG_PUSH
5188 xmlGenericError(xmlGenericErrorContext,
5189 "HPP: entering CONTENT\n");
5190#endif
5191 break;
5192 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005193 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5194 "HPP: internal error, state == ENTITY_DECL\n",
5195 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005196 ctxt->instate = XML_PARSER_CONTENT;
5197 ctxt->checkIndex = 0;
5198#ifdef DEBUG_PUSH
5199 xmlGenericError(xmlGenericErrorContext,
5200 "HPP: entering CONTENT\n");
5201#endif
5202 break;
5203 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005204 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5205 "HPP: internal error, state == ENTITY_VALUE\n",
5206 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005207 ctxt->instate = XML_PARSER_CONTENT;
5208 ctxt->checkIndex = 0;
5209#ifdef DEBUG_PUSH
5210 xmlGenericError(xmlGenericErrorContext,
5211 "HPP: entering DTD\n");
5212#endif
5213 break;
5214 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005215 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5216 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5217 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005218 ctxt->instate = XML_PARSER_START_TAG;
5219 ctxt->checkIndex = 0;
5220#ifdef DEBUG_PUSH
5221 xmlGenericError(xmlGenericErrorContext,
5222 "HPP: entering START_TAG\n");
5223#endif
5224 break;
5225 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005226 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5227 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5228 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005229 ctxt->instate = XML_PARSER_CONTENT;
5230 ctxt->checkIndex = 0;
5231#ifdef DEBUG_PUSH
5232 xmlGenericError(xmlGenericErrorContext,
5233 "HPP: entering CONTENT\n");
5234#endif
5235 break;
5236 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005237 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5238 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5239 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005240 ctxt->instate = XML_PARSER_CONTENT;
5241 ctxt->checkIndex = 0;
5242#ifdef DEBUG_PUSH
5243 xmlGenericError(xmlGenericErrorContext,
5244 "HPP: entering CONTENT\n");
5245#endif
5246 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005247 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005248 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5249 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5250 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005251 ctxt->instate = XML_PARSER_CONTENT;
5252 ctxt->checkIndex = 0;
5253#ifdef DEBUG_PUSH
5254 xmlGenericError(xmlGenericErrorContext,
5255 "HPP: entering CONTENT\n");
5256#endif
5257 break;
5258
Owen Taylor3473f882001-02-23 17:55:21 +00005259 }
5260 }
5261done:
5262 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005263 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005264 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5265 /*
5266 * SAX: end of the document processing.
5267 */
5268 ctxt->instate = XML_PARSER_EOF;
5269 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5270 ctxt->sax->endDocument(ctxt->userData);
5271 }
5272 }
5273 if ((ctxt->myDoc != NULL) &&
5274 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5275 (ctxt->instate == XML_PARSER_EPILOG))) {
5276 xmlDtdPtr dtd;
5277 dtd = xmlGetIntSubset(ctxt->myDoc);
5278 if (dtd == NULL)
5279 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005280 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005281 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5282 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5283 }
5284#ifdef DEBUG_PUSH
5285 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5286#endif
5287 return(ret);
5288}
5289
5290/**
Owen Taylor3473f882001-02-23 17:55:21 +00005291 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005292 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005293 * @chunk: an char array
5294 * @size: the size in byte of the chunk
5295 * @terminate: last chunk indicator
5296 *
5297 * Parse a Chunk of memory
5298 *
5299 * Returns zero if no error, the xmlParserErrors otherwise.
5300 */
5301int
5302htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5303 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005304 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5305 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5306 "htmlParseChunk: context error\n", NULL, NULL);
5307 return(XML_ERR_INTERNAL_ERROR);
5308 }
Owen Taylor3473f882001-02-23 17:55:21 +00005309 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5310 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5311 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5312 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005313 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005314
Daniel Veillardd2755a82005-08-07 23:42:39 +00005315 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5316 if (res < 0) {
5317 ctxt->errNo = XML_PARSER_EOF;
5318 ctxt->disableSAX = 1;
5319 return (XML_PARSER_EOF);
5320 }
Owen Taylor3473f882001-02-23 17:55:21 +00005321 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5322 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005323 ctxt->input->end =
5324 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005325#ifdef DEBUG_PUSH
5326 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5327#endif
5328
Daniel Veillard14f752c2003-08-09 11:44:50 +00005329#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005330 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5331 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005332#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005333 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005334 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5335 xmlParserInputBufferPtr in = ctxt->input->buf;
5336 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5337 (in->raw != NULL)) {
5338 int nbchars;
5339
5340 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5341 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005342 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5343 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005344 return(XML_ERR_INVALID_ENCODING);
5345 }
5346 }
5347 }
Owen Taylor3473f882001-02-23 17:55:21 +00005348 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005349 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005350 if (terminate) {
5351 if ((ctxt->instate != XML_PARSER_EOF) &&
5352 (ctxt->instate != XML_PARSER_EPILOG) &&
5353 (ctxt->instate != XML_PARSER_MISC)) {
5354 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005355 ctxt->wellFormed = 0;
5356 }
5357 if (ctxt->instate != XML_PARSER_EOF) {
5358 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5359 ctxt->sax->endDocument(ctxt->userData);
5360 }
5361 ctxt->instate = XML_PARSER_EOF;
5362 }
5363 return((xmlParserErrors) ctxt->errNo);
5364}
5365
5366/************************************************************************
5367 * *
5368 * User entry points *
5369 * *
5370 ************************************************************************/
5371
5372/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005373 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005374 * @sax: a SAX handler
5375 * @user_data: The user data returned on SAX callbacks
5376 * @chunk: a pointer to an array of chars
5377 * @size: number of chars in the array
5378 * @filename: an optional file name or URI
5379 * @enc: an optional encoding
5380 *
5381 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005382 * The value of @filename is used for fetching external entities
5383 * and error/warning reports.
5384 *
5385 * Returns the new parser context or NULL
5386 */
5387htmlParserCtxtPtr
5388htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5389 const char *chunk, int size, const char *filename,
5390 xmlCharEncoding enc) {
5391 htmlParserCtxtPtr ctxt;
5392 htmlParserInputPtr inputStream;
5393 xmlParserInputBufferPtr buf;
5394
Daniel Veillardd0463562001-10-13 09:15:48 +00005395 xmlInitParser();
5396
Owen Taylor3473f882001-02-23 17:55:21 +00005397 buf = xmlAllocParserInputBuffer(enc);
5398 if (buf == NULL) return(NULL);
5399
Daniel Veillardf403d292003-10-05 13:51:35 +00005400 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005401 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005402 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005403 return(NULL);
5404 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005405 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5406 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005407 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005408 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005409 xmlFree(ctxt->sax);
5410 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5411 if (ctxt->sax == NULL) {
5412 xmlFree(buf);
5413 xmlFree(ctxt);
5414 return(NULL);
5415 }
5416 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5417 if (user_data != NULL)
5418 ctxt->userData = user_data;
5419 }
5420 if (filename == NULL) {
5421 ctxt->directory = NULL;
5422 } else {
5423 ctxt->directory = xmlParserGetDirectory(filename);
5424 }
5425
5426 inputStream = htmlNewInputStream(ctxt);
5427 if (inputStream == NULL) {
5428 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005429 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005430 return(NULL);
5431 }
5432
5433 if (filename == NULL)
5434 inputStream->filename = NULL;
5435 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005436 inputStream->filename = (char *)
5437 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005438 inputStream->buf = buf;
5439 inputStream->base = inputStream->buf->buffer->content;
5440 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005441 inputStream->end =
5442 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005443
5444 inputPush(ctxt, inputStream);
5445
5446 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5447 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005448 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5449 int cur = ctxt->input->cur - ctxt->input->base;
5450
Owen Taylor3473f882001-02-23 17:55:21 +00005451 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005452
5453 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5454 ctxt->input->cur = ctxt->input->base + cur;
5455 ctxt->input->end =
5456 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005457#ifdef DEBUG_PUSH
5458 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5459#endif
5460 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005461 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005462
5463 return(ctxt);
5464}
William M. Brack21e4ef22005-01-02 09:53:13 +00005465#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005466
5467/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005468 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005469 * @cur: a pointer to an array of xmlChar
5470 * @encoding: a free form C string describing the HTML document encoding, or NULL
5471 * @sax: the SAX handler block
5472 * @userData: if using SAX, this pointer will be provided on callbacks.
5473 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005474 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5475 * to handle parse events. If sax is NULL, fallback to the default DOM
5476 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005477 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005478 * Returns the resulting document tree unless SAX is NULL or the document is
5479 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005480 */
5481
5482htmlDocPtr
5483htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5484 htmlDocPtr ret;
5485 htmlParserCtxtPtr ctxt;
5486
Daniel Veillardd0463562001-10-13 09:15:48 +00005487 xmlInitParser();
5488
Owen Taylor3473f882001-02-23 17:55:21 +00005489 if (cur == NULL) return(NULL);
5490
5491
5492 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5493 if (ctxt == NULL) return(NULL);
5494 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005495 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005496 ctxt->sax = sax;
5497 ctxt->userData = userData;
5498 }
5499
5500 htmlParseDocument(ctxt);
5501 ret = ctxt->myDoc;
5502 if (sax != NULL) {
5503 ctxt->sax = NULL;
5504 ctxt->userData = NULL;
5505 }
5506 htmlFreeParserCtxt(ctxt);
5507
5508 return(ret);
5509}
5510
5511/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005512 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005513 * @cur: a pointer to an array of xmlChar
5514 * @encoding: a free form C string describing the HTML document encoding, or NULL
5515 *
5516 * parse an HTML in-memory document and build a tree.
5517 *
5518 * Returns the resulting document tree
5519 */
5520
5521htmlDocPtr
5522htmlParseDoc(xmlChar *cur, const char *encoding) {
5523 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5524}
5525
5526
5527/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005528 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005529 * @filename: the filename
5530 * @encoding: a free form C string describing the HTML document encoding, or NULL
5531 *
5532 * Create a parser context for a file content.
5533 * Automatic support for ZLIB/Compress compressed document is provided
5534 * by default if found at compile-time.
5535 *
5536 * Returns the new parser context or NULL
5537 */
5538htmlParserCtxtPtr
5539htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5540{
5541 htmlParserCtxtPtr ctxt;
5542 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005543 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005544 /* htmlCharEncoding enc; */
5545 xmlChar *content, *content_line = (xmlChar *) "charset=";
5546
Daniel Veillarda03e3652004-11-02 18:45:30 +00005547 if (filename == NULL)
5548 return(NULL);
5549
Daniel Veillardf403d292003-10-05 13:51:35 +00005550 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005551 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005552 return(NULL);
5553 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005554 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5555 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005556#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005557 if (xmlDefaultSAXHandler.error != NULL) {
5558 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5559 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005560#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005561 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005562 return(NULL);
5563 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005564
5565 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5566 xmlFree(canonicFilename);
5567 if (inputStream == NULL) {
5568 xmlFreeParserCtxt(ctxt);
5569 return(NULL);
5570 }
Owen Taylor3473f882001-02-23 17:55:21 +00005571
5572 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005573
Owen Taylor3473f882001-02-23 17:55:21 +00005574 /* set encoding */
5575 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005576 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005577 if (content) {
5578 strcpy ((char *)content, (char *)content_line);
5579 strcat ((char *)content, (char *)encoding);
5580 htmlCheckEncoding (ctxt, content);
5581 xmlFree (content);
5582 }
5583 }
5584
5585 return(ctxt);
5586}
5587
5588/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005589 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005590 * @filename: the filename
5591 * @encoding: a free form C string describing the HTML document encoding, or NULL
5592 * @sax: the SAX handler block
5593 * @userData: if using SAX, this pointer will be provided on callbacks.
5594 *
5595 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5596 * compressed document is provided by default if found at compile-time.
5597 * It use the given SAX function block to handle the parsing callback.
5598 * If sax is NULL, fallback to the default DOM tree building routines.
5599 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005600 * Returns the resulting document tree unless SAX is NULL or the document is
5601 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005602 */
5603
5604htmlDocPtr
5605htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5606 void *userData) {
5607 htmlDocPtr ret;
5608 htmlParserCtxtPtr ctxt;
5609 htmlSAXHandlerPtr oldsax = NULL;
5610
Daniel Veillardd0463562001-10-13 09:15:48 +00005611 xmlInitParser();
5612
Owen Taylor3473f882001-02-23 17:55:21 +00005613 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5614 if (ctxt == NULL) return(NULL);
5615 if (sax != NULL) {
5616 oldsax = ctxt->sax;
5617 ctxt->sax = sax;
5618 ctxt->userData = userData;
5619 }
5620
5621 htmlParseDocument(ctxt);
5622
5623 ret = ctxt->myDoc;
5624 if (sax != NULL) {
5625 ctxt->sax = oldsax;
5626 ctxt->userData = NULL;
5627 }
5628 htmlFreeParserCtxt(ctxt);
5629
5630 return(ret);
5631}
5632
5633/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005634 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005635 * @filename: the filename
5636 * @encoding: a free form C string describing the HTML document encoding, or NULL
5637 *
5638 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5639 * compressed document is provided by default if found at compile-time.
5640 *
5641 * Returns the resulting document tree
5642 */
5643
5644htmlDocPtr
5645htmlParseFile(const char *filename, const char *encoding) {
5646 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5647}
5648
5649/**
5650 * htmlHandleOmittedElem:
5651 * @val: int 0 or 1
5652 *
5653 * Set and return the previous value for handling HTML omitted tags.
5654 *
5655 * Returns the last value for 0 for no handling, 1 for auto insertion.
5656 */
5657
5658int
5659htmlHandleOmittedElem(int val) {
5660 int old = htmlOmittedDefaultValue;
5661
5662 htmlOmittedDefaultValue = val;
5663 return(old);
5664}
5665
Daniel Veillard930dfb62003-02-05 10:17:38 +00005666/**
5667 * htmlElementAllowedHere:
5668 * @parent: HTML parent element
5669 * @elt: HTML element
5670 *
5671 * Checks whether an HTML element may be a direct child of a parent element.
5672 * Note - doesn't check for deprecated elements
5673 *
5674 * Returns 1 if allowed; 0 otherwise.
5675 */
5676int
5677htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5678 const char** p ;
5679
5680 if ( ! elt || ! parent || ! parent->subelts )
5681 return 0 ;
5682
5683 for ( p = parent->subelts; *p; ++p )
5684 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5685 return 1 ;
5686
5687 return 0 ;
5688}
5689/**
5690 * htmlElementStatusHere:
5691 * @parent: HTML parent element
5692 * @elt: HTML element
5693 *
5694 * Checks whether an HTML element may be a direct child of a parent element.
5695 * and if so whether it is valid or deprecated.
5696 *
5697 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5698 */
5699htmlStatus
5700htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5701 if ( ! parent || ! elt )
5702 return HTML_INVALID ;
5703 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5704 return HTML_INVALID ;
5705
5706 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5707}
5708/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005709 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005710 * @elt: HTML element
5711 * @attr: HTML attribute
5712 * @legacy: whether to allow deprecated attributes
5713 *
5714 * Checks whether an attribute is valid for an element
5715 * Has full knowledge of Required and Deprecated attributes
5716 *
5717 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5718 */
5719htmlStatus
5720htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5721 const char** p ;
5722
5723 if ( !elt || ! attr )
5724 return HTML_INVALID ;
5725
5726 if ( elt->attrs_req )
5727 for ( p = elt->attrs_req; *p; ++p)
5728 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5729 return HTML_REQUIRED ;
5730
5731 if ( elt->attrs_opt )
5732 for ( p = elt->attrs_opt; *p; ++p)
5733 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5734 return HTML_VALID ;
5735
5736 if ( legacy && elt->attrs_depr )
5737 for ( p = elt->attrs_depr; *p; ++p)
5738 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5739 return HTML_DEPRECATED ;
5740
5741 return HTML_INVALID ;
5742}
5743/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005744 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005745 * @node: an htmlNodePtr in a tree
5746 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005747 * for Element nodes)
5748 *
5749 * Checks whether the tree node is valid. Experimental (the author
5750 * only uses the HTML enhancements in a SAX parser)
5751 *
5752 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5753 * legacy allowed) or htmlElementStatusHere (otherwise).
5754 * for Attribute nodes, a return from htmlAttrAllowed
5755 * for other nodes, HTML_NA (no checks performed)
5756 */
5757htmlStatus
5758htmlNodeStatus(const htmlNodePtr node, int legacy) {
5759 if ( ! node )
5760 return HTML_INVALID ;
5761
5762 switch ( node->type ) {
5763 case XML_ELEMENT_NODE:
5764 return legacy
5765 ? ( htmlElementAllowedHere (
5766 htmlTagLookup(node->parent->name) , node->name
5767 ) ? HTML_VALID : HTML_INVALID )
5768 : htmlElementStatusHere(
5769 htmlTagLookup(node->parent->name) ,
5770 htmlTagLookup(node->name) )
5771 ;
5772 case XML_ATTRIBUTE_NODE:
5773 return htmlAttrAllowed(
5774 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5775 default: return HTML_NA ;
5776 }
5777}
Daniel Veillard9475a352003-09-26 12:47:50 +00005778/************************************************************************
5779 * *
5780 * New set (2.6.0) of simpler and more flexible APIs *
5781 * *
5782 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. The macro expects a variable named "dict" to be visible
 * where it is expanded; a NULL dict means the string is always freed.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement and must be followed by a semicolon at the call site.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5794
5795/**
5796 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005797 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005798 *
5799 * Reset a parser context
5800 */
5801void
5802htmlCtxtReset(htmlParserCtxtPtr ctxt)
5803{
5804 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005805 xmlDictPtr dict;
5806
5807 if (ctxt == NULL)
5808 return;
5809
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005810 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005811 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005812
5813 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5814 xmlFreeInputStream(input);
5815 }
5816 ctxt->inputNr = 0;
5817 ctxt->input = NULL;
5818
5819 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005820 if (ctxt->spaceTab != NULL) {
5821 ctxt->spaceTab[0] = -1;
5822 ctxt->space = &ctxt->spaceTab[0];
5823 } else {
5824 ctxt->space = NULL;
5825 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005826
5827
5828 ctxt->nodeNr = 0;
5829 ctxt->node = NULL;
5830
5831 ctxt->nameNr = 0;
5832 ctxt->name = NULL;
5833
5834 DICT_FREE(ctxt->version);
5835 ctxt->version = NULL;
5836 DICT_FREE(ctxt->encoding);
5837 ctxt->encoding = NULL;
5838 DICT_FREE(ctxt->directory);
5839 ctxt->directory = NULL;
5840 DICT_FREE(ctxt->extSubURI);
5841 ctxt->extSubURI = NULL;
5842 DICT_FREE(ctxt->extSubSystem);
5843 ctxt->extSubSystem = NULL;
5844 if (ctxt->myDoc != NULL)
5845 xmlFreeDoc(ctxt->myDoc);
5846 ctxt->myDoc = NULL;
5847
5848 ctxt->standalone = -1;
5849 ctxt->hasExternalSubset = 0;
5850 ctxt->hasPErefs = 0;
5851 ctxt->html = 1;
5852 ctxt->external = 0;
5853 ctxt->instate = XML_PARSER_START;
5854 ctxt->token = 0;
5855
5856 ctxt->wellFormed = 1;
5857 ctxt->nsWellFormed = 1;
5858 ctxt->valid = 1;
5859 ctxt->vctxt.userData = ctxt;
5860 ctxt->vctxt.error = xmlParserValidityError;
5861 ctxt->vctxt.warning = xmlParserValidityWarning;
5862 ctxt->record_info = 0;
5863 ctxt->nbChars = 0;
5864 ctxt->checkIndex = 0;
5865 ctxt->inSubset = 0;
5866 ctxt->errNo = XML_ERR_OK;
5867 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00005868 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00005869 ctxt->catalogs = NULL;
5870 xmlInitNodeInfoSeq(&ctxt->node_seq);
5871
5872 if (ctxt->attsDefault != NULL) {
5873 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5874 ctxt->attsDefault = NULL;
5875 }
5876 if (ctxt->attsSpecial != NULL) {
5877 xmlHashFree(ctxt->attsSpecial, NULL);
5878 ctxt->attsSpecial = NULL;
5879 }
5880}
5881
5882/**
5883 * htmlCtxtUseOptions:
5884 * @ctxt: an HTML parser context
5885 * @options: a combination of htmlParserOption(s)
5886 *
5887 * Applies the options to the parser context
5888 *
5889 * Returns 0 in case of success, the set of unknown or unimplemented options
5890 * in case of error.
5891 */
5892int
5893htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5894{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005895 if (ctxt == NULL)
5896 return(-1);
5897
Daniel Veillard9475a352003-09-26 12:47:50 +00005898 if (options & HTML_PARSE_NOWARNING) {
5899 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005900 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005901 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005902 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005903 }
5904 if (options & HTML_PARSE_NOERROR) {
5905 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005906 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005907 ctxt->sax->fatalError = NULL;
5908 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005909 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005910 }
5911 if (options & HTML_PARSE_PEDANTIC) {
5912 ctxt->pedantic = 1;
5913 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005914 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005915 } else
5916 ctxt->pedantic = 0;
5917 if (options & XML_PARSE_NOBLANKS) {
5918 ctxt->keepBlanks = 0;
5919 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5920 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005921 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005922 } else
5923 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005924 if (options & HTML_PARSE_RECOVER) {
5925 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00005926 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005927 } else
5928 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00005929 if (options & HTML_PARSE_COMPACT) {
5930 ctxt->options |= HTML_PARSE_COMPACT;
5931 options -= HTML_PARSE_COMPACT;
5932 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005933 ctxt->dictNames = 0;
5934 return (options);
5935}
5936
5937/**
5938 * htmlDoRead:
5939 * @ctxt: an HTML parser context
5940 * @URL: the base URL to use for the document
5941 * @encoding: the document encoding, or NULL
5942 * @options: a combination of htmlParserOption(s)
5943 * @reuse: keep the context for reuse
5944 *
5945 * Common front-end for the htmlRead functions
5946 *
5947 * Returns the resulting document tree or NULL
5948 */
5949static htmlDocPtr
5950htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5951 int options, int reuse)
5952{
5953 htmlDocPtr ret;
5954
5955 htmlCtxtUseOptions(ctxt, options);
5956 ctxt->html = 1;
5957 if (encoding != NULL) {
5958 xmlCharEncodingHandlerPtr hdlr;
5959
5960 hdlr = xmlFindCharEncodingHandler(encoding);
5961 if (hdlr != NULL)
5962 xmlSwitchToEncoding(ctxt, hdlr);
5963 }
5964 if ((URL != NULL) && (ctxt->input != NULL) &&
5965 (ctxt->input->filename == NULL))
5966 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5967 htmlParseDocument(ctxt);
5968 ret = ctxt->myDoc;
5969 ctxt->myDoc = NULL;
5970 if (!reuse) {
5971 if ((ctxt->dictNames) &&
5972 (ret != NULL) &&
5973 (ret->dict == ctxt->dict))
5974 ctxt->dict = NULL;
5975 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005976 }
5977 return (ret);
5978}
5979
5980/**
5981 * htmlReadDoc:
5982 * @cur: a pointer to a zero terminated string
5983 * @URL: the base URL to use for the document
5984 * @encoding: the document encoding, or NULL
5985 * @options: a combination of htmlParserOption(s)
5986 *
5987 * parse an XML in-memory document and build a tree.
5988 *
5989 * Returns the resulting document tree
5990 */
5991htmlDocPtr
5992htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5993{
5994 htmlParserCtxtPtr ctxt;
5995
5996 if (cur == NULL)
5997 return (NULL);
5998
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005999 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006000 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006001 if (ctxt == NULL)
6002 return (NULL);
6003 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6004}
6005
6006/**
6007 * htmlReadFile:
6008 * @filename: a file or URL
6009 * @encoding: the document encoding, or NULL
6010 * @options: a combination of htmlParserOption(s)
6011 *
6012 * parse an XML file from the filesystem or the network.
6013 *
6014 * Returns the resulting document tree
6015 */
6016htmlDocPtr
6017htmlReadFile(const char *filename, const char *encoding, int options)
6018{
6019 htmlParserCtxtPtr ctxt;
6020
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006021 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006022 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6023 if (ctxt == NULL)
6024 return (NULL);
6025 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6026}
6027
6028/**
6029 * htmlReadMemory:
6030 * @buffer: a pointer to a char array
6031 * @size: the size of the array
6032 * @URL: the base URL to use for the document
6033 * @encoding: the document encoding, or NULL
6034 * @options: a combination of htmlParserOption(s)
6035 *
6036 * parse an XML in-memory document and build a tree.
6037 *
6038 * Returns the resulting document tree
6039 */
6040htmlDocPtr
6041htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6042{
6043 htmlParserCtxtPtr ctxt;
6044
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006045 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006046 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6047 if (ctxt == NULL)
6048 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006049 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006050 if (ctxt->sax != NULL)
6051 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006052 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6053}
6054
6055/**
6056 * htmlReadFd:
6057 * @fd: an open file descriptor
6058 * @URL: the base URL to use for the document
6059 * @encoding: the document encoding, or NULL
6060 * @options: a combination of htmlParserOption(s)
6061 *
6062 * parse an XML from a file descriptor and build a tree.
6063 *
6064 * Returns the resulting document tree
6065 */
6066htmlDocPtr
6067htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6068{
6069 htmlParserCtxtPtr ctxt;
6070 xmlParserInputBufferPtr input;
6071 xmlParserInputPtr stream;
6072
6073 if (fd < 0)
6074 return (NULL);
6075
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006076 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006077 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6078 if (input == NULL)
6079 return (NULL);
6080 ctxt = xmlNewParserCtxt();
6081 if (ctxt == NULL) {
6082 xmlFreeParserInputBuffer(input);
6083 return (NULL);
6084 }
6085 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6086 if (stream == NULL) {
6087 xmlFreeParserInputBuffer(input);
6088 xmlFreeParserCtxt(ctxt);
6089 return (NULL);
6090 }
6091 inputPush(ctxt, stream);
6092 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6093}
6094
6095/**
6096 * htmlReadIO:
6097 * @ioread: an I/O read function
6098 * @ioclose: an I/O close function
6099 * @ioctx: an I/O handler
6100 * @URL: the base URL to use for the document
6101 * @encoding: the document encoding, or NULL
6102 * @options: a combination of htmlParserOption(s)
6103 *
6104 * parse an HTML document from I/O functions and source and build a tree.
6105 *
6106 * Returns the resulting document tree
6107 */
6108htmlDocPtr
6109htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6110 void *ioctx, const char *URL, const char *encoding, int options)
6111{
6112 htmlParserCtxtPtr ctxt;
6113 xmlParserInputBufferPtr input;
6114 xmlParserInputPtr stream;
6115
6116 if (ioread == NULL)
6117 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006118 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006119
6120 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6121 XML_CHAR_ENCODING_NONE);
6122 if (input == NULL)
6123 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006124 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006125 if (ctxt == NULL) {
6126 xmlFreeParserInputBuffer(input);
6127 return (NULL);
6128 }
6129 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6130 if (stream == NULL) {
6131 xmlFreeParserInputBuffer(input);
6132 xmlFreeParserCtxt(ctxt);
6133 return (NULL);
6134 }
6135 inputPush(ctxt, stream);
6136 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6137}
6138
6139/**
6140 * htmlCtxtReadDoc:
6141 * @ctxt: an HTML parser context
6142 * @cur: a pointer to a zero terminated string
6143 * @URL: the base URL to use for the document
6144 * @encoding: the document encoding, or NULL
6145 * @options: a combination of htmlParserOption(s)
6146 *
6147 * parse an XML in-memory document and build a tree.
6148 * This reuses the existing @ctxt parser context
6149 *
6150 * Returns the resulting document tree
6151 */
6152htmlDocPtr
6153htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6154 const char *URL, const char *encoding, int options)
6155{
6156 xmlParserInputPtr stream;
6157
6158 if (cur == NULL)
6159 return (NULL);
6160 if (ctxt == NULL)
6161 return (NULL);
6162
6163 htmlCtxtReset(ctxt);
6164
6165 stream = xmlNewStringInputStream(ctxt, cur);
6166 if (stream == NULL) {
6167 return (NULL);
6168 }
6169 inputPush(ctxt, stream);
6170 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6171}
6172
6173/**
6174 * htmlCtxtReadFile:
6175 * @ctxt: an HTML parser context
6176 * @filename: a file or URL
6177 * @encoding: the document encoding, or NULL
6178 * @options: a combination of htmlParserOption(s)
6179 *
6180 * parse an XML file from the filesystem or the network.
6181 * This reuses the existing @ctxt parser context
6182 *
6183 * Returns the resulting document tree
6184 */
6185htmlDocPtr
6186htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6187 const char *encoding, int options)
6188{
6189 xmlParserInputPtr stream;
6190
6191 if (filename == NULL)
6192 return (NULL);
6193 if (ctxt == NULL)
6194 return (NULL);
6195
6196 htmlCtxtReset(ctxt);
6197
Daniel Veillard29614c72004-11-26 10:47:26 +00006198 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006199 if (stream == NULL) {
6200 return (NULL);
6201 }
6202 inputPush(ctxt, stream);
6203 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6204}
6205
6206/**
6207 * htmlCtxtReadMemory:
6208 * @ctxt: an HTML parser context
6209 * @buffer: a pointer to a char array
6210 * @size: the size of the array
6211 * @URL: the base URL to use for the document
6212 * @encoding: the document encoding, or NULL
6213 * @options: a combination of htmlParserOption(s)
6214 *
6215 * parse an XML in-memory document and build a tree.
6216 * This reuses the existing @ctxt parser context
6217 *
6218 * Returns the resulting document tree
6219 */
6220htmlDocPtr
6221htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6222 const char *URL, const char *encoding, int options)
6223{
6224 xmlParserInputBufferPtr input;
6225 xmlParserInputPtr stream;
6226
6227 if (ctxt == NULL)
6228 return (NULL);
6229 if (buffer == NULL)
6230 return (NULL);
6231
6232 htmlCtxtReset(ctxt);
6233
6234 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6235 if (input == NULL) {
6236 return(NULL);
6237 }
6238
6239 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6240 if (stream == NULL) {
6241 xmlFreeParserInputBuffer(input);
6242 return(NULL);
6243 }
6244
6245 inputPush(ctxt, stream);
6246 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6247}
6248
6249/**
6250 * htmlCtxtReadFd:
6251 * @ctxt: an HTML parser context
6252 * @fd: an open file descriptor
6253 * @URL: the base URL to use for the document
6254 * @encoding: the document encoding, or NULL
6255 * @options: a combination of htmlParserOption(s)
6256 *
6257 * parse an XML from a file descriptor and build a tree.
6258 * This reuses the existing @ctxt parser context
6259 *
6260 * Returns the resulting document tree
6261 */
6262htmlDocPtr
6263htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6264 const char *URL, const char *encoding, int options)
6265{
6266 xmlParserInputBufferPtr input;
6267 xmlParserInputPtr stream;
6268
6269 if (fd < 0)
6270 return (NULL);
6271 if (ctxt == NULL)
6272 return (NULL);
6273
6274 htmlCtxtReset(ctxt);
6275
6276
6277 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6278 if (input == NULL)
6279 return (NULL);
6280 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6281 if (stream == NULL) {
6282 xmlFreeParserInputBuffer(input);
6283 return (NULL);
6284 }
6285 inputPush(ctxt, stream);
6286 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6287}
6288
6289/**
6290 * htmlCtxtReadIO:
6291 * @ctxt: an HTML parser context
6292 * @ioread: an I/O read function
6293 * @ioclose: an I/O close function
6294 * @ioctx: an I/O handler
6295 * @URL: the base URL to use for the document
6296 * @encoding: the document encoding, or NULL
6297 * @options: a combination of htmlParserOption(s)
6298 *
6299 * parse an HTML document from I/O functions and source and build a tree.
6300 * This reuses the existing @ctxt parser context
6301 *
6302 * Returns the resulting document tree
6303 */
6304htmlDocPtr
6305htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6306 xmlInputCloseCallback ioclose, void *ioctx,
6307 const char *URL,
6308 const char *encoding, int options)
6309{
6310 xmlParserInputBufferPtr input;
6311 xmlParserInputPtr stream;
6312
6313 if (ioread == NULL)
6314 return (NULL);
6315 if (ctxt == NULL)
6316 return (NULL);
6317
6318 htmlCtxtReset(ctxt);
6319
6320 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6321 XML_CHAR_ENCODING_NONE);
6322 if (input == NULL)
6323 return (NULL);
6324 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6325 if (stream == NULL) {
6326 xmlFreeParserInputBuffer(input);
6327 return (NULL);
6328 }
6329 inputPush(ctxt, stream);
6330 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6331}
6332
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006333#define bottom_HTMLparser
6334#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006335#endif /* LIBXML_HTML_ENABLED */