blob: 3fe4b5dec8768d83daec84a38438187cd3027a03 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
195 return (0);
196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
198 return (0);
199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
204 ctxt->nameTab[ctxt->nameNr] = 0;
205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the
 * parser, and not exported.  All of them expect a variable named `ctxt`
 * (the parser context) to be in scope.
 *
 * Byte-level macros, to be used only to compare against ASCII values:
 *
 *   CUR_PTR   the current pointer into the input (usable as an lvalue).
 *   CUR       the current byte, as an int; ctxt->token is not consulted.
 *   CURRENT   same value as CUR.
 *   RAW       the current byte, or -1 when ctxt->token is non-zero.
 *   NXT(n)    the byte n positions after the current one.
 *   UPPER     the current byte passed through toupper().
 *   UPP(n)    the byte n positions ahead passed through toupper().
 *   SKIP(n)   advance cur, col and nbChars by n; the line counter is not
 *             touched, so it must not be used across newlines.
 *
 * Character-level macros:
 *
 *   NEXT         calls xmlNextChar(ctxt).
 *   NEXTL(l)     advance cur by l bytes, counting it as one character,
 *                update line/col and clear ctxt->token.
 *   CUR_CHAR(l)  calls htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s, l)  calls xmlStringCurrentChar(ctxt, s, &l).
 *   COPY_BUF(l,b,i,v)  store the character v of encoded length l at
 *                b[i] and advance i.
 *
 * Buffer management:
 *
 *   SHRINK    call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *             bytes are behind cur and fewer than that are ahead.
 *   GROW      call xmlParserInputGrow() when not in progressive mode and
 *             fewer than INPUT_CHUNK bytes are ahead.
 *   SKIP_BLANKS  calls htmlSkipBlankChars(ctxt).
 *
 * The statement-like macros are wrapped in do { } while (0) so that they
 * behave as a single statement, including in front of an `else`.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Owen Taylor3473f882001-02-23 17:55:21 +0000405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000410 }
411
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
413 *len = 1;
414 return((int) *ctxt->input->cur);
415}
416
417/**
Owen Taylor3473f882001-02-23 17:55:21 +0000418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000426static int
Owen Taylor3473f882001-02-23 17:55:21 +0000427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428 int res = 0;
429
William M. Brack76e95df2003-10-18 16:20:14 +0000430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433 xmlPopInput(ctxt);
434 } else {
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
438 ctxt->input->cur++;
439 ctxt->nbChars++;
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442 }
443 res++;
444 }
445 return(res);
446}
447
448
449
450/************************************************************************
451 * *
452 * The list of HTML elements and their properties *
453 * *
454 ************************************************************************/
455
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000468
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each element-class macro expands to a comma-separated list of element
 * names, and the matching NB_ macro gives the number of names in it.
 * Composite classes must join their parts with commas: adjacent string
 * literals would otherwise be concatenated by the compiler into a single
 * bogus name.  PCDATA and MODIFIER are empty and contribute no entry.
 */

#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 15
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated name lists built from the classes above */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
502
503
/*
 * ... and for HTML Attributes
 *
 * Each attribute-class macro expands to a comma-separated list of
 * attribute names; the matching NB_ macro gives the number of names.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated name lists built from the classes above */
static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;
523
524
525/* Other declarations that should go inline ... */
526static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528 "tabindex", "onfocus", "onblur", NULL } ;
529static const char* target_attr[] = { "target", NULL } ;
530static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* alt_attr[] = { "alt", NULL } ;
532static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* href_attrs[] = { "href", NULL } ;
534static const char* clear_attrs[] = { "clear", NULL } ;
535static const char* inline_p[] = { INLINE, "p", NULL } ;
536static const char* flow_param[] = { FLOW, "param", NULL } ;
537static const char* applet_attrs[] = { COREATTRS , "codebase",
538 "archive", "alt", "name", "height", "width", "align",
539 "hspace", "vspace", NULL } ;
540static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
541 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
542static const char* basefont_attrs[] =
543 { "id", "size", "color", "face", NULL } ;
544static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
545static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
546static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
547static const char* body_depr[] = { "background", "bgcolor", "text",
548 "link", "vlink", "alink", NULL } ;
549static const char* button_attrs[] = { ATTRS, "name", "value", "type",
550 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
551
552
553static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
554static const char* col_elt[] = { "col", NULL } ;
555static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
556static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
557static const char* dl_contents[] = { "dt", "dd", NULL } ;
558static const char* compact_attr[] = { "compact", NULL } ;
559static const char* label_attr[] = { "label", NULL } ;
560static const char* fieldset_contents[] = { FLOW, "legend" } ;
561static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
562static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
563static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
564static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
565static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
566static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
567static const char* head_attrs[] = { I18N, "profile", NULL } ;
568static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
569static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
570static const char* version_attr[] = { "version", NULL } ;
571static const char* html_content[] = { "head", "body", "frameset", NULL } ;
572static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
573static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
574static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
575static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
576static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
577static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
578static const char* align_attr[] = { "align", NULL } ;
579static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
580static const char* map_contents[] = { BLOCK, "area", NULL } ;
581static const char* name_attr[] = { "name", NULL } ;
582static const char* action_attr[] = { "action", NULL } ;
583static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
584static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
585static const char* content_attr[] = { "content", NULL } ;
586static const char* type_attr[] = { "type", NULL } ;
587static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
588static const char* object_contents[] = { FLOW, "param", NULL } ;
589static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
590static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
591static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
592static const char* option_elt[] = { "option", NULL } ;
593static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
594static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
595static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
596static const char* width_attr[] = { "width", NULL } ;
597static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
598static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
599static const char* language_attr[] = { "language", NULL } ;
600static const char* select_content[] = { "optgroup", "option", NULL } ;
601static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
602static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
603static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
604static const char* table_depr[] = { "align", "bgcolor", NULL } ;
605static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
606static const char* tr_elt[] = { "tr", NULL } ;
607static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
608static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
609static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
610static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
611static const char* tr_contents[] = { "th", "td", NULL } ;
612static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
613static const char* li_elt[] = { "li", NULL } ;
614static const char* ul_depr[] = { "type", "compact", NULL} ;
615static const char* dir_attr[] = { "dir", NULL} ;
616
617#define DECL (const char**)
618
Daniel Veillard22090732001-07-16 00:06:07 +0000619static const htmlElemDesc
620html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000621{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
622 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
623},
624{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
625 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
626},
627{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
628 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
629},
630{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
631 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
632},
633{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
634 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
635},
636{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
637 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
638},
639{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
640 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
641},
642{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
643 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
644},
645{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
646 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
647},
648{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
649 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
650},
651{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
652 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
653},
654{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
655 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
656},
657{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
658 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
659},
660{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
661 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
662},
663{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
664 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
665},
666{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
667 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
668},
669{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
670 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
671},
672{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
676 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
677},
678{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
679 EMPTY , NULL , DECL col_attrs , NULL, NULL
680},
681{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
682 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
683},
684{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
685 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
686},
687{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
688 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
689},
690{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
691 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
692},
693{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
694 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
695},
696{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
697 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
698},
699{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
700 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
701},
702{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
703 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
704},
705{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
706 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
707},
708{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
709 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
710},
711{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
712 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
713},
714{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
715 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
716},
717{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
718 EMPTY, NULL, NULL, DECL frame_attrs, NULL
719},
720{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
721 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
722},
723{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
724 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
725},
726{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
727 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
728},
729{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
730 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
731},
732{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
742 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
743},
744{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
745 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
746},
747{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
748 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
749},
750{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
751 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
752},
753{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
754 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
755},
756{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
757 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
758},
759{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
760 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
761},
762{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
763 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
764},
765{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
766 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
767},
768{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
769 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
770},
771{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
772 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
773},
774{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
775 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
776},
777{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
778 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
779},
780{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
781 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
782},
783{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
784 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
785},
786{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
787 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
788},
789{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
790 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
791},
792{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
793 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
794},
795{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
796 DECL html_flow, "div", DECL html_attrs, NULL, NULL
797},
798{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
799 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
800},
801{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
802 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
803},
804{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
805 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
806},
807{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
808 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
809},
810{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
814 EMPTY, NULL, DECL param_attrs, NULL, name_attr
815},
816{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
817 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
818},
819{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
820 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
821},
822{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
823 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
824},
825{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
826 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
827},
828{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
829 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
830},
831{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
832 DECL select_content, NULL, DECL select_attrs, NULL, NULL
833},
834{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
838 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
839},
840{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
841 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
842},
843{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
847 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
848},
849{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "table", 0, 0, 0, 0, 0, 0, 0, "",
856 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
857},
858{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
859 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
860},
861{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
862 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
863},
864{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
865 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
866},
867{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
874 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
875},
876{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
877 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
878},
879{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
880 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
881},
882{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
883 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
884},
885{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
886 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
887},
888{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
889 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
890},
891{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893}
Owen Taylor3473f882001-02-23 17:55:21 +0000894};
895
896/*
Owen Taylor3473f882001-02-23 17:55:21 +0000897 * start tags that imply the end of current element
898 */
Daniel Veillard22090732001-07-16 00:06:07 +0000899static const char *htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000900"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
901 "dl", "ul", "ol", "menu", "dir", "address", "pre",
902 "listing", "xmp", "head", NULL,
903"head", "p", NULL,
904"title", "p", NULL,
905"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +0000906"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000907"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
908 "pre", "listing", "xmp", "head", "li", NULL,
909"hr", "p", "head", NULL,
910"h1", "p", "head", NULL,
911"h2", "p", "head", NULL,
912"h3", "p", "head", NULL,
913"h4", "p", "head", NULL,
914"h5", "p", "head", NULL,
915"h6", "p", "head", NULL,
916"dir", "p", "head", NULL,
917"address", "p", "head", "ul", NULL,
918"pre", "p", "head", "ul", NULL,
919"listing", "p", "head", NULL,
920"xmp", "p", "head", NULL,
921"blockquote", "p", "head", NULL,
922"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
923 "xmp", "head", NULL,
924"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
925 "head", "dd", NULL,
926"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
927 "head", "dt", NULL,
928"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
929 "listing", "xmp", NULL,
930"ol", "p", "head", "ul", NULL,
931"menu", "p", "head", "ul", NULL,
932"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
933"div", "p", "head", NULL,
934"noscript", "p", "head", NULL,
935"center", "font", "b", "i", "p", "head", NULL,
936"a", "a", NULL,
937"caption", "p", NULL,
938"colgroup", "caption", "colgroup", "col", "p", NULL,
939"col", "caption", "col", "p", NULL,
940"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
941 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +0000942"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
943"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000944"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
945"thead", "caption", "col", "colgroup", NULL,
946"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
947 "tbody", "p", NULL,
948"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
949 "tfoot", "tbody", "p", NULL,
950"optgroup", "option", NULL,
951"option", "option", NULL,
952"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
953 "pre", "listing", "xmp", "a", NULL,
954NULL
955};
956
/*
 * HTML elements which are not supposed to carry character data directly.
 * htmlCheckParagraph() implies a "p" element when text shows up while
 * one of these is the current element.  The list is NULL-terminated.
 *
 * TODO: extend this list by reading what the HTML SGML DTD says about
 *       implied paragraphs.
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
970
/*
 * The list of HTML attributes whose content is of type %Script;
 * (the HTML 4 intrinsic event handlers).
 *
 * NOTE: when adding entries, check htmlIsScriptAttribute(): it rejects
 *       any name that does not start with "on" before consulting this
 *       list.
 *
 * The form-reset handler is spelled "onreset"; it was previously listed
 * as "onrest", so the real attribute name was never matched.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
996
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 * Each element name is given a priority so the parser can decide what to
 * do with a stray end tag: an end tag may only close open elements whose
 * priority is lower than or equal to its own.
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority for that element */
} elementPriority;

/*
 * The table is scanned linearly by htmlGetEndPriority(); the final
 * entry has a NULL name and holds the priority used for every element
 * that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001024
/*
 * Lookup index over htmlStartClose: once htmlInitAutoClose() has run,
 * each used slot points at the first string of one group of that table
 * and the unused slots are NULL.
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been filled. */
static int htmlStartCloseIndexinitialized = 0;
1027
1028/************************************************************************
1029 * *
1030 * functions to handle HTML specific data *
1031 * *
1032 ************************************************************************/
1033
1034/**
1035 * htmlInitAutoClose:
1036 *
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1040 */
1041void
1042htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001043 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001044
1045 if (htmlStartCloseIndexinitialized) return;
1046
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048 indx = 0;
1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1050 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001051 while (htmlStartClose[i] != NULL) i++;
1052 i++;
1053 }
1054 htmlStartCloseIndexinitialized = 1;
1055}
1056
1057/**
1058 * htmlTagLookup:
1059 * @tag: The tag name in lowercase
1060 *
1061 * Lookup the HTML tag in the ElementTable
1062 *
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1064 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001065const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001066htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001067 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001068
1069 for (i = 0; i < (sizeof(html40ElementTable) /
1070 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001072 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001073 }
1074 return(NULL);
1075}
1076
1077/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1080 *
1081 * Return value: The "endtag" priority.
1082 **/
1083static int
1084htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001087 while ((htmlEndPriority[i].name != NULL) &&
1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001090
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001092}
1093
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001094
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001095/**
Owen Taylor3473f882001-02-23 17:55:21 +00001096 * htmlCheckAutoClose:
1097 * @newtag: The new tag name
1098 * @oldtag: The old tag name
1099 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001100 * Checks whether the new tag is one of the registered valid tags for
1101 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103 *
1104 * Returns 0 if no, 1 if yes.
1105 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001106static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001109 int i, indx;
1110 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001111
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 if (htmlStartCloseIndexinitialized == 0)
1113 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001114
1115 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 if (closed == NULL)
1119 return (0);
1120 if (xmlStrEqual(BAD_CAST * closed, newtag))
1121 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001122 }
1123
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001124 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001125 i++;
1126 while (htmlStartClose[i] != NULL) {
1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 return (1);
1129 }
1130 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001131 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001132 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001133}
1134
1135/**
1136 * htmlAutoCloseOnClose:
1137 * @ctxt: an HTML parser context
1138 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001139 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001140 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001141 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001142 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001143static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145{
1146 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001147 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001149 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001150
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001151 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001152
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001153 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154 break;
1155 /*
1156 * A missplaced endtag can only close elements with lower
1157 * or equal priority, so if we find an element with higher
1158 * priority before we find an element with
1159 * matching name, we just ignore this endtag
1160 */
1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001163 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164 if (i < 0)
1165 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001166
1167 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001168 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001169 if ((info != NULL) && (info->endTag == 3)) {
1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001172 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001173 }
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001176 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
1178}
1179
1180/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001181 * htmlAutoCloseOnEnd:
1182 * @ctxt: an HTML parser context
1183 *
1184 * Close all remaining tags at the end of the stream
1185 */
1186static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001190
William M. Brack899e64a2003-09-26 18:03:42 +00001191 if (ctxt->nameNr == 0)
1192 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 }
1198}
1199
1200/**
Owen Taylor3473f882001-02-23 17:55:21 +00001201 * htmlAutoClose:
1202 * @ctxt: an HTML parser context
1203 * @newtag: The new tag name or NULL
1204 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001205 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriates closes if possible/needed.
1209 * If newtag is NULL this mean we are at the end of the resource
1210 * and we should check
1211 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001212static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001216 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001220 }
1221 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001222 htmlAutoCloseOnEnd(ctxt);
1223 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001231 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001232 }
Owen Taylor3473f882001-02-23 17:55:21 +00001233}
1234
1235/**
1236 * htmlAutoCloseTag:
1237 * @doc: the HTML document
1238 * @name: The tag name
1239 * @elem: the HTML element
1240 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001241 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of it's children would autoclose the
1244 * given tag.
1245 *
1246 * Returns 1 if autoclose, 0 otherwise
1247 */
1248int
1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 if (xmlStrEqual(name, elem->name)) return(0);
1254 if (htmlCheckAutoClose(elem->name, name)) return(1);
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, name, child)) return(1);
1258 child = child->next;
1259 }
1260 return(0);
1261}
1262
1263/**
1264 * htmlIsAutoClosed:
1265 * @doc: the HTML document
1266 * @elem: the HTML element
1267 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001268 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of it's child
1271 *
1272 * Returns 1 if autoclosed, 0 otherwise
1273 */
1274int
1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276 htmlNodePtr child;
1277
1278 if (elem == NULL) return(1);
1279 child = elem->children;
1280 while (child != NULL) {
1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282 child = child->next;
1283 }
1284 return(0);
1285}
1286
1287/**
1288 * htmlCheckImplied:
1289 * @ctxt: an HTML parser context
1290 * @newtag: The new tag name
1291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001293 * called when a new tag has been detected and generates the
1294 * appropriates implicit tags if missing
1295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296static void
Owen Taylor3473f882001-02-23 17:55:21 +00001297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298 if (!htmlOmittedDefaultValue)
1299 return;
1300 if (xmlStrEqual(newtag, BAD_CAST"html"))
1301 return;
1302 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001303 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306 }
1307 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308 return;
1309 if ((ctxt->nameNr <= 1) &&
1310 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316 /*
1317 * dropped OBJECT ... i you put it first BODY will be
1318 * assumed !
1319 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326 int i;
1327 for (i = 0;i < ctxt->nameNr;i++) {
1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329 return;
1330 }
1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332 return;
1333 }
1334 }
1335
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001336 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339 }
1340}
1341
1342/**
1343 * htmlCheckParagraph
1344 * @ctxt: an HTML parser context
1345 *
1346 * Check whether a p element need to be implied before inserting
1347 * characters in the current element.
1348 *
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350 * in case of error.
1351 */
1352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001353static int
Owen Taylor3473f882001-02-23 17:55:21 +00001354htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355 const xmlChar *tag;
1356 int i;
1357
1358 if (ctxt == NULL)
1359 return(-1);
1360 tag = ctxt->name;
1361 if (tag == NULL) {
1362 htmlAutoClose(ctxt, BAD_CAST"p");
1363 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367 return(1);
1368 }
1369 if (!htmlOmittedDefaultValue)
1370 return(0);
1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001373 htmlAutoClose(ctxt, BAD_CAST"p");
1374 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001375 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378 return(1);
1379 }
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlIsScriptAttribute:
1386 * @name: an attribute name
1387 *
1388 * Check if an attribute is of content type Script
1389 *
1390 * Returns 1 is the attribute is a script 0 otherwise
1391 */
1392int
1393htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001394 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001395
1396 if (name == NULL)
1397 return(0);
1398 /*
1399 * all script attributes start with 'on'
1400 */
1401 if ((name[0] != 'o') || (name[1] != 'n'))
1402 return(0);
1403 for (i = 0;
1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405 i++) {
1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407 return(1);
1408 }
1409 return(0);
1410}
1411
1412/************************************************************************
1413 * *
1414 * The list of HTML predefined entities *
1415 * *
1416 ************************************************************************/
1417
1418
/*
 * html40EntitiesTable:
 *
 * The predefined HTML entities known to this parser.  Each row holds the
 * entity's Unicode code point, its name (without the surrounding '&' and
 * ';') and a human-readable description.
 *
 * Rows are listed in strictly ascending code point order.  That order is
 * load-bearing: htmlEntityValueLookup() gives up as soon as it meets a row
 * whose value is greater than the one it is looking for, so a row inserted
 * out of order would become unreachable by value.  htmlEntityLookup()
 * compares names row by row and does not depend on the order.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001419static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001420/*
 * The four markup-significant characters (quot, amp, lt, gt), plus
 * apostrophe.
 */
1423{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1424{ 38, "amp", "ampersand, U+0026 ISOnum" },
1425{ 39, "apos", "single quote" },
1426{ 60, "lt", "less-than sign, U+003C ISOnum" },
1427{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1428
1429/*
 * Entities still in the 128-255 range.
 * Whether to replace them really depends on the charset used.
 */
1433{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1434{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1435{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1436{ 163, "pound","pound sign, U+00A3 ISOnum" },
1437{ 164, "curren","currency sign, U+00A4 ISOnum" },
1438{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1439{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1440{ 167, "sect", "section sign, U+00A7 ISOnum" },
1441{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1442{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1443{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1444{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1445{ 172, "not", "not sign, U+00AC ISOnum" },
1446{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1447{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1448{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1449{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1450{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1451{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1452{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1453{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1454{ 181, "micro","micro sign, U+00B5 ISOnum" },
1455{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1456{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1457{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1458{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1459{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1460{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1461{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1462{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1463{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1464{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1465{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1466{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1467{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1468{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1469{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1470{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1471{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1472{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1473{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1474{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1475{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1476{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1477{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1478{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1479{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1480{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1481{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1482{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1483{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1484{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1485{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1486{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1487{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1488{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1489{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1490{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1491{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1492{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1493{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1494{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1495{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1496{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1497{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1498{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1499{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1500{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1501{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1502{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1503{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1504{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1505{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1506{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1507{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1508{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1509{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1510{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1511{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1512{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1513{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1514{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1515{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1516{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1517{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1518{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1519{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1520{ 247, "divide","division sign, U+00F7 ISOnum" },
1521{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1522{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1523{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1524{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1525{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1526{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1527{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1528{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1529
1530{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1531{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1532{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1533{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1534{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1535
1536/*
 * Anything below should really be kept as entity references.
 */
1539{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1540
1541{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1542{ 732, "tilde","small tilde, U+02DC ISOdia" },
1543
1544{ 913, "Alpha","greek capital letter alpha, U+0391" },
1545{ 914, "Beta", "greek capital letter beta, U+0392" },
1546{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1547{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1548{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1549{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1550{ 919, "Eta", "greek capital letter eta, U+0397" },
1551{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1552{ 921, "Iota", "greek capital letter iota, U+0399" },
1553{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001554{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001555{ 924, "Mu", "greek capital letter mu, U+039C" },
1556{ 925, "Nu", "greek capital letter nu, U+039D" },
1557{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1558{ 927, "Omicron","greek capital letter omicron, U+039F" },
1559{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1560{ 929, "Rho", "greek capital letter rho, U+03A1" },
1561{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1562{ 932, "Tau", "greek capital letter tau, U+03A4" },
1563{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1564{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1565{ 935, "Chi", "greek capital letter chi, U+03A7" },
1566{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1567{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1568
1569{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1570{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1571{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1572{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1573{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1574{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1575{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1576{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1577{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1578{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1579{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1580{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1581{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1582{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1583{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1584{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1585{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1586{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1587{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1588{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1589{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1590{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1591{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1592{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1593{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1594{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1595{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1596{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1597
1598{ 8194, "ensp", "en space, U+2002 ISOpub" },
1599{ 8195, "emsp", "em space, U+2003 ISOpub" },
1600{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1601{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1602{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1603{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1604{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1605{ 8211, "ndash","en dash, U+2013 ISOpub" },
1606{ 8212, "mdash","em dash, U+2014 ISOpub" },
1607{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1608{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1609{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1610{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1611{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1612{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1613{ 8224, "dagger","dagger, U+2020 ISOpub" },
1614{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1615
1616{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1617{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1618
1619{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1620
1621{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1622{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1623
1624{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1625{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1626
1627{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1628{ 8260, "frasl","fraction slash, U+2044 NEW" },
1629
1630{ 8364, "euro", "euro sign, U+20AC NEW" },
1631
1632{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1633{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1634{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1635{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1636{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1637{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1638{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1639{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1640{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1641{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1642{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1643{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1644{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1645{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1646{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1647{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1648
1649{ 8704, "forall","for all, U+2200 ISOtech" },
1650{ 8706, "part", "partial differential, U+2202 ISOtech" },
1651{ 8707, "exist","there exists, U+2203 ISOtech" },
1652{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1653{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1654{ 8712, "isin", "element of, U+2208 ISOtech" },
1655{ 8713, "notin","not an element of, U+2209 ISOtech" },
1656{ 8715, "ni", "contains as member, U+220B ISOtech" },
1657{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001658{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001659{ 8722, "minus","minus sign, U+2212 ISOtech" },
1660{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1661{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1662{ 8733, "prop", "proportional to, U+221D ISOtech" },
1663{ 8734, "infin","infinity, U+221E ISOtech" },
1664{ 8736, "ang", "angle, U+2220 ISOamso" },
1665{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1666{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1667{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1668{ 8746, "cup", "union = cup, U+222A ISOtech" },
1669{ 8747, "int", "integral, U+222B ISOtech" },
1670{ 8756, "there4","therefore, U+2234 ISOtech" },
1671{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1672{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1673{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1674{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1675{ 8801, "equiv","identical to, U+2261 ISOtech" },
1676{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1677{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1678{ 8834, "sub", "subset of, U+2282 ISOtech" },
1679{ 8835, "sup", "superset of, U+2283 ISOtech" },
1680{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1681{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1682{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1683{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1684{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1685{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1686{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1687{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1688{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1689{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1690{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1691{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1692{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1693{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1694
1695{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1696{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1697{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1698{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1699
1700};
1701
1702/************************************************************************
1703 * *
1704 * Commodity functions to handle entities *
1705 * *
1706 ************************************************************************/
1707
/*
 * growBuffer:
 *
 * Double the capacity of a heap-allocated xmlChar buffer in place.
 *
 * For an argument `buf` the expansion uses two variables from the calling
 * scope: `buf` itself (the storage pointer) and `buf_size` (its capacity,
 * which is doubled here).  It also uses a parser context named `ctxt` for
 * error reporting.
 *
 * If the reallocation fails the error is reported, the old storage is
 * freed and the *enclosing function* returns NULL, so the macro can only
 * be used inside functions that return a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
        buffer = tmp;							\
    } else {								\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
}
1722
1723/**
1724 * htmlEntityLookup:
1725 * @name: the entity name
1726 *
1727 * Lookup the given entity in EntitiesTable
1728 *
1729 * TODO: the linear scan is really ugly, an hash table is really needed.
1730 *
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001733const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001734htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001735 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001736
1737 for (i = 0;i < (sizeof(html40EntitiesTable)/
1738 sizeof(html40EntitiesTable[0]));i++) {
1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001740 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 }
1743 return(NULL);
1744}
1745
1746/**
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1749 *
1750 * Lookup the given entity in EntitiesTable
1751 *
1752 * TODO: the linear scan is really ugly, an hash table is really needed.
1753 *
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001756const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001757htmlEntityValueLookup(unsigned int value) {
1758 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001759
1760 for (i = 0;i < (sizeof(html40EntitiesTable)/
1761 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001762 if (html40EntitiesTable[i].value >= value) {
1763 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001764 break;
William M. Brack78637da2003-07-31 14:47:38 +00001765 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001766 }
Owen Taylor3473f882001-02-23 17:55:21 +00001767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
1797 if (in == NULL) {
1798 /*
1799 * initialization nothing to do
1800 */
1801 *outlen = 0;
1802 *inlen = 0;
1803 return(0);
1804 }
1805 inend = in + (*inlen);
1806 outend = out + (*outlen);
1807 while (in < inend) {
1808 d = *in++;
1809 if (d < 0x80) { c= d; trailing= 0; }
1810 else if (d < 0xC0) {
1811 /* trailing byte in leading position */
1812 *outlen = out - outstart;
1813 *inlen = processed - instart;
1814 return(-2);
1815 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1816 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1817 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1818 else {
1819 /* no chance for this in Ascii */
1820 *outlen = out - outstart;
1821 *inlen = processed - instart;
1822 return(-2);
1823 }
1824
1825 if (inend - in < trailing) {
1826 break;
1827 }
1828
1829 for ( ; trailing; trailing--) {
1830 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1831 break;
1832 c <<= 6;
1833 c |= d & 0x3F;
1834 }
1835
1836 /* assertion: c is a single UTF-4 value */
1837 if (c < 0x80) {
1838 if (out + 1 >= outend)
1839 break;
1840 *out++ = c;
1841 } else {
1842 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001843 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001844
1845 /*
1846 * Try to lookup a predefined HTML entity for it
1847 */
1848
1849 ent = htmlEntityValueLookup(c);
1850 if (ent == NULL) {
1851 /* no chance for this in Ascii */
1852 *outlen = out - outstart;
1853 *inlen = processed - instart;
1854 return(-2);
1855 }
1856 len = strlen(ent->name);
1857 if (out + 2 + len >= outend)
1858 break;
1859 *out++ = '&';
1860 memcpy(out, ent->name, len);
1861 out += len;
1862 *out++ = ';';
1863 }
1864 processed = in;
1865 }
1866 *outlen = out - outstart;
1867 *inlen = processed - instart;
1868 return(0);
1869}
1870
1871/**
1872 * htmlEncodeEntities:
1873 * @out: a pointer to an array of bytes to store the result
1874 * @outlen: the length of @out
1875 * @in: a pointer to an array of UTF-8 chars
1876 * @inlen: the length of @in
1877 * @quoteChar: the quote character to escape (' or ") or zero.
1878 *
1879 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1880 * plus HTML entities block of chars out.
1881 *
1882 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1883 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001884 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001885 * The value of @outlen after return is the number of octets consumed.
1886 */
1887int
1888htmlEncodeEntities(unsigned char* out, int *outlen,
1889 const unsigned char* in, int *inlen, int quoteChar) {
1890 const unsigned char* processed = in;
1891 const unsigned char* outend = out + (*outlen);
1892 const unsigned char* outstart = out;
1893 const unsigned char* instart = in;
1894 const unsigned char* inend = in + (*inlen);
1895 unsigned int c, d;
1896 int trailing;
1897
1898 while (in < inend) {
1899 d = *in++;
1900 if (d < 0x80) { c= d; trailing= 0; }
1901 else if (d < 0xC0) {
1902 /* trailing byte in leading position */
1903 *outlen = out - outstart;
1904 *inlen = processed - instart;
1905 return(-2);
1906 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1907 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1908 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1909 else {
1910 /* no chance for this in Ascii */
1911 *outlen = out - outstart;
1912 *inlen = processed - instart;
1913 return(-2);
1914 }
1915
1916 if (inend - in < trailing)
1917 break;
1918
1919 while (trailing--) {
1920 if (((d= *in++) & 0xC0) != 0x80) {
1921 *outlen = out - outstart;
1922 *inlen = processed - instart;
1923 return(-2);
1924 }
1925 c <<= 6;
1926 c |= d & 0x3F;
1927 }
1928
1929 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001930 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1931 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001932 if (out >= outend)
1933 break;
1934 *out++ = c;
1935 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001936 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001937 const char *cp;
1938 char nbuf[16];
1939 int len;
1940
1941 /*
1942 * Try to lookup a predefined HTML entity for it
1943 */
1944 ent = htmlEntityValueLookup(c);
1945 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001946 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001947 cp = nbuf;
1948 }
1949 else
1950 cp = ent->name;
1951 len = strlen(cp);
1952 if (out + 2 + len > outend)
1953 break;
1954 *out++ = '&';
1955 memcpy(out, cp, len);
1956 out += len;
1957 *out++ = ';';
1958 }
1959 processed = in;
1960 }
1961 *outlen = out - outstart;
1962 *inlen = processed - instart;
1963 return(0);
1964}
1965
Owen Taylor3473f882001-02-23 17:55:21 +00001966/************************************************************************
1967 * *
1968 * Commodity functions to handle streams *
1969 * *
1970 ************************************************************************/
1971
1972/**
Owen Taylor3473f882001-02-23 17:55:21 +00001973 * htmlNewInputStream:
1974 * @ctxt: an HTML parser context
1975 *
1976 * Create a new input stream structure
1977 * Returns the new input stream or NULL
1978 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001979static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001980htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1981 htmlParserInputPtr input;
1982
1983 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1984 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001985 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001986 return(NULL);
1987 }
1988 memset(input, 0, sizeof(htmlParserInput));
1989 input->filename = NULL;
1990 input->directory = NULL;
1991 input->base = NULL;
1992 input->cur = NULL;
1993 input->buf = NULL;
1994 input->line = 1;
1995 input->col = 1;
1996 input->buf = NULL;
1997 input->free = NULL;
1998 input->version = NULL;
1999 input->consumed = 0;
2000 input->length = 0;
2001 return(input);
2002}
2003
2004
2005/************************************************************************
2006 * *
2007 * Commodity functions, cleanup needed ? *
2008 * *
2009 ************************************************************************/
/*
 * Names of all the tags that allow PCDATA according to the HTML 4.01
 * loose DTD, one initial letter per line.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002024
2025/**
2026 * areBlanks:
2027 * @ctxt: an HTML parser context
2028 * @str: a xmlChar *
2029 * @len: the size of @str
2030 *
2031 * Is this a sequence of blank chars that one can ignore ?
2032 *
2033 * Returns 1 if ignorable 0 otherwise.
2034 */
2035
2036static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002037 unsigned int i;
2038 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002039 xmlNodePtr lastChild;
2040
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002041 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002042 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002043
2044 if (CUR == 0) return(1);
2045 if (CUR != '<') return(0);
2046 if (ctxt->name == NULL)
2047 return(1);
2048 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2049 return(1);
2050 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2051 return(1);
2052 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2053 return(1);
2054 if (ctxt->node == NULL) return(0);
2055 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002056 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2057 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002058 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002059 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2060 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002061 /* keep ws in constructs like ...<b> </b>...
2062 for all tags "b" allowing PCDATA */
2063 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2064 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2065 return(0);
2066 }
2067 }
Owen Taylor3473f882001-02-23 17:55:21 +00002068 } else if (xmlNodeIsText(lastChild)) {
2069 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002070 } else {
2071 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2072 for all tags "p" allowing PCDATA */
2073 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2074 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2075 return(0);
2076 }
2077 }
Owen Taylor3473f882001-02-23 17:55:21 +00002078 }
2079 return(1);
2080}
2081
2082/**
Owen Taylor3473f882001-02-23 17:55:21 +00002083 * htmlNewDocNoDtD:
2084 * @URI: URI for the dtd, or NULL
2085 * @ExternalID: the external ID of the DTD, or NULL
2086 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002087 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2088 * are NULL
2089 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002090 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002091 */
2092htmlDocPtr
2093htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2094 xmlDocPtr cur;
2095
2096 /*
2097 * Allocate a new document and fill the fields.
2098 */
2099 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2100 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002101 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002102 return(NULL);
2103 }
2104 memset(cur, 0, sizeof(xmlDoc));
2105
2106 cur->type = XML_HTML_DOCUMENT_NODE;
2107 cur->version = NULL;
2108 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002109 cur->doc = cur;
2110 cur->name = NULL;
2111 cur->children = NULL;
2112 cur->extSubset = NULL;
2113 cur->oldNs = NULL;
2114 cur->encoding = NULL;
2115 cur->standalone = 1;
2116 cur->compression = 0;
2117 cur->ids = NULL;
2118 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002119 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002120 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002121 if ((ExternalID != NULL) ||
2122 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002123 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002124 return(cur);
2125}
2126
2127/**
2128 * htmlNewDoc:
2129 * @URI: URI for the dtd, or NULL
2130 * @ExternalID: the external ID of the DTD, or NULL
2131 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002132 * Creates a new HTML document
2133 *
Owen Taylor3473f882001-02-23 17:55:21 +00002134 * Returns a new document
2135 */
2136htmlDocPtr
2137htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2138 if ((URI == NULL) && (ExternalID == NULL))
2139 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002140 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2141 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002142
2143 return(htmlNewDocNoDtD(URI, ExternalID));
2144}
2145
2146
/************************************************************************
 *                                                                      *
 *                      The parser itself                               *
 *      Relates to http://www.w3.org/TR/html40                          *
 *                                                                      *
 ************************************************************************/
2159
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002160static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002161
Owen Taylor3473f882001-02-23 17:55:21 +00002162/**
2163 * htmlParseHTMLName:
2164 * @ctxt: an HTML parser context
2165 *
2166 * parse an HTML tag or attribute name, note that we convert it to lowercase
2167 * since HTML names are not case-sensitive.
2168 *
2169 * Returns the Tag Name parsed or NULL
2170 */
2171
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002172static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002173htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002174 int i = 0;
2175 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2176
William M. Brackd1757ab2004-10-02 22:07:48 +00002177 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002178 (CUR != ':')) return(NULL);
2179
2180 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002181 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002182 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2183 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2184 else loc[i] = CUR;
2185 i++;
2186
2187 NEXT;
2188 }
2189
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002190 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002191}
2192
2193/**
2194 * htmlParseName:
2195 * @ctxt: an HTML parser context
2196 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002197 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002198 *
2199 * Returns the Name parsed or NULL
2200 */
2201
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002202static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002203htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002204 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002205 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002206 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002207
2208 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002209
2210 /*
2211 * Accelerator for simple ASCII names
2212 */
2213 in = ctxt->input->cur;
2214 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2215 ((*in >= 0x41) && (*in <= 0x5A)) ||
2216 (*in == '_') || (*in == ':')) {
2217 in++;
2218 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2219 ((*in >= 0x41) && (*in <= 0x5A)) ||
2220 ((*in >= 0x30) && (*in <= 0x39)) ||
2221 (*in == '_') || (*in == '-') ||
2222 (*in == ':') || (*in == '.'))
2223 in++;
2224 if ((*in > 0) && (*in < 0x80)) {
2225 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002226 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002227 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002228 ctxt->nbChars += count;
2229 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002230 return(ret);
2231 }
2232 }
2233 return(htmlParseNameComplex(ctxt));
2234}
2235
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002236static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002237htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002238 int len = 0, l;
2239 int c;
2240 int count = 0;
2241
2242 /*
2243 * Handler for more complex cases
2244 */
2245 GROW;
2246 c = CUR_CHAR(l);
2247 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2248 (!IS_LETTER(c) && (c != '_') &&
2249 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002250 return(NULL);
2251 }
2252
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002253 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2254 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2255 (c == '.') || (c == '-') ||
2256 (c == '_') || (c == ':') ||
2257 (IS_COMBINING(c)) ||
2258 (IS_EXTENDER(c)))) {
2259 if (count++ > 100) {
2260 count = 0;
2261 GROW;
2262 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002263 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002264 NEXTL(l);
2265 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002266 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002267 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002268}
2269
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002270
Owen Taylor3473f882001-02-23 17:55:21 +00002271/**
2272 * htmlParseHTMLAttribute:
2273 * @ctxt: an HTML parser context
2274 * @stop: a char stop value
2275 *
2276 * parse an HTML attribute value till the stop (quote), if
2277 * stop is 0 then it stops at the first space
2278 *
2279 * Returns the attribute parsed or NULL
2280 */
2281
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002282static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002283htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2284 xmlChar *buffer = NULL;
2285 int buffer_size = 0;
2286 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002287 const xmlChar *name = NULL;
2288 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002289 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002290
2291 /*
2292 * allocate a translation buffer.
2293 */
2294 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002295 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002296 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002297 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002298 return(NULL);
2299 }
2300 out = buffer;
2301
2302 /*
2303 * Ok loop until we reach one of the ending chars
2304 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002305 while ((CUR != 0) && (CUR != stop)) {
2306 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002307 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002308 if (CUR == '&') {
2309 if (NXT(1) == '#') {
2310 unsigned int c;
2311 int bits;
2312
2313 c = htmlParseCharRef(ctxt);
2314 if (c < 0x80)
2315 { *out++ = c; bits= -6; }
2316 else if (c < 0x800)
2317 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2318 else if (c < 0x10000)
2319 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2320 else
2321 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2322
2323 for ( ; bits >= 0; bits-= 6) {
2324 *out++ = ((c >> bits) & 0x3F) | 0x80;
2325 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002326
2327 if (out - buffer > buffer_size - 100) {
2328 int indx = out - buffer;
2329
2330 growBuffer(buffer);
2331 out = &buffer[indx];
2332 }
Owen Taylor3473f882001-02-23 17:55:21 +00002333 } else {
2334 ent = htmlParseEntityRef(ctxt, &name);
2335 if (name == NULL) {
2336 *out++ = '&';
2337 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002338 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002339
2340 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002341 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002342 }
2343 } else if (ent == NULL) {
2344 *out++ = '&';
2345 cur = name;
2346 while (*cur != 0) {
2347 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002348 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002349
2350 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002351 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002352 }
2353 *out++ = *cur++;
2354 }
Owen Taylor3473f882001-02-23 17:55:21 +00002355 } else {
2356 unsigned int c;
2357 int bits;
2358
2359 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002360 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002361
2362 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002363 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002364 }
2365 c = (xmlChar)ent->value;
2366 if (c < 0x80)
2367 { *out++ = c; bits= -6; }
2368 else if (c < 0x800)
2369 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2370 else if (c < 0x10000)
2371 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2372 else
2373 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2374
2375 for ( ; bits >= 0; bits-= 6) {
2376 *out++ = ((c >> bits) & 0x3F) | 0x80;
2377 }
Owen Taylor3473f882001-02-23 17:55:21 +00002378 }
2379 }
2380 } else {
2381 unsigned int c;
2382 int bits, l;
2383
2384 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002385 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002386
2387 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002388 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002389 }
2390 c = CUR_CHAR(l);
2391 if (c < 0x80)
2392 { *out++ = c; bits= -6; }
2393 else if (c < 0x800)
2394 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2395 else if (c < 0x10000)
2396 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2397 else
2398 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2399
2400 for ( ; bits >= 0; bits-= 6) {
2401 *out++ = ((c >> bits) & 0x3F) | 0x80;
2402 }
2403 NEXT;
2404 }
2405 }
2406 *out++ = 0;
2407 return(buffer);
2408}
2409
2410/**
Owen Taylor3473f882001-02-23 17:55:21 +00002411 * htmlParseEntityRef:
2412 * @ctxt: an HTML parser context
2413 * @str: location to store the entity name
2414 *
2415 * parse an HTML ENTITY references
2416 *
2417 * [68] EntityRef ::= '&' Name ';'
2418 *
2419 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2420 * if non-NULL *str will have to be freed by the caller.
2421 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002422const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002423htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2424 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002425 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002426 *str = NULL;
2427
2428 if (CUR == '&') {
2429 NEXT;
2430 name = htmlParseName(ctxt);
2431 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002432 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2433 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002434 } else {
2435 GROW;
2436 if (CUR == ';') {
2437 *str = name;
2438
2439 /*
2440 * Lookup the entity in the table.
2441 */
2442 ent = htmlEntityLookup(name);
2443 if (ent != NULL) /* OK that's ugly !!! */
2444 NEXT;
2445 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002446 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2447 "htmlParseEntityRef: expecting ';'\n",
2448 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002449 *str = name;
2450 }
2451 }
2452 }
2453 return(ent);
2454}
2455
2456/**
2457 * htmlParseAttValue:
2458 * @ctxt: an HTML parser context
2459 *
2460 * parse a value for an attribute
2461 * Note: the parser won't do substitution of entities here, this
2462 * will be handled later in xmlStringGetNodeList, unless it was
2463 * asked for ctxt->replaceEntities != 0
2464 *
2465 * Returns the AttValue parsed or NULL.
2466 */
2467
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002468static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002469htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2470 xmlChar *ret = NULL;
2471
2472 if (CUR == '"') {
2473 NEXT;
2474 ret = htmlParseHTMLAttribute(ctxt, '"');
2475 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002476 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2477 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002478 } else
2479 NEXT;
2480 } else if (CUR == '\'') {
2481 NEXT;
2482 ret = htmlParseHTMLAttribute(ctxt, '\'');
2483 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002484 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2485 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002486 } else
2487 NEXT;
2488 } else {
2489 /*
2490 * That's an HTMLism, the attribute value may not be quoted
2491 */
2492 ret = htmlParseHTMLAttribute(ctxt, 0);
2493 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002494 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2495 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002496 }
2497 }
2498 return(ret);
2499}
2500
2501/**
2502 * htmlParseSystemLiteral:
2503 * @ctxt: an HTML parser context
2504 *
2505 * parse an HTML Literal
2506 *
2507 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2508 *
2509 * Returns the SystemLiteral parsed or NULL
2510 */
2511
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002512static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002513htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2514 const xmlChar *q;
2515 xmlChar *ret = NULL;
2516
2517 if (CUR == '"') {
2518 NEXT;
2519 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002520 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002521 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002522 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002523 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2524 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002525 } else {
2526 ret = xmlStrndup(q, CUR_PTR - q);
2527 NEXT;
2528 }
2529 } else if (CUR == '\'') {
2530 NEXT;
2531 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002532 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002533 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002534 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002535 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2536 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002537 } else {
2538 ret = xmlStrndup(q, CUR_PTR - q);
2539 NEXT;
2540 }
2541 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002542 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2543 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002544 }
2545
2546 return(ret);
2547}
2548
2549/**
2550 * htmlParsePubidLiteral:
2551 * @ctxt: an HTML parser context
2552 *
2553 * parse an HTML public literal
2554 *
2555 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2556 *
2557 * Returns the PubidLiteral parsed or NULL.
2558 */
2559
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002560static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002561htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2562 const xmlChar *q;
2563 xmlChar *ret = NULL;
2564 /*
2565 * Name ::= (Letter | '_') (NameChar)*
2566 */
2567 if (CUR == '"') {
2568 NEXT;
2569 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002570 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002571 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002572 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2573 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002574 } else {
2575 ret = xmlStrndup(q, CUR_PTR - q);
2576 NEXT;
2577 }
2578 } else if (CUR == '\'') {
2579 NEXT;
2580 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002581 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002582 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002583 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002584 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2585 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002586 } else {
2587 ret = xmlStrndup(q, CUR_PTR - q);
2588 NEXT;
2589 }
2590 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002591 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2592 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002593 }
2594
2595 return(ret);
2596}
2597
2598/**
2599 * htmlParseScript:
2600 * @ctxt: an HTML parser context
2601 *
2602 * parse the content of an HTML SCRIPT or STYLE element
2603 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2604 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2605 * http://www.w3.org/TR/html4/types.html#type-script
2606 * http://www.w3.org/TR/html4/types.html#h-6.15
2607 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2608 *
2609 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2610 * element and the value of intrinsic event attributes. User agents must
2611 * not evaluate script data as HTML markup but instead must pass it on as
2612 * data to a script engine.
2613 * NOTES:
2614 * - The content is passed like CDATA
2615 * - the attributes for style and scripting "onXXX" are also described
2616 * as CDATA but SGML allows entities references in attributes so their
2617 * processing is identical as other attributes
2618 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002619static void
Owen Taylor3473f882001-02-23 17:55:21 +00002620htmlParseScript(htmlParserCtxtPtr ctxt) {
2621 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2622 int nbchar = 0;
2623 xmlChar cur;
2624
2625 SHRINK;
2626 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002627 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002628 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2629 (NXT(3) == '-')) {
2630 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2631 if (ctxt->sax->cdataBlock!= NULL) {
2632 /*
2633 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2634 */
2635 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002636 } else if (ctxt->sax->characters != NULL) {
2637 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002638 }
2639 }
2640 nbchar = 0;
2641 htmlParseComment(ctxt);
2642 cur = CUR;
2643 continue;
2644 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002645 /*
2646 * One should break here, the specification is clear:
2647 * Authors should therefore escape "</" within the content.
2648 * Escape mechanisms are specific to each scripting or
2649 * style sheet language.
2650 */
2651 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2652 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2653 break; /* while */
2654 }
2655 buf[nbchar++] = cur;
2656 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2657 if (ctxt->sax->cdataBlock!= NULL) {
2658 /*
2659 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2660 */
2661 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002662 } else if (ctxt->sax->characters != NULL) {
2663 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002664 }
2665 nbchar = 0;
2666 }
2667 NEXT;
2668 cur = CUR;
2669 }
William M. Brack76e95df2003-10-18 16:20:14 +00002670 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002671 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2672 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002673 NEXT;
2674 }
2675
2676 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2677 if (ctxt->sax->cdataBlock!= NULL) {
2678 /*
2679 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2680 */
2681 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002682 } else if (ctxt->sax->characters != NULL) {
2683 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002684 }
2685 }
2686}
2687
2688
2689/**
2690 * htmlParseCharData:
2691 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002692 *
2693 * parse a CharData section.
2694 * if we are within a CDATA section ']]>' marks an end of section.
2695 *
2696 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2697 */
2698
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002699static void
2700htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002701 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2702 int nbchar = 0;
2703 int cur, l;
2704
2705 SHRINK;
2706 cur = CUR_CHAR(l);
2707 while (((cur != '<') || (ctxt->token == '<')) &&
2708 ((cur != '&') || (ctxt->token == '&')) &&
2709 (IS_CHAR(cur))) {
2710 COPY_BUF(l,buf,nbchar,cur);
2711 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2712 /*
2713 * Ok the segment is to be consumed as chars.
2714 */
2715 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2716 if (areBlanks(ctxt, buf, nbchar)) {
2717 if (ctxt->sax->ignorableWhitespace != NULL)
2718 ctxt->sax->ignorableWhitespace(ctxt->userData,
2719 buf, nbchar);
2720 } else {
2721 htmlCheckParagraph(ctxt);
2722 if (ctxt->sax->characters != NULL)
2723 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2724 }
2725 }
2726 nbchar = 0;
2727 }
2728 NEXTL(l);
2729 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002730 if (cur == 0) {
2731 SHRINK;
2732 GROW;
2733 cur = CUR_CHAR(l);
2734 }
Owen Taylor3473f882001-02-23 17:55:21 +00002735 }
2736 if (nbchar != 0) {
2737 /*
2738 * Ok the segment is to be consumed as chars.
2739 */
2740 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2741 if (areBlanks(ctxt, buf, nbchar)) {
2742 if (ctxt->sax->ignorableWhitespace != NULL)
2743 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2744 } else {
2745 htmlCheckParagraph(ctxt);
2746 if (ctxt->sax->characters != NULL)
2747 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2748 }
2749 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002750 } else {
2751 /*
2752 * Loop detection
2753 */
2754 if (cur == 0)
2755 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002756 }
2757}
2758
2759/**
2760 * htmlParseExternalID:
2761 * @ctxt: an HTML parser context
2762 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002763 *
2764 * Parse an External ID or a Public ID
2765 *
Owen Taylor3473f882001-02-23 17:55:21 +00002766 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2767 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2768 *
2769 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2770 *
2771 * Returns the function returns SystemLiteral and in the second
2772 * case publicID receives PubidLiteral, is strict is off
2773 * it is possible to return NULL and have publicID set.
2774 */
2775
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002776static xmlChar *
2777htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002778 xmlChar *URI = NULL;
2779
2780 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2781 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2782 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2783 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002784 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002785 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2786 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002787 }
2788 SKIP_BLANKS;
2789 URI = htmlParseSystemLiteral(ctxt);
2790 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002791 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2792 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002793 }
2794 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2795 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2796 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2797 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002798 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002799 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2800 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002801 }
2802 SKIP_BLANKS;
2803 *publicID = htmlParsePubidLiteral(ctxt);
2804 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002805 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2806 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2807 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002808 }
2809 SKIP_BLANKS;
2810 if ((CUR == '"') || (CUR == '\'')) {
2811 URI = htmlParseSystemLiteral(ctxt);
2812 }
2813 }
2814 return(URI);
2815}
2816
2817/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002818 * xmlParsePI:
2819 * @ctxt: an XML parser context
2820 *
2821 * parse an XML Processing Instruction.
2822 *
2823 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2824 */
2825static void
2826htmlParsePI(htmlParserCtxtPtr ctxt) {
2827 xmlChar *buf = NULL;
2828 int len = 0;
2829 int size = HTML_PARSER_BUFFER_SIZE;
2830 int cur, l;
2831 const xmlChar *target;
2832 xmlParserInputState state;
2833 int count = 0;
2834
2835 if ((RAW == '<') && (NXT(1) == '?')) {
2836 state = ctxt->instate;
2837 ctxt->instate = XML_PARSER_PI;
2838 /*
2839 * this is a Processing Instruction.
2840 */
2841 SKIP(2);
2842 SHRINK;
2843
2844 /*
2845 * Parse the target name and check for special support like
2846 * namespace.
2847 */
2848 target = htmlParseName(ctxt);
2849 if (target != NULL) {
2850 if (RAW == '>') {
2851 SKIP(1);
2852
2853 /*
2854 * SAX: PI detected.
2855 */
2856 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2857 (ctxt->sax->processingInstruction != NULL))
2858 ctxt->sax->processingInstruction(ctxt->userData,
2859 target, NULL);
2860 ctxt->instate = state;
2861 return;
2862 }
2863 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2864 if (buf == NULL) {
2865 htmlErrMemory(ctxt, NULL);
2866 ctxt->instate = state;
2867 return;
2868 }
2869 cur = CUR;
2870 if (!IS_BLANK(cur)) {
2871 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2872 "ParsePI: PI %s space expected\n", target, NULL);
2873 }
2874 SKIP_BLANKS;
2875 cur = CUR_CHAR(l);
2876 while (IS_CHAR(cur) && (cur != '>')) {
2877 if (len + 5 >= size) {
2878 xmlChar *tmp;
2879
2880 size *= 2;
2881 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2882 if (tmp == NULL) {
2883 htmlErrMemory(ctxt, NULL);
2884 xmlFree(buf);
2885 ctxt->instate = state;
2886 return;
2887 }
2888 buf = tmp;
2889 }
2890 count++;
2891 if (count > 50) {
2892 GROW;
2893 count = 0;
2894 }
2895 COPY_BUF(l,buf,len,cur);
2896 NEXTL(l);
2897 cur = CUR_CHAR(l);
2898 if (cur == 0) {
2899 SHRINK;
2900 GROW;
2901 cur = CUR_CHAR(l);
2902 }
2903 }
2904 buf[len] = 0;
2905 if (cur != '>') {
2906 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2907 "ParsePI: PI %s never end ...\n", target, NULL);
2908 } else {
2909 SKIP(1);
2910
2911 /*
2912 * SAX: PI detected.
2913 */
2914 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2915 (ctxt->sax->processingInstruction != NULL))
2916 ctxt->sax->processingInstruction(ctxt->userData,
2917 target, buf);
2918 }
2919 xmlFree(buf);
2920 } else {
2921 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2922 "PI is not started correctly", NULL, NULL);
2923 }
2924 ctxt->instate = state;
2925 }
2926}
2927
2928/**
Owen Taylor3473f882001-02-23 17:55:21 +00002929 * htmlParseComment:
2930 * @ctxt: an HTML parser context
2931 *
2932 * Parse an XML (SGML) comment <!-- .... -->
2933 *
2934 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2935 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002936static void
Owen Taylor3473f882001-02-23 17:55:21 +00002937htmlParseComment(htmlParserCtxtPtr ctxt) {
2938 xmlChar *buf = NULL;
2939 int len;
2940 int size = HTML_PARSER_BUFFER_SIZE;
2941 int q, ql;
2942 int r, rl;
2943 int cur, l;
2944 xmlParserInputState state;
2945
2946 /*
2947 * Check that there is a comment right here.
2948 */
2949 if ((RAW != '<') || (NXT(1) != '!') ||
2950 (NXT(2) != '-') || (NXT(3) != '-')) return;
2951
2952 state = ctxt->instate;
2953 ctxt->instate = XML_PARSER_COMMENT;
2954 SHRINK;
2955 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002956 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002957 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002958 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002959 ctxt->instate = state;
2960 return;
2961 }
2962 q = CUR_CHAR(ql);
2963 NEXTL(ql);
2964 r = CUR_CHAR(rl);
2965 NEXTL(rl);
2966 cur = CUR_CHAR(l);
2967 len = 0;
2968 while (IS_CHAR(cur) &&
2969 ((cur != '>') ||
2970 (r != '-') || (q != '-'))) {
2971 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00002972 xmlChar *tmp;
2973
Owen Taylor3473f882001-02-23 17:55:21 +00002974 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00002975 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2976 if (tmp == NULL) {
2977 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00002978 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002979 ctxt->instate = state;
2980 return;
2981 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00002982 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00002983 }
2984 COPY_BUF(ql,buf,len,q);
2985 q = r;
2986 ql = rl;
2987 r = cur;
2988 rl = l;
2989 NEXTL(l);
2990 cur = CUR_CHAR(l);
2991 if (cur == 0) {
2992 SHRINK;
2993 GROW;
2994 cur = CUR_CHAR(l);
2995 }
2996 }
2997 buf[len] = 0;
2998 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002999 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3000 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003001 xmlFree(buf);
3002 } else {
3003 NEXT;
3004 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3005 (!ctxt->disableSAX))
3006 ctxt->sax->comment(ctxt->userData, buf);
3007 xmlFree(buf);
3008 }
3009 ctxt->instate = state;
3010}
3011
3012/**
3013 * htmlParseCharRef:
3014 * @ctxt: an HTML parser context
3015 *
3016 * parse Reference declarations
3017 *
3018 * [66] CharRef ::= '&#' [0-9]+ ';' |
3019 * '&#x' [0-9a-fA-F]+ ';'
3020 *
3021 * Returns the value parsed (as an int)
3022 */
3023int
3024htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3025 int val = 0;
3026
Daniel Veillarda03e3652004-11-02 18:45:30 +00003027 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3028 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3029 "htmlParseCharRef: context error\n",
3030 NULL, NULL);
3031 return(0);
3032 }
Owen Taylor3473f882001-02-23 17:55:21 +00003033 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003034 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003035 SKIP(3);
3036 while (CUR != ';') {
3037 if ((CUR >= '0') && (CUR <= '9'))
3038 val = val * 16 + (CUR - '0');
3039 else if ((CUR >= 'a') && (CUR <= 'f'))
3040 val = val * 16 + (CUR - 'a') + 10;
3041 else if ((CUR >= 'A') && (CUR <= 'F'))
3042 val = val * 16 + (CUR - 'A') + 10;
3043 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003044 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3045 "htmlParseCharRef: invalid hexadecimal value\n",
3046 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003047 return(0);
3048 }
3049 NEXT;
3050 }
3051 if (CUR == ';')
3052 NEXT;
3053 } else if ((CUR == '&') && (NXT(1) == '#')) {
3054 SKIP(2);
3055 while (CUR != ';') {
3056 if ((CUR >= '0') && (CUR <= '9'))
3057 val = val * 10 + (CUR - '0');
3058 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003059 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3060 "htmlParseCharRef: invalid decimal value\n",
3061 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003062 return(0);
3063 }
3064 NEXT;
3065 }
3066 if (CUR == ';')
3067 NEXT;
3068 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003069 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3070 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003071 }
3072 /*
3073 * Check the value IS_CHAR ...
3074 */
3075 if (IS_CHAR(val)) {
3076 return(val);
3077 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003078 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3079 "htmlParseCharRef: invalid xmlChar value %d\n",
3080 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003081 }
3082 return(0);
3083}
3084
3085
3086/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003087 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003088 * @ctxt: an HTML parser context
3089 *
3090 * parse a DOCTYPE declaration
3091 *
3092 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3093 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3094 */
3095
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003096static void
Owen Taylor3473f882001-02-23 17:55:21 +00003097htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003098 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003099 xmlChar *ExternalID = NULL;
3100 xmlChar *URI = NULL;
3101
3102 /*
3103 * We know that '<!DOCTYPE' has been detected.
3104 */
3105 SKIP(9);
3106
3107 SKIP_BLANKS;
3108
3109 /*
3110 * Parse the DOCTYPE name.
3111 */
3112 name = htmlParseName(ctxt);
3113 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003114 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3115 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3116 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003117 }
3118 /*
3119 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3120 */
3121
3122 SKIP_BLANKS;
3123
3124 /*
3125 * Check for SystemID and ExternalID
3126 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003127 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003128 SKIP_BLANKS;
3129
3130 /*
3131 * We should be at the end of the DOCTYPE declaration.
3132 */
3133 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003134 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3135 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003136 /* We shouldn't try to resynchronize ... */
3137 }
3138 NEXT;
3139
3140 /*
3141 * Create or update the document accordingly to the DOCTYPE
3142 */
3143 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3144 (!ctxt->disableSAX))
3145 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3146
3147 /*
3148 * Cleanup, since we don't use all those identifiers
3149 */
3150 if (URI != NULL) xmlFree(URI);
3151 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003152}
3153
3154/**
3155 * htmlParseAttribute:
3156 * @ctxt: an HTML parser context
3157 * @value: a xmlChar ** used to store the value of the attribute
3158 *
3159 * parse an attribute
3160 *
3161 * [41] Attribute ::= Name Eq AttValue
3162 *
3163 * [25] Eq ::= S? '=' S?
3164 *
3165 * With namespace:
3166 *
3167 * [NS 11] Attribute ::= QName Eq AttValue
3168 *
3169 * Also the case QName == xmlns:??? is handled independently as a namespace
3170 * definition.
3171 *
3172 * Returns the attribute name, and the value in *value.
3173 */
3174
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003175static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003176htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003177 const xmlChar *name;
3178 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003179
3180 *value = NULL;
3181 name = htmlParseHTMLName(ctxt);
3182 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003183 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3184 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003185 return(NULL);
3186 }
3187
3188 /*
3189 * read the value
3190 */
3191 SKIP_BLANKS;
3192 if (CUR == '=') {
3193 NEXT;
3194 SKIP_BLANKS;
3195 val = htmlParseAttValue(ctxt);
3196 /******
3197 } else {
3198 * TODO : some attribute must have values, some may not
3199 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3200 ctxt->sax->warning(ctxt->userData,
3201 "No value for attribute %s\n", name); */
3202 }
3203
3204 *value = val;
3205 return(name);
3206}
3207
3208/**
3209 * htmlCheckEncoding:
3210 * @ctxt: an HTML parser context
3211 * @attvalue: the attribute value
3212 *
3213 * Checks an http-equiv attribute from a Meta tag to detect
3214 * the encoding
3215 * If a new encoding is detected the parser is switched to decode
3216 * it and pass UTF8
3217 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003218static void
Owen Taylor3473f882001-02-23 17:55:21 +00003219htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3220 const xmlChar *encoding;
3221
3222 if ((ctxt == NULL) || (attvalue == NULL))
3223 return;
3224
3225 /* do not change encoding */
3226 if (ctxt->input->encoding != NULL)
3227 return;
3228
3229 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3230 if (encoding != NULL) {
3231 encoding += 8;
3232 } else {
3233 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3234 if (encoding != NULL)
3235 encoding += 9;
3236 }
3237 if (encoding != NULL) {
3238 xmlCharEncoding enc;
3239 xmlCharEncodingHandlerPtr handler;
3240
3241 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3242
3243 if (ctxt->input->encoding != NULL)
3244 xmlFree((xmlChar *) ctxt->input->encoding);
3245 ctxt->input->encoding = xmlStrdup(encoding);
3246
3247 enc = xmlParseCharEncoding((const char *) encoding);
3248 /*
3249 * registered set of known encodings
3250 */
3251 if (enc != XML_CHAR_ENCODING_ERROR) {
3252 xmlSwitchEncoding(ctxt, enc);
3253 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3254 } else {
3255 /*
3256 * fallback for unknown encodings
3257 */
3258 handler = xmlFindCharEncodingHandler((const char *) encoding);
3259 if (handler != NULL) {
3260 xmlSwitchToEncoding(ctxt, handler);
3261 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3262 } else {
3263 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3264 }
3265 }
3266
3267 if ((ctxt->input->buf != NULL) &&
3268 (ctxt->input->buf->encoder != NULL) &&
3269 (ctxt->input->buf->raw != NULL) &&
3270 (ctxt->input->buf->buffer != NULL)) {
3271 int nbchars;
3272 int processed;
3273
3274 /*
3275 * convert as much as possible to the parser reading buffer.
3276 */
3277 processed = ctxt->input->cur - ctxt->input->base;
3278 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3279 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3280 ctxt->input->buf->buffer,
3281 ctxt->input->buf->raw);
3282 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003283 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3284 "htmlCheckEncoding: encoder error\n",
3285 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003286 }
3287 ctxt->input->base =
3288 ctxt->input->cur = ctxt->input->buf->buffer->content;
3289 }
3290 }
3291}
3292
3293/**
3294 * htmlCheckMeta:
3295 * @ctxt: an HTML parser context
3296 * @atts: the attributes values
3297 *
3298 * Checks an attributes from a Meta tag
3299 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003300static void
Owen Taylor3473f882001-02-23 17:55:21 +00003301htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3302 int i;
3303 const xmlChar *att, *value;
3304 int http = 0;
3305 const xmlChar *content = NULL;
3306
3307 if ((ctxt == NULL) || (atts == NULL))
3308 return;
3309
3310 i = 0;
3311 att = atts[i++];
3312 while (att != NULL) {
3313 value = atts[i++];
3314 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3315 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3316 http = 1;
3317 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3318 content = value;
3319 att = atts[i++];
3320 }
3321 if ((http) && (content != NULL))
3322 htmlCheckEncoding(ctxt, content);
3323
3324}
3325
3326/**
3327 * htmlParseStartTag:
3328 * @ctxt: an HTML parser context
3329 *
3330 * parse a start of tag either for rule element or
3331 * EmptyElement. In both case we don't parse the tag closing chars.
3332 *
3333 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3334 *
3335 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3336 *
3337 * With namespace:
3338 *
3339 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3340 *
3341 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3342 *
3343 */
3344
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003345static void
Owen Taylor3473f882001-02-23 17:55:21 +00003346htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003347 const xmlChar *name;
3348 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003349 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003350 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003351 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003352 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003353 int meta = 0;
3354 int i;
3355
Daniel Veillarda03e3652004-11-02 18:45:30 +00003356 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3357 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3358 "htmlParseStartTag: context error\n", NULL, NULL);
3359 return;
3360 }
Owen Taylor3473f882001-02-23 17:55:21 +00003361 if (CUR != '<') return;
3362 NEXT;
3363
3364 GROW;
3365 name = htmlParseHTMLName(ctxt);
3366 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003367 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3368 "htmlParseStartTag: invalid element name\n",
3369 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003370 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003371 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003372 NEXT;
3373 return;
3374 }
3375 if (xmlStrEqual(name, BAD_CAST"meta"))
3376 meta = 1;
3377
3378 /*
3379 * Check for auto-closure of HTML elements.
3380 */
3381 htmlAutoClose(ctxt, name);
3382
3383 /*
3384 * Check for implied HTML elements.
3385 */
3386 htmlCheckImplied(ctxt, name);
3387
3388 /*
3389 * Avoid html at any level > 0, head at any level != 1
3390 * or any attempt to recurse body
3391 */
3392 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003393 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3394 "htmlParseStartTag: misplaced <html> tag\n",
3395 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003396 return;
3397 }
3398 if ((ctxt->nameNr != 1) &&
3399 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003400 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3401 "htmlParseStartTag: misplaced <head> tag\n",
3402 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003403 return;
3404 }
3405 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003406 int indx;
3407 for (indx = 0;indx < ctxt->nameNr;indx++) {
3408 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003409 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3410 "htmlParseStartTag: misplaced <body> tag\n",
3411 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003412 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3413 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003414 return;
3415 }
3416 }
3417 }
3418
3419 /*
3420 * Now parse the attributes, it ends up with the ending
3421 *
3422 * (S Attribute)* S?
3423 */
3424 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003425 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003426 (CUR != '>') &&
3427 ((CUR != '/') || (NXT(1) != '>'))) {
3428 long cons = ctxt->nbChars;
3429
3430 GROW;
3431 attname = htmlParseAttribute(ctxt, &attvalue);
3432 if (attname != NULL) {
3433
3434 /*
3435 * Well formedness requires at most one declaration of an attribute
3436 */
3437 for (i = 0; i < nbatts;i += 2) {
3438 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003439 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3440 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003441 if (attvalue != NULL)
3442 xmlFree(attvalue);
3443 goto failed;
3444 }
3445 }
3446
3447 /*
3448 * Add the pair to atts
3449 */
3450 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003451 maxatts = 22; /* allow for 10 attrs by default */
3452 atts = (const xmlChar **)
3453 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003454 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003455 htmlErrMemory(ctxt, NULL);
3456 if (attvalue != NULL)
3457 xmlFree(attvalue);
3458 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003459 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003460 ctxt->atts = atts;
3461 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003462 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003463 const xmlChar **n;
3464
Owen Taylor3473f882001-02-23 17:55:21 +00003465 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003466 n = (const xmlChar **) xmlRealloc((void *) atts,
3467 maxatts * sizeof(const xmlChar *));
3468 if (n == NULL) {
3469 htmlErrMemory(ctxt, NULL);
3470 if (attvalue != NULL)
3471 xmlFree(attvalue);
3472 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003473 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003474 atts = n;
3475 ctxt->atts = atts;
3476 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003477 }
3478 atts[nbatts++] = attname;
3479 atts[nbatts++] = attvalue;
3480 atts[nbatts] = NULL;
3481 atts[nbatts + 1] = NULL;
3482 }
3483 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003484 if (attvalue != NULL)
3485 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003486 /* Dump the bogus attribute string up to the next blank or
3487 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003488 while ((IS_CHAR_CH(CUR)) &&
3489 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003490 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003491 NEXT;
3492 }
3493
3494failed:
3495 SKIP_BLANKS;
3496 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003497 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3498 "htmlParseStartTag: problem parsing attributes\n",
3499 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003500 break;
3501 }
3502 }
3503
3504 /*
3505 * Handle specific association to the META tag
3506 */
3507 if (meta)
3508 htmlCheckMeta(ctxt, atts);
3509
3510 /*
3511 * SAX: Start of Element !
3512 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003513 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003514 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3515 if (nbatts != 0)
3516 ctxt->sax->startElement(ctxt->userData, name, atts);
3517 else
3518 ctxt->sax->startElement(ctxt->userData, name, NULL);
3519 }
Owen Taylor3473f882001-02-23 17:55:21 +00003520
3521 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003522 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003523 if (atts[i] != NULL)
3524 xmlFree((xmlChar *) atts[i]);
3525 }
Owen Taylor3473f882001-02-23 17:55:21 +00003526 }
Owen Taylor3473f882001-02-23 17:55:21 +00003527}
3528
3529/**
3530 * htmlParseEndTag:
3531 * @ctxt: an HTML parser context
3532 *
3533 * parse an end of tag
3534 *
3535 * [42] ETag ::= '</' Name S? '>'
3536 *
3537 * With namespace
3538 *
3539 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003540 *
3541 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003542 */
3543
Daniel Veillardf420ac52001-07-04 16:04:09 +00003544static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003545htmlParseEndTag(htmlParserCtxtPtr ctxt)
3546{
3547 const xmlChar *name;
3548 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003549 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003550
3551 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003552 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3553 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003554 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003555 }
3556 SKIP(2);
3557
3558 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003559 if (name == NULL)
3560 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003561
3562 /*
3563 * We should definitely be at the ending "S? '>'" part
3564 */
3565 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003566 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003567 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3568 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003569 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003570 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003571
3572 /*
3573 * If the name read is not one of the element in the parsing stack
3574 * then return, it's just an error.
3575 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003576 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3577 if (xmlStrEqual(name, ctxt->nameTab[i]))
3578 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003579 }
3580 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003581 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3582 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003583 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003584 }
3585
3586
3587 /*
3588 * Check for auto-closure of HTML elements.
3589 */
3590
3591 htmlAutoCloseOnClose(ctxt, name);
3592
3593 /*
3594 * Well formedness constraints, opening and closing must match.
3595 * With the exception that the autoclose may have popped stuff out
3596 * of the stack.
3597 */
3598 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003599 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003600 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3601 "Opening and ending tag mismatch: %s and %s\n",
3602 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003603 }
3604 }
3605
3606 /*
3607 * SAX: End of Tag
3608 */
3609 oldname = ctxt->name;
3610 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003611 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3612 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003613 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003614 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003615 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003616 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003617 }
3618
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003619 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003620}
3621
3622
3623/**
3624 * htmlParseReference:
3625 * @ctxt: an HTML parser context
3626 *
3627 * parse and handle entity references in content,
3628 * this will end-up in a call to character() since this is either a
3629 * CharRef, or a predefined entity.
3630 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003631static void
Owen Taylor3473f882001-02-23 17:55:21 +00003632htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003633 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003634 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003635 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003636 if (CUR != '&') return;
3637
3638 if (NXT(1) == '#') {
3639 unsigned int c;
3640 int bits, i = 0;
3641
3642 c = htmlParseCharRef(ctxt);
3643 if (c == 0)
3644 return;
3645
3646 if (c < 0x80) { out[i++]= c; bits= -6; }
3647 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3648 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3649 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3650
3651 for ( ; bits >= 0; bits-= 6) {
3652 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3653 }
3654 out[i] = 0;
3655
3656 htmlCheckParagraph(ctxt);
3657 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3658 ctxt->sax->characters(ctxt->userData, out, i);
3659 } else {
3660 ent = htmlParseEntityRef(ctxt, &name);
3661 if (name == NULL) {
3662 htmlCheckParagraph(ctxt);
3663 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3664 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3665 return;
3666 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003667 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003668 htmlCheckParagraph(ctxt);
3669 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3670 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3671 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3672 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3673 }
3674 } else {
3675 unsigned int c;
3676 int bits, i = 0;
3677
3678 c = ent->value;
3679 if (c < 0x80)
3680 { out[i++]= c; bits= -6; }
3681 else if (c < 0x800)
3682 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3683 else if (c < 0x10000)
3684 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3685 else
3686 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3687
3688 for ( ; bits >= 0; bits-= 6) {
3689 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3690 }
3691 out[i] = 0;
3692
3693 htmlCheckParagraph(ctxt);
3694 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3695 ctxt->sax->characters(ctxt->userData, out, i);
3696 }
Owen Taylor3473f882001-02-23 17:55:21 +00003697 }
3698}
3699
3700/**
3701 * htmlParseContent:
3702 * @ctxt: an HTML parser context
3703 * @name: the node name
3704 *
3705 * Parse a content: comment, sub-element, reference or text.
3706 *
3707 */
3708
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003709static void
Owen Taylor3473f882001-02-23 17:55:21 +00003710htmlParseContent(htmlParserCtxtPtr ctxt) {
3711 xmlChar *currentNode;
3712 int depth;
3713
3714 currentNode = xmlStrdup(ctxt->name);
3715 depth = ctxt->nameNr;
3716 while (1) {
3717 long cons = ctxt->nbChars;
3718
3719 GROW;
3720 /*
3721 * Our tag or one of it's parent or children is ending.
3722 */
3723 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003724 if (htmlParseEndTag(ctxt) &&
3725 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3726 if (currentNode != NULL)
3727 xmlFree(currentNode);
3728 return;
3729 }
3730 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003731 }
3732
3733 /*
3734 * Has this node been popped out during parsing of
3735 * the next element
3736 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003737 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3738 (!xmlStrEqual(currentNode, ctxt->name)))
3739 {
Owen Taylor3473f882001-02-23 17:55:21 +00003740 if (currentNode != NULL) xmlFree(currentNode);
3741 return;
3742 }
3743
Daniel Veillardf9533d12001-03-03 10:04:57 +00003744 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3745 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003746 /*
3747 * Handle SCRIPT/STYLE separately
3748 */
3749 htmlParseScript(ctxt);
3750 } else {
3751 /*
3752 * Sometimes DOCTYPE arrives in the middle of the document
3753 */
3754 if ((CUR == '<') && (NXT(1) == '!') &&
3755 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3756 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3757 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3758 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003759 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3760 "Misplaced DOCTYPE declaration\n",
3761 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003762 htmlParseDocTypeDecl(ctxt);
3763 }
3764
3765 /*
3766 * First case : a comment
3767 */
3768 if ((CUR == '<') && (NXT(1) == '!') &&
3769 (NXT(2) == '-') && (NXT(3) == '-')) {
3770 htmlParseComment(ctxt);
3771 }
3772
3773 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003774 * Second case : a Processing Instruction.
3775 */
3776 else if ((CUR == '<') && (NXT(1) == '?')) {
3777 htmlParsePI(ctxt);
3778 }
3779
3780 /*
3781 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003782 */
3783 else if (CUR == '<') {
3784 htmlParseElement(ctxt);
3785 }
3786
3787 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003788 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003789 * parsing returns it's Name, create the node
3790 */
3791 else if (CUR == '&') {
3792 htmlParseReference(ctxt);
3793 }
3794
3795 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003796 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003797 */
3798 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003799 htmlAutoCloseOnEnd(ctxt);
3800 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003801 }
3802
3803 /*
3804 * Last case, text. Note that References are handled directly.
3805 */
3806 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003807 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003808 }
3809
3810 if (cons == ctxt->nbChars) {
3811 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003812 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3813 "detected an error in element content\n",
3814 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003815 }
3816 break;
3817 }
3818 }
3819 GROW;
3820 }
3821 if (currentNode != NULL) xmlFree(currentNode);
3822}
3823
3824/**
3825 * htmlParseElement:
3826 * @ctxt: an HTML parser context
3827 *
3828 * parse an HTML element, this is highly recursive
3829 *
3830 * [39] element ::= EmptyElemTag | STag content ETag
3831 *
3832 * [41] Attribute ::= Name Eq AttValue
3833 */
3834
3835void
3836htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003837 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003838 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003839 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003840 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003841 const xmlChar *oldname;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003842 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003843 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003844
Daniel Veillarda03e3652004-11-02 18:45:30 +00003845 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3846 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3847 "htmlParseStartTag: context error\n", NULL, NULL);
3848 return;
3849 }
3850 depth = ctxt->nameNr;
Owen Taylor3473f882001-02-23 17:55:21 +00003851 /* Capture start position */
3852 if (ctxt->record_info) {
3853 node_info.begin_pos = ctxt->input->consumed +
3854 (CUR_PTR - ctxt->input->base);
3855 node_info.begin_line = ctxt->input->line;
3856 }
3857
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003858 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003859 htmlParseStartTag(ctxt);
3860 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003861 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3862 (name == NULL)) {
3863 if (CUR == '>')
3864 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003865 return;
3866 }
Owen Taylor3473f882001-02-23 17:55:21 +00003867
3868 /*
3869 * Lookup the info for that element.
3870 */
3871 info = htmlTagLookup(name);
3872 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003873 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3874 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003875 }
3876
3877 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003878 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003879 */
3880 if ((CUR == '/') && (NXT(1) == '>')) {
3881 SKIP(2);
3882 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3883 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003884 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003885 return;
3886 }
3887
3888 if (CUR == '>') {
3889 NEXT;
3890 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003891 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3892 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003893
3894 /*
3895 * end of parsing of this node.
3896 */
3897 if (xmlStrEqual(name, ctxt->name)) {
3898 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003899 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003900 }
3901
3902 /*
3903 * Capture end position and add node
3904 */
3905 if ( currentNode != NULL && ctxt->record_info ) {
3906 node_info.end_pos = ctxt->input->consumed +
3907 (CUR_PTR - ctxt->input->base);
3908 node_info.end_line = ctxt->input->line;
3909 node_info.node = ctxt->node;
3910 xmlParserAddNodeInfo(ctxt, &node_info);
3911 }
3912 return;
3913 }
3914
3915 /*
3916 * Check for an Empty Element from DTD definition
3917 */
3918 if ((info != NULL) && (info->empty)) {
3919 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3920 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003921 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003922 return;
3923 }
3924
3925 /*
3926 * Parse the content of the element:
3927 */
3928 currentNode = xmlStrdup(ctxt->name);
3929 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003930 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003931 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003932 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003933 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003934 if (ctxt->nameNr < depth) break;
3935 }
3936
Owen Taylor3473f882001-02-23 17:55:21 +00003937 /*
3938 * Capture end position and add node
3939 */
3940 if ( currentNode != NULL && ctxt->record_info ) {
3941 node_info.end_pos = ctxt->input->consumed +
3942 (CUR_PTR - ctxt->input->base);
3943 node_info.end_line = ctxt->input->line;
3944 node_info.node = ctxt->node;
3945 xmlParserAddNodeInfo(ctxt, &node_info);
3946 }
William M. Brack76e95df2003-10-18 16:20:14 +00003947 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003948 htmlAutoCloseOnEnd(ctxt);
3949 }
3950
Owen Taylor3473f882001-02-23 17:55:21 +00003951 if (currentNode != NULL)
3952 xmlFree(currentNode);
3953}
3954
3955/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003956 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003957 * @ctxt: an HTML parser context
3958 *
3959 * parse an HTML document (and build a tree if using the standard SAX
3960 * interface).
3961 *
3962 * Returns 0, -1 in case of error. the parser context is augmented
3963 * as a result of the parsing.
3964 */
3965
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003966int
Owen Taylor3473f882001-02-23 17:55:21 +00003967htmlParseDocument(htmlParserCtxtPtr ctxt) {
3968 xmlDtdPtr dtd;
3969
Daniel Veillardd0463562001-10-13 09:15:48 +00003970 xmlInitParser();
3971
Owen Taylor3473f882001-02-23 17:55:21 +00003972 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00003973
Daniel Veillarda03e3652004-11-02 18:45:30 +00003974 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3975 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3976 "htmlParseDocument: context error\n", NULL, NULL);
3977 return(XML_ERR_INTERNAL_ERROR);
3978 }
3979 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00003980 GROW;
3981 /*
3982 * SAX: beginning of the document processing.
3983 */
3984 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3985 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3986
3987 /*
3988 * Wipe out everything which is before the first '<'
3989 */
3990 SKIP_BLANKS;
3991 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003992 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3993 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003994 }
3995
3996 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3997 ctxt->sax->startDocument(ctxt->userData);
3998
3999
4000 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004001 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004002 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004003 while (((CUR == '<') && (NXT(1) == '!') &&
4004 (NXT(2) == '-') && (NXT(3) == '-')) ||
4005 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004006 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004007 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004008 SKIP_BLANKS;
4009 }
4010
4011
4012 /*
4013 * Then possibly doc type declaration(s) and more Misc
4014 * (doctypedecl Misc*)?
4015 */
4016 if ((CUR == '<') && (NXT(1) == '!') &&
4017 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4018 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4019 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4020 (UPP(8) == 'E')) {
4021 htmlParseDocTypeDecl(ctxt);
4022 }
4023 SKIP_BLANKS;
4024
4025 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004026 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004027 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004028 while (((CUR == '<') && (NXT(1) == '!') &&
4029 (NXT(2) == '-') && (NXT(3) == '-')) ||
4030 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004031 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004032 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004033 SKIP_BLANKS;
4034 }
4035
4036 /*
4037 * Time to start parsing the tree itself
4038 */
4039 htmlParseContent(ctxt);
4040
4041 /*
4042 * autoclose
4043 */
4044 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004045 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004046
4047
4048 /*
4049 * SAX: end of the document processing.
4050 */
4051 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4052 ctxt->sax->endDocument(ctxt->userData);
4053
4054 if (ctxt->myDoc != NULL) {
4055 dtd = xmlGetIntSubset(ctxt->myDoc);
4056 if (dtd == NULL)
4057 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004058 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004059 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4060 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4061 }
4062 if (! ctxt->wellFormed) return(-1);
4063 return(0);
4064}
4065
4066
4067/************************************************************************
4068 * *
4069 * Parser contexts handling *
4070 * *
4071 ************************************************************************/
4072
4073/**
William M. Brackedb65a72004-02-06 07:36:04 +00004074 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004075 * @ctxt: an HTML parser context
4076 *
4077 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004078 *
4079 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004080 */
4081
Daniel Veillardf403d292003-10-05 13:51:35 +00004082static int
Owen Taylor3473f882001-02-23 17:55:21 +00004083htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4084{
4085 htmlSAXHandler *sax;
4086
Daniel Veillardf403d292003-10-05 13:51:35 +00004087 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004088 memset(ctxt, 0, sizeof(htmlParserCtxt));
4089
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004090 ctxt->dict = xmlDictCreate();
4091 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004092 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4093 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004094 }
Owen Taylor3473f882001-02-23 17:55:21 +00004095 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4096 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004097 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4098 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004099 }
4100 else
4101 memset(sax, 0, sizeof(htmlSAXHandler));
4102
4103 /* Allocate the Input stack */
4104 ctxt->inputTab = (htmlParserInputPtr *)
4105 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4106 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004107 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004108 ctxt->inputNr = 0;
4109 ctxt->inputMax = 0;
4110 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004111 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004112 }
4113 ctxt->inputNr = 0;
4114 ctxt->inputMax = 5;
4115 ctxt->input = NULL;
4116 ctxt->version = NULL;
4117 ctxt->encoding = NULL;
4118 ctxt->standalone = -1;
4119 ctxt->instate = XML_PARSER_START;
4120
4121 /* Allocate the Node stack */
4122 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4123 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004124 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004125 ctxt->nodeNr = 0;
4126 ctxt->nodeMax = 0;
4127 ctxt->node = NULL;
4128 ctxt->inputNr = 0;
4129 ctxt->inputMax = 0;
4130 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004131 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004132 }
4133 ctxt->nodeNr = 0;
4134 ctxt->nodeMax = 10;
4135 ctxt->node = NULL;
4136
4137 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004138 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004139 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004140 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004141 ctxt->nameNr = 0;
4142 ctxt->nameMax = 10;
4143 ctxt->name = NULL;
4144 ctxt->nodeNr = 0;
4145 ctxt->nodeMax = 0;
4146 ctxt->node = NULL;
4147 ctxt->inputNr = 0;
4148 ctxt->inputMax = 0;
4149 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004150 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004151 }
4152 ctxt->nameNr = 0;
4153 ctxt->nameMax = 10;
4154 ctxt->name = NULL;
4155
Daniel Veillard092643b2003-09-25 14:29:29 +00004156 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004157 else {
4158 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004159 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004160 }
4161 ctxt->userData = ctxt;
4162 ctxt->myDoc = NULL;
4163 ctxt->wellFormed = 1;
4164 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004165 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004166 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004167 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004168 ctxt->vctxt.userData = ctxt;
4169 ctxt->vctxt.error = xmlParserValidityError;
4170 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004171 ctxt->record_info = 0;
4172 ctxt->validate = 0;
4173 ctxt->nbChars = 0;
4174 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004175 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004176 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004177 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004178}
4179
4180/**
4181 * htmlFreeParserCtxt:
4182 * @ctxt: an HTML parser context
4183 *
4184 * Free all the memory used by a parser context. However the parsed
4185 * document in ctxt->myDoc is not freed.
4186 */
4187
4188void
4189htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4190{
4191 xmlFreeParserCtxt(ctxt);
4192}
4193
4194/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004195 * htmlNewParserCtxt:
4196 *
4197 * Allocate and initialize a new parser context.
4198 *
4199 * Returns the xmlParserCtxtPtr or NULL
4200 */
4201
4202static htmlParserCtxtPtr
4203htmlNewParserCtxt(void)
4204{
4205 xmlParserCtxtPtr ctxt;
4206
4207 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4208 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004209 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004210 return(NULL);
4211 }
4212 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004213 if (htmlInitParserCtxt(ctxt) < 0) {
4214 htmlFreeParserCtxt(ctxt);
4215 return(NULL);
4216 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004217 return(ctxt);
4218}
4219
4220/**
4221 * htmlCreateMemoryParserCtxt:
4222 * @buffer: a pointer to a char array
4223 * @size: the size of the array
4224 *
4225 * Create a parser context for an HTML in-memory document.
4226 *
4227 * Returns the new parser context or NULL
4228 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004229htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004230htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4231 xmlParserCtxtPtr ctxt;
4232 xmlParserInputPtr input;
4233 xmlParserInputBufferPtr buf;
4234
4235 if (buffer == NULL)
4236 return(NULL);
4237 if (size <= 0)
4238 return(NULL);
4239
4240 ctxt = htmlNewParserCtxt();
4241 if (ctxt == NULL)
4242 return(NULL);
4243
4244 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4245 if (buf == NULL) return(NULL);
4246
4247 input = xmlNewInputStream(ctxt);
4248 if (input == NULL) {
4249 xmlFreeParserCtxt(ctxt);
4250 return(NULL);
4251 }
4252
4253 input->filename = NULL;
4254 input->buf = buf;
4255 input->base = input->buf->buffer->content;
4256 input->cur = input->buf->buffer->content;
4257 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4258
4259 inputPush(ctxt, input);
4260 return(ctxt);
4261}
4262
4263/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004264 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004265 * @cur: a pointer to an array of xmlChar
4266 * @encoding: a free form C string describing the HTML document encoding, or NULL
4267 *
4268 * Create a parser context for an HTML document.
4269 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004270 * TODO: check the need to add encoding handling there
4271 *
Owen Taylor3473f882001-02-23 17:55:21 +00004272 * Returns the new parser context or NULL
4273 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004274static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004275htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004276 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004277 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004278
Daniel Veillard1d995272002-07-22 16:43:32 +00004279 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004280 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004281 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004282 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4283
4284 if (encoding != NULL) {
4285 xmlCharEncoding enc;
4286 xmlCharEncodingHandlerPtr handler;
4287
4288 if (ctxt->input->encoding != NULL)
4289 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004290 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004291
4292 enc = xmlParseCharEncoding(encoding);
4293 /*
4294 * registered set of known encodings
4295 */
4296 if (enc != XML_CHAR_ENCODING_ERROR) {
4297 xmlSwitchEncoding(ctxt, enc);
4298 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004299 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4300 "Unsupported encoding %s\n",
4301 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004302 }
4303 } else {
4304 /*
4305 * fallback for unknown encodings
4306 */
4307 handler = xmlFindCharEncodingHandler((const char *) encoding);
4308 if (handler != NULL) {
4309 xmlSwitchToEncoding(ctxt, handler);
4310 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004311 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4312 "Unsupported encoding %s\n",
4313 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004314 }
4315 }
4316 }
4317 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004318}
4319
Daniel Veillard73b013f2003-09-30 12:36:01 +00004320#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004321/************************************************************************
4322 * *
4323 * Progressive parsing interfaces *
4324 * *
4325 ************************************************************************/
4326
4327/**
4328 * htmlParseLookupSequence:
4329 * @ctxt: an HTML parser context
4330 * @first: the first char to lookup
4331 * @next: the next char to lookup or zero
4332 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004333 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004334 *
4335 * Try to find if a sequence (first, next, third) or just (first next) or
4336 * (first) is available in the input stream.
4337 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4338 * to avoid rescanning sequences of bytes, it DOES change the state of the
4339 * parser, do not use liberally.
4340 * This is basically similar to xmlParseLookupSequence()
4341 *
4342 * Returns the index to the current parsing point if the full sequence
4343 * is available, -1 otherwise.
4344 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004345static int
Owen Taylor3473f882001-02-23 17:55:21 +00004346htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004347 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004348 int base, len;
4349 htmlParserInputPtr in;
4350 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004351 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004352
4353 in = ctxt->input;
4354 if (in == NULL) return(-1);
4355 base = in->cur - in->base;
4356 if (base < 0) return(-1);
4357 if (ctxt->checkIndex > base)
4358 base = ctxt->checkIndex;
4359 if (in->buf == NULL) {
4360 buf = in->base;
4361 len = in->length;
4362 } else {
4363 buf = in->buf->buffer->content;
4364 len = in->buf->buffer->use;
4365 }
4366 /* take into account the sequence length */
4367 if (third) len -= 2;
4368 else if (next) len --;
4369 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004370 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004371 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4372 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4373 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004374 /* do not increment past <! - some people use <!--> */
4375 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004376 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004377 }
4378 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004379 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004380 return(-1);
4381 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4382 (buf[base + 2] == '>')) {
4383 incomment = 0;
4384 base += 2;
4385 }
4386 continue;
4387 }
Owen Taylor3473f882001-02-23 17:55:21 +00004388 if (buf[base] == first) {
4389 if (third != 0) {
4390 if ((buf[base + 1] != next) ||
4391 (buf[base + 2] != third)) continue;
4392 } else if (next != 0) {
4393 if (buf[base + 1] != next) continue;
4394 }
4395 ctxt->checkIndex = 0;
4396#ifdef DEBUG_PUSH
4397 if (next == 0)
4398 xmlGenericError(xmlGenericErrorContext,
4399 "HPP: lookup '%c' found at %d\n",
4400 first, base);
4401 else if (third == 0)
4402 xmlGenericError(xmlGenericErrorContext,
4403 "HPP: lookup '%c%c' found at %d\n",
4404 first, next, base);
4405 else
4406 xmlGenericError(xmlGenericErrorContext,
4407 "HPP: lookup '%c%c%c' found at %d\n",
4408 first, next, third, base);
4409#endif
4410 return(base - (in->cur - in->base));
4411 }
4412 }
4413 ctxt->checkIndex = base;
4414#ifdef DEBUG_PUSH
4415 if (next == 0)
4416 xmlGenericError(xmlGenericErrorContext,
4417 "HPP: lookup '%c' failed\n", first);
4418 else if (third == 0)
4419 xmlGenericError(xmlGenericErrorContext,
4420 "HPP: lookup '%c%c' failed\n", first, next);
4421 else
4422 xmlGenericError(xmlGenericErrorContext,
4423 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4424#endif
4425 return(-1);
4426}
4427
4428/**
4429 * htmlParseTryOrFinish:
4430 * @ctxt: an HTML parser context
4431 * @terminate: last chunk indicator
4432 *
4433 * Try to progress on parsing
4434 *
4435 * Returns zero if no parsing was possible
4436 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004437static int
Owen Taylor3473f882001-02-23 17:55:21 +00004438htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4439 int ret = 0;
4440 htmlParserInputPtr in;
4441 int avail = 0;
4442 xmlChar cur, next;
4443
4444#ifdef DEBUG_PUSH
4445 switch (ctxt->instate) {
4446 case XML_PARSER_EOF:
4447 xmlGenericError(xmlGenericErrorContext,
4448 "HPP: try EOF\n"); break;
4449 case XML_PARSER_START:
4450 xmlGenericError(xmlGenericErrorContext,
4451 "HPP: try START\n"); break;
4452 case XML_PARSER_MISC:
4453 xmlGenericError(xmlGenericErrorContext,
4454 "HPP: try MISC\n");break;
4455 case XML_PARSER_COMMENT:
4456 xmlGenericError(xmlGenericErrorContext,
4457 "HPP: try COMMENT\n");break;
4458 case XML_PARSER_PROLOG:
4459 xmlGenericError(xmlGenericErrorContext,
4460 "HPP: try PROLOG\n");break;
4461 case XML_PARSER_START_TAG:
4462 xmlGenericError(xmlGenericErrorContext,
4463 "HPP: try START_TAG\n");break;
4464 case XML_PARSER_CONTENT:
4465 xmlGenericError(xmlGenericErrorContext,
4466 "HPP: try CONTENT\n");break;
4467 case XML_PARSER_CDATA_SECTION:
4468 xmlGenericError(xmlGenericErrorContext,
4469 "HPP: try CDATA_SECTION\n");break;
4470 case XML_PARSER_END_TAG:
4471 xmlGenericError(xmlGenericErrorContext,
4472 "HPP: try END_TAG\n");break;
4473 case XML_PARSER_ENTITY_DECL:
4474 xmlGenericError(xmlGenericErrorContext,
4475 "HPP: try ENTITY_DECL\n");break;
4476 case XML_PARSER_ENTITY_VALUE:
4477 xmlGenericError(xmlGenericErrorContext,
4478 "HPP: try ENTITY_VALUE\n");break;
4479 case XML_PARSER_ATTRIBUTE_VALUE:
4480 xmlGenericError(xmlGenericErrorContext,
4481 "HPP: try ATTRIBUTE_VALUE\n");break;
4482 case XML_PARSER_DTD:
4483 xmlGenericError(xmlGenericErrorContext,
4484 "HPP: try DTD\n");break;
4485 case XML_PARSER_EPILOG:
4486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: try EPILOG\n");break;
4488 case XML_PARSER_PI:
4489 xmlGenericError(xmlGenericErrorContext,
4490 "HPP: try PI\n");break;
4491 case XML_PARSER_SYSTEM_LITERAL:
4492 xmlGenericError(xmlGenericErrorContext,
4493 "HPP: try SYSTEM_LITERAL\n");break;
4494 }
4495#endif
4496
4497 while (1) {
4498
4499 in = ctxt->input;
4500 if (in == NULL) break;
4501 if (in->buf == NULL)
4502 avail = in->length - (in->cur - in->base);
4503 else
4504 avail = in->buf->buffer->use - (in->cur - in->base);
4505 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004506 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004507 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4508 /*
4509 * SAX: end of the document processing.
4510 */
4511 ctxt->instate = XML_PARSER_EOF;
4512 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4513 ctxt->sax->endDocument(ctxt->userData);
4514 }
4515 }
4516 if (avail < 1)
4517 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004518 cur = in->cur[0];
4519 if (cur == 0) {
4520 SKIP(1);
4521 continue;
4522 }
4523
Owen Taylor3473f882001-02-23 17:55:21 +00004524 switch (ctxt->instate) {
4525 case XML_PARSER_EOF:
4526 /*
4527 * Document parsing is done !
4528 */
4529 goto done;
4530 case XML_PARSER_START:
4531 /*
4532 * Very first chars read from the document flow.
4533 */
4534 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004535 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004536 SKIP_BLANKS;
4537 if (in->buf == NULL)
4538 avail = in->length - (in->cur - in->base);
4539 else
4540 avail = in->buf->buffer->use - (in->cur - in->base);
4541 }
4542 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4543 ctxt->sax->setDocumentLocator(ctxt->userData,
4544 &xmlDefaultSAXLocator);
4545 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4546 (!ctxt->disableSAX))
4547 ctxt->sax->startDocument(ctxt->userData);
4548
4549 cur = in->cur[0];
4550 next = in->cur[1];
4551 if ((cur == '<') && (next == '!') &&
4552 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4553 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4554 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4555 (UPP(8) == 'E')) {
4556 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004557 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004558 goto done;
4559#ifdef DEBUG_PUSH
4560 xmlGenericError(xmlGenericErrorContext,
4561 "HPP: Parsing internal subset\n");
4562#endif
4563 htmlParseDocTypeDecl(ctxt);
4564 ctxt->instate = XML_PARSER_PROLOG;
4565#ifdef DEBUG_PUSH
4566 xmlGenericError(xmlGenericErrorContext,
4567 "HPP: entering PROLOG\n");
4568#endif
4569 } else {
4570 ctxt->instate = XML_PARSER_MISC;
4571 }
4572#ifdef DEBUG_PUSH
4573 xmlGenericError(xmlGenericErrorContext,
4574 "HPP: entering MISC\n");
4575#endif
4576 break;
4577 case XML_PARSER_MISC:
4578 SKIP_BLANKS;
4579 if (in->buf == NULL)
4580 avail = in->length - (in->cur - in->base);
4581 else
4582 avail = in->buf->buffer->use - (in->cur - in->base);
4583 if (avail < 2)
4584 goto done;
4585 cur = in->cur[0];
4586 next = in->cur[1];
4587 if ((cur == '<') && (next == '!') &&
4588 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4589 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004590 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004591 goto done;
4592#ifdef DEBUG_PUSH
4593 xmlGenericError(xmlGenericErrorContext,
4594 "HPP: Parsing Comment\n");
4595#endif
4596 htmlParseComment(ctxt);
4597 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004598 } else if ((cur == '<') && (next == '?')) {
4599 if ((!terminate) &&
4600 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4601 goto done;
4602#ifdef DEBUG_PUSH
4603 xmlGenericError(xmlGenericErrorContext,
4604 "HPP: Parsing PI\n");
4605#endif
4606 htmlParsePI(ctxt);
4607 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004608 } else if ((cur == '<') && (next == '!') &&
4609 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4610 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4611 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4612 (UPP(8) == 'E')) {
4613 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004614 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004615 goto done;
4616#ifdef DEBUG_PUSH
4617 xmlGenericError(xmlGenericErrorContext,
4618 "HPP: Parsing internal subset\n");
4619#endif
4620 htmlParseDocTypeDecl(ctxt);
4621 ctxt->instate = XML_PARSER_PROLOG;
4622#ifdef DEBUG_PUSH
4623 xmlGenericError(xmlGenericErrorContext,
4624 "HPP: entering PROLOG\n");
4625#endif
4626 } else if ((cur == '<') && (next == '!') &&
4627 (avail < 9)) {
4628 goto done;
4629 } else {
4630 ctxt->instate = XML_PARSER_START_TAG;
4631#ifdef DEBUG_PUSH
4632 xmlGenericError(xmlGenericErrorContext,
4633 "HPP: entering START_TAG\n");
4634#endif
4635 }
4636 break;
4637 case XML_PARSER_PROLOG:
4638 SKIP_BLANKS;
4639 if (in->buf == NULL)
4640 avail = in->length - (in->cur - in->base);
4641 else
4642 avail = in->buf->buffer->use - (in->cur - in->base);
4643 if (avail < 2)
4644 goto done;
4645 cur = in->cur[0];
4646 next = in->cur[1];
4647 if ((cur == '<') && (next == '!') &&
4648 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4649 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004650 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004651 goto done;
4652#ifdef DEBUG_PUSH
4653 xmlGenericError(xmlGenericErrorContext,
4654 "HPP: Parsing Comment\n");
4655#endif
4656 htmlParseComment(ctxt);
4657 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004658 } else if ((cur == '<') && (next == '?')) {
4659 if ((!terminate) &&
4660 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4661 goto done;
4662#ifdef DEBUG_PUSH
4663 xmlGenericError(xmlGenericErrorContext,
4664 "HPP: Parsing PI\n");
4665#endif
4666 htmlParsePI(ctxt);
4667 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004668 } else if ((cur == '<') && (next == '!') &&
4669 (avail < 4)) {
4670 goto done;
4671 } else {
4672 ctxt->instate = XML_PARSER_START_TAG;
4673#ifdef DEBUG_PUSH
4674 xmlGenericError(xmlGenericErrorContext,
4675 "HPP: entering START_TAG\n");
4676#endif
4677 }
4678 break;
4679 case XML_PARSER_EPILOG:
4680 if (in->buf == NULL)
4681 avail = in->length - (in->cur - in->base);
4682 else
4683 avail = in->buf->buffer->use - (in->cur - in->base);
4684 if (avail < 1)
4685 goto done;
4686 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004687 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004688 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004689 goto done;
4690 }
4691 if (avail < 2)
4692 goto done;
4693 next = in->cur[1];
4694 if ((cur == '<') && (next == '!') &&
4695 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4696 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004697 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004698 goto done;
4699#ifdef DEBUG_PUSH
4700 xmlGenericError(xmlGenericErrorContext,
4701 "HPP: Parsing Comment\n");
4702#endif
4703 htmlParseComment(ctxt);
4704 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004705 } else if ((cur == '<') && (next == '?')) {
4706 if ((!terminate) &&
4707 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4708 goto done;
4709#ifdef DEBUG_PUSH
4710 xmlGenericError(xmlGenericErrorContext,
4711 "HPP: Parsing PI\n");
4712#endif
4713 htmlParsePI(ctxt);
4714 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004715 } else if ((cur == '<') && (next == '!') &&
4716 (avail < 4)) {
4717 goto done;
4718 } else {
4719 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004720 ctxt->wellFormed = 0;
4721 ctxt->instate = XML_PARSER_EOF;
4722#ifdef DEBUG_PUSH
4723 xmlGenericError(xmlGenericErrorContext,
4724 "HPP: entering EOF\n");
4725#endif
4726 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4727 ctxt->sax->endDocument(ctxt->userData);
4728 goto done;
4729 }
4730 break;
4731 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004732 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004733 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004734 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004735
4736 if (avail < 2)
4737 goto done;
4738 cur = in->cur[0];
4739 if (cur != '<') {
4740 ctxt->instate = XML_PARSER_CONTENT;
4741#ifdef DEBUG_PUSH
4742 xmlGenericError(xmlGenericErrorContext,
4743 "HPP: entering CONTENT\n");
4744#endif
4745 break;
4746 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004747 if (in->cur[1] == '/') {
4748 ctxt->instate = XML_PARSER_END_TAG;
4749 ctxt->checkIndex = 0;
4750#ifdef DEBUG_PUSH
4751 xmlGenericError(xmlGenericErrorContext,
4752 "HPP: entering END_TAG\n");
4753#endif
4754 break;
4755 }
Owen Taylor3473f882001-02-23 17:55:21 +00004756 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004757 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004758 goto done;
4759
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004760 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004761 htmlParseStartTag(ctxt);
4762 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004763 if (((depth == ctxt->nameNr) &&
4764 (xmlStrEqual(oldname, ctxt->name))) ||
4765 (name == NULL)) {
4766 if (CUR == '>')
4767 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004768 break;
4769 }
Owen Taylor3473f882001-02-23 17:55:21 +00004770
4771 /*
4772 * Lookup the info for that element.
4773 */
4774 info = htmlTagLookup(name);
4775 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004776 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4777 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004778 }
4779
4780 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004781 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004782 */
4783 if ((CUR == '/') && (NXT(1) == '>')) {
4784 SKIP(2);
4785 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4786 ctxt->sax->endElement(ctxt->userData, name);
4787 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004788 ctxt->instate = XML_PARSER_CONTENT;
4789#ifdef DEBUG_PUSH
4790 xmlGenericError(xmlGenericErrorContext,
4791 "HPP: entering CONTENT\n");
4792#endif
4793 break;
4794 }
4795
4796 if (CUR == '>') {
4797 NEXT;
4798 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004799 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4800 "Couldn't find end of Start Tag %s\n",
4801 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004802
4803 /*
4804 * end of parsing of this node.
4805 */
4806 if (xmlStrEqual(name, ctxt->name)) {
4807 nodePop(ctxt);
4808 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004809 }
4810
4811 ctxt->instate = XML_PARSER_CONTENT;
4812#ifdef DEBUG_PUSH
4813 xmlGenericError(xmlGenericErrorContext,
4814 "HPP: entering CONTENT\n");
4815#endif
4816 break;
4817 }
4818
4819 /*
4820 * Check for an Empty Element from DTD definition
4821 */
4822 if ((info != NULL) && (info->empty)) {
4823 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4824 ctxt->sax->endElement(ctxt->userData, name);
4825 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004826 }
4827 ctxt->instate = XML_PARSER_CONTENT;
4828#ifdef DEBUG_PUSH
4829 xmlGenericError(xmlGenericErrorContext,
4830 "HPP: entering CONTENT\n");
4831#endif
4832 break;
4833 }
4834 case XML_PARSER_CONTENT: {
4835 long cons;
4836 /*
4837 * Handle preparsed entities and charRef
4838 */
4839 if (ctxt->token != 0) {
4840 xmlChar chr[2] = { 0 , 0 } ;
4841
4842 chr[0] = (xmlChar) ctxt->token;
4843 htmlCheckParagraph(ctxt);
4844 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4845 ctxt->sax->characters(ctxt->userData, chr, 1);
4846 ctxt->token = 0;
4847 ctxt->checkIndex = 0;
4848 }
4849 if ((avail == 1) && (terminate)) {
4850 cur = in->cur[0];
4851 if ((cur != '<') && (cur != '&')) {
4852 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004853 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004854 if (ctxt->sax->ignorableWhitespace != NULL)
4855 ctxt->sax->ignorableWhitespace(
4856 ctxt->userData, &cur, 1);
4857 } else {
4858 htmlCheckParagraph(ctxt);
4859 if (ctxt->sax->characters != NULL)
4860 ctxt->sax->characters(
4861 ctxt->userData, &cur, 1);
4862 }
4863 }
4864 ctxt->token = 0;
4865 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004866 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004867 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004868 }
Owen Taylor3473f882001-02-23 17:55:21 +00004869 }
4870 if (avail < 2)
4871 goto done;
4872 cur = in->cur[0];
4873 next = in->cur[1];
4874 cons = ctxt->nbChars;
4875 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4876 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4877 /*
4878 * Handle SCRIPT/STYLE separately
4879 */
4880 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004881 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004882 goto done;
4883 htmlParseScript(ctxt);
4884 if ((cur == '<') && (next == '/')) {
4885 ctxt->instate = XML_PARSER_END_TAG;
4886 ctxt->checkIndex = 0;
4887#ifdef DEBUG_PUSH
4888 xmlGenericError(xmlGenericErrorContext,
4889 "HPP: entering END_TAG\n");
4890#endif
4891 break;
4892 }
4893 } else {
4894 /*
4895 * Sometimes DOCTYPE arrives in the middle of the document
4896 */
4897 if ((cur == '<') && (next == '!') &&
4898 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4899 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4900 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4901 (UPP(8) == 'E')) {
4902 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004903 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004904 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004905 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4906 "Misplaced DOCTYPE declaration\n",
4907 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004908 htmlParseDocTypeDecl(ctxt);
4909 } else if ((cur == '<') && (next == '!') &&
4910 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4911 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004912 (htmlParseLookupSequence(
4913 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004914 goto done;
4915#ifdef DEBUG_PUSH
4916 xmlGenericError(xmlGenericErrorContext,
4917 "HPP: Parsing Comment\n");
4918#endif
4919 htmlParseComment(ctxt);
4920 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004921 } else if ((cur == '<') && (next == '?')) {
4922 if ((!terminate) &&
4923 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4924 goto done;
4925#ifdef DEBUG_PUSH
4926 xmlGenericError(xmlGenericErrorContext,
4927 "HPP: Parsing PI\n");
4928#endif
4929 htmlParsePI(ctxt);
4930 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00004931 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4932 goto done;
4933 } else if ((cur == '<') && (next == '/')) {
4934 ctxt->instate = XML_PARSER_END_TAG;
4935 ctxt->checkIndex = 0;
4936#ifdef DEBUG_PUSH
4937 xmlGenericError(xmlGenericErrorContext,
4938 "HPP: entering END_TAG\n");
4939#endif
4940 break;
4941 } else if (cur == '<') {
4942 ctxt->instate = XML_PARSER_START_TAG;
4943 ctxt->checkIndex = 0;
4944#ifdef DEBUG_PUSH
4945 xmlGenericError(xmlGenericErrorContext,
4946 "HPP: entering START_TAG\n");
4947#endif
4948 break;
4949 } else if (cur == '&') {
4950 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004951 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004952 goto done;
4953#ifdef DEBUG_PUSH
4954 xmlGenericError(xmlGenericErrorContext,
4955 "HPP: Parsing Reference\n");
4956#endif
4957 /* TODO: check generation of subtrees if noent !!! */
4958 htmlParseReference(ctxt);
4959 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004960 /*
4961 * check that the text sequence is complete
4962 * before handing out the data to the parser
4963 * to avoid problems with erroneous end of
4964 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004965 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004966 if ((!terminate) &&
4967 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4968 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004969 ctxt->checkIndex = 0;
4970#ifdef DEBUG_PUSH
4971 xmlGenericError(xmlGenericErrorContext,
4972 "HPP: Parsing char data\n");
4973#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004974 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004975 }
4976 }
4977 if (cons == ctxt->nbChars) {
4978 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004979 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4980 "detected an error in element content\n",
4981 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004982 }
4983 NEXT;
4984 break;
4985 }
4986
4987 break;
4988 }
4989 case XML_PARSER_END_TAG:
4990 if (avail < 2)
4991 goto done;
4992 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004993 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004994 goto done;
4995 htmlParseEndTag(ctxt);
4996 if (ctxt->nameNr == 0) {
4997 ctxt->instate = XML_PARSER_EPILOG;
4998 } else {
4999 ctxt->instate = XML_PARSER_CONTENT;
5000 }
5001 ctxt->checkIndex = 0;
5002#ifdef DEBUG_PUSH
5003 xmlGenericError(xmlGenericErrorContext,
5004 "HPP: entering CONTENT\n");
5005#endif
5006 break;
5007 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005008 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5009 "HPP: internal error, state == CDATA\n",
5010 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005011 ctxt->instate = XML_PARSER_CONTENT;
5012 ctxt->checkIndex = 0;
5013#ifdef DEBUG_PUSH
5014 xmlGenericError(xmlGenericErrorContext,
5015 "HPP: entering CONTENT\n");
5016#endif
5017 break;
5018 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005019 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5020 "HPP: internal error, state == DTD\n",
5021 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005022 ctxt->instate = XML_PARSER_CONTENT;
5023 ctxt->checkIndex = 0;
5024#ifdef DEBUG_PUSH
5025 xmlGenericError(xmlGenericErrorContext,
5026 "HPP: entering CONTENT\n");
5027#endif
5028 break;
5029 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005030 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5031 "HPP: internal error, state == COMMENT\n",
5032 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005033 ctxt->instate = XML_PARSER_CONTENT;
5034 ctxt->checkIndex = 0;
5035#ifdef DEBUG_PUSH
5036 xmlGenericError(xmlGenericErrorContext,
5037 "HPP: entering CONTENT\n");
5038#endif
5039 break;
5040 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005041 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5042 "HPP: internal error, state == PI\n",
5043 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005044 ctxt->instate = XML_PARSER_CONTENT;
5045 ctxt->checkIndex = 0;
5046#ifdef DEBUG_PUSH
5047 xmlGenericError(xmlGenericErrorContext,
5048 "HPP: entering CONTENT\n");
5049#endif
5050 break;
5051 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005052 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5053 "HPP: internal error, state == ENTITY_DECL\n",
5054 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005055 ctxt->instate = XML_PARSER_CONTENT;
5056 ctxt->checkIndex = 0;
5057#ifdef DEBUG_PUSH
5058 xmlGenericError(xmlGenericErrorContext,
5059 "HPP: entering CONTENT\n");
5060#endif
5061 break;
5062 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005063 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5064 "HPP: internal error, state == ENTITY_VALUE\n",
5065 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005066 ctxt->instate = XML_PARSER_CONTENT;
5067 ctxt->checkIndex = 0;
5068#ifdef DEBUG_PUSH
5069 xmlGenericError(xmlGenericErrorContext,
5070 "HPP: entering DTD\n");
5071#endif
5072 break;
5073 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005074 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5075 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5076 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005077 ctxt->instate = XML_PARSER_START_TAG;
5078 ctxt->checkIndex = 0;
5079#ifdef DEBUG_PUSH
5080 xmlGenericError(xmlGenericErrorContext,
5081 "HPP: entering START_TAG\n");
5082#endif
5083 break;
5084 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005085 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5086 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5087 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005088 ctxt->instate = XML_PARSER_CONTENT;
5089 ctxt->checkIndex = 0;
5090#ifdef DEBUG_PUSH
5091 xmlGenericError(xmlGenericErrorContext,
5092 "HPP: entering CONTENT\n");
5093#endif
5094 break;
5095 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005096 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5097 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5098 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005099 ctxt->instate = XML_PARSER_CONTENT;
5100 ctxt->checkIndex = 0;
5101#ifdef DEBUG_PUSH
5102 xmlGenericError(xmlGenericErrorContext,
5103 "HPP: entering CONTENT\n");
5104#endif
5105 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005106 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005107 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5108 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5109 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005110 ctxt->instate = XML_PARSER_CONTENT;
5111 ctxt->checkIndex = 0;
5112#ifdef DEBUG_PUSH
5113 xmlGenericError(xmlGenericErrorContext,
5114 "HPP: entering CONTENT\n");
5115#endif
5116 break;
5117
Owen Taylor3473f882001-02-23 17:55:21 +00005118 }
5119 }
5120done:
5121 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005122 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005123 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5124 /*
5125 * SAX: end of the document processing.
5126 */
5127 ctxt->instate = XML_PARSER_EOF;
5128 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5129 ctxt->sax->endDocument(ctxt->userData);
5130 }
5131 }
5132 if ((ctxt->myDoc != NULL) &&
5133 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5134 (ctxt->instate == XML_PARSER_EPILOG))) {
5135 xmlDtdPtr dtd;
5136 dtd = xmlGetIntSubset(ctxt->myDoc);
5137 if (dtd == NULL)
5138 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005139 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005140 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5141 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5142 }
5143#ifdef DEBUG_PUSH
5144 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5145#endif
5146 return(ret);
5147}
5148
5149/**
Owen Taylor3473f882001-02-23 17:55:21 +00005150 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005151 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005152 * @chunk: an char array
5153 * @size: the size in byte of the chunk
5154 * @terminate: last chunk indicator
5155 *
5156 * Parse a Chunk of memory
5157 *
5158 * Returns zero if no error, the xmlParserErrors otherwise.
5159 */
5160int
5161htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5162 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005163 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5164 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5165 "htmlParseChunk: context error\n", NULL, NULL);
5166 return(XML_ERR_INTERNAL_ERROR);
5167 }
Owen Taylor3473f882001-02-23 17:55:21 +00005168 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5169 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5170 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5171 int cur = ctxt->input->cur - ctxt->input->base;
5172
5173 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5174 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5175 ctxt->input->cur = ctxt->input->base + cur;
5176#ifdef DEBUG_PUSH
5177 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5178#endif
5179
Daniel Veillard14f752c2003-08-09 11:44:50 +00005180#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005181 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5182 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005183#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005184 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005185 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5186 xmlParserInputBufferPtr in = ctxt->input->buf;
5187 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5188 (in->raw != NULL)) {
5189 int nbchars;
5190
5191 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5192 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005193 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5194 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005195 return(XML_ERR_INVALID_ENCODING);
5196 }
5197 }
5198 }
Owen Taylor3473f882001-02-23 17:55:21 +00005199 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005200 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005201 if (terminate) {
5202 if ((ctxt->instate != XML_PARSER_EOF) &&
5203 (ctxt->instate != XML_PARSER_EPILOG) &&
5204 (ctxt->instate != XML_PARSER_MISC)) {
5205 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005206 ctxt->wellFormed = 0;
5207 }
5208 if (ctxt->instate != XML_PARSER_EOF) {
5209 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5210 ctxt->sax->endDocument(ctxt->userData);
5211 }
5212 ctxt->instate = XML_PARSER_EOF;
5213 }
5214 return((xmlParserErrors) ctxt->errNo);
5215}
Daniel Veillard73b013f2003-09-30 12:36:01 +00005216#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005217
5218/************************************************************************
5219 * *
5220 * User entry points *
5221 * *
5222 ************************************************************************/
5223
5224/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005225 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005226 * @sax: a SAX handler
5227 * @user_data: The user data returned on SAX callbacks
5228 * @chunk: a pointer to an array of chars
5229 * @size: number of chars in the array
5230 * @filename: an optional file name or URI
5231 * @enc: an optional encoding
5232 *
5233 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005234 * The value of @filename is used for fetching external entities
5235 * and error/warning reports.
5236 *
5237 * Returns the new parser context or NULL
5238 */
5239htmlParserCtxtPtr
5240htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5241 const char *chunk, int size, const char *filename,
5242 xmlCharEncoding enc) {
5243 htmlParserCtxtPtr ctxt;
5244 htmlParserInputPtr inputStream;
5245 xmlParserInputBufferPtr buf;
5246
Daniel Veillardd0463562001-10-13 09:15:48 +00005247 xmlInitParser();
5248
Owen Taylor3473f882001-02-23 17:55:21 +00005249 buf = xmlAllocParserInputBuffer(enc);
5250 if (buf == NULL) return(NULL);
5251
Daniel Veillardf403d292003-10-05 13:51:35 +00005252 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005253 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005254 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005255 return(NULL);
5256 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005257 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5258 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005259 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005260 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005261 xmlFree(ctxt->sax);
5262 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5263 if (ctxt->sax == NULL) {
5264 xmlFree(buf);
5265 xmlFree(ctxt);
5266 return(NULL);
5267 }
5268 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5269 if (user_data != NULL)
5270 ctxt->userData = user_data;
5271 }
5272 if (filename == NULL) {
5273 ctxt->directory = NULL;
5274 } else {
5275 ctxt->directory = xmlParserGetDirectory(filename);
5276 }
5277
5278 inputStream = htmlNewInputStream(ctxt);
5279 if (inputStream == NULL) {
5280 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005281 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005282 return(NULL);
5283 }
5284
5285 if (filename == NULL)
5286 inputStream->filename = NULL;
5287 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005288 inputStream->filename = (char *)
5289 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005290 inputStream->buf = buf;
5291 inputStream->base = inputStream->buf->buffer->content;
5292 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005293 inputStream->end =
5294 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005295
5296 inputPush(ctxt, inputStream);
5297
5298 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5299 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005300 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5301 int cur = ctxt->input->cur - ctxt->input->base;
5302
Owen Taylor3473f882001-02-23 17:55:21 +00005303 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005304
5305 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5306 ctxt->input->cur = ctxt->input->base + cur;
5307 ctxt->input->end =
5308 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005309#ifdef DEBUG_PUSH
5310 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5311#endif
5312 }
5313
5314 return(ctxt);
5315}
5316
5317/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005318 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005319 * @cur: a pointer to an array of xmlChar
5320 * @encoding: a free form C string describing the HTML document encoding, or NULL
5321 * @sax: the SAX handler block
5322 * @userData: if using SAX, this pointer will be provided on callbacks.
5323 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005324 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5325 * to handle parse events. If sax is NULL, fallback to the default DOM
5326 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005327 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005328 * Returns the resulting document tree unless SAX is NULL or the document is
5329 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005330 */
5331
5332htmlDocPtr
5333htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5334 htmlDocPtr ret;
5335 htmlParserCtxtPtr ctxt;
5336
Daniel Veillardd0463562001-10-13 09:15:48 +00005337 xmlInitParser();
5338
Owen Taylor3473f882001-02-23 17:55:21 +00005339 if (cur == NULL) return(NULL);
5340
5341
5342 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5343 if (ctxt == NULL) return(NULL);
5344 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005345 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005346 ctxt->sax = sax;
5347 ctxt->userData = userData;
5348 }
5349
5350 htmlParseDocument(ctxt);
5351 ret = ctxt->myDoc;
5352 if (sax != NULL) {
5353 ctxt->sax = NULL;
5354 ctxt->userData = NULL;
5355 }
5356 htmlFreeParserCtxt(ctxt);
5357
5358 return(ret);
5359}
5360
5361/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005362 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005363 * @cur: a pointer to an array of xmlChar
5364 * @encoding: a free form C string describing the HTML document encoding, or NULL
5365 *
5366 * parse an HTML in-memory document and build a tree.
5367 *
5368 * Returns the resulting document tree
5369 */
5370
5371htmlDocPtr
5372htmlParseDoc(xmlChar *cur, const char *encoding) {
5373 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5374}
5375
5376
5377/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005378 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005379 * @filename: the filename
5380 * @encoding: a free form C string describing the HTML document encoding, or NULL
5381 *
5382 * Create a parser context for a file content.
5383 * Automatic support for ZLIB/Compress compressed document is provided
5384 * by default if found at compile-time.
5385 *
5386 * Returns the new parser context or NULL
5387 */
5388htmlParserCtxtPtr
5389htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5390{
5391 htmlParserCtxtPtr ctxt;
5392 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005393 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005394 /* htmlCharEncoding enc; */
5395 xmlChar *content, *content_line = (xmlChar *) "charset=";
5396
Daniel Veillarda03e3652004-11-02 18:45:30 +00005397 if (filename == NULL)
5398 return(NULL);
5399
Daniel Veillardf403d292003-10-05 13:51:35 +00005400 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005401 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005402 return(NULL);
5403 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005404 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5405 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005406#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005407 if (xmlDefaultSAXHandler.error != NULL) {
5408 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5409 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005410#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005411 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005412 return(NULL);
5413 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005414
5415 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5416 xmlFree(canonicFilename);
5417 if (inputStream == NULL) {
5418 xmlFreeParserCtxt(ctxt);
5419 return(NULL);
5420 }
Owen Taylor3473f882001-02-23 17:55:21 +00005421
5422 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005423
Owen Taylor3473f882001-02-23 17:55:21 +00005424 /* set encoding */
5425 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005426 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005427 if (content) {
5428 strcpy ((char *)content, (char *)content_line);
5429 strcat ((char *)content, (char *)encoding);
5430 htmlCheckEncoding (ctxt, content);
5431 xmlFree (content);
5432 }
5433 }
5434
5435 return(ctxt);
5436}
5437
5438/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005439 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005440 * @filename: the filename
5441 * @encoding: a free form C string describing the HTML document encoding, or NULL
5442 * @sax: the SAX handler block
5443 * @userData: if using SAX, this pointer will be provided on callbacks.
5444 *
5445 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5446 * compressed document is provided by default if found at compile-time.
5447 * It use the given SAX function block to handle the parsing callback.
5448 * If sax is NULL, fallback to the default DOM tree building routines.
5449 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005450 * Returns the resulting document tree unless SAX is NULL or the document is
5451 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005452 */
5453
5454htmlDocPtr
5455htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5456 void *userData) {
5457 htmlDocPtr ret;
5458 htmlParserCtxtPtr ctxt;
5459 htmlSAXHandlerPtr oldsax = NULL;
5460
Daniel Veillardd0463562001-10-13 09:15:48 +00005461 xmlInitParser();
5462
Owen Taylor3473f882001-02-23 17:55:21 +00005463 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5464 if (ctxt == NULL) return(NULL);
5465 if (sax != NULL) {
5466 oldsax = ctxt->sax;
5467 ctxt->sax = sax;
5468 ctxt->userData = userData;
5469 }
5470
5471 htmlParseDocument(ctxt);
5472
5473 ret = ctxt->myDoc;
5474 if (sax != NULL) {
5475 ctxt->sax = oldsax;
5476 ctxt->userData = NULL;
5477 }
5478 htmlFreeParserCtxt(ctxt);
5479
5480 return(ret);
5481}
5482
5483/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005484 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005485 * @filename: the filename
5486 * @encoding: a free form C string describing the HTML document encoding, or NULL
5487 *
5488 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5489 * compressed document is provided by default if found at compile-time.
5490 *
5491 * Returns the resulting document tree
5492 */
5493
5494htmlDocPtr
5495htmlParseFile(const char *filename, const char *encoding) {
5496 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5497}
5498
5499/**
5500 * htmlHandleOmittedElem:
5501 * @val: int 0 or 1
5502 *
5503 * Set and return the previous value for handling HTML omitted tags.
5504 *
5505 * Returns the last value for 0 for no handling, 1 for auto insertion.
5506 */
5507
5508int
5509htmlHandleOmittedElem(int val) {
5510 int old = htmlOmittedDefaultValue;
5511
5512 htmlOmittedDefaultValue = val;
5513 return(old);
5514}
5515
Daniel Veillard930dfb62003-02-05 10:17:38 +00005516/**
5517 * htmlElementAllowedHere:
5518 * @parent: HTML parent element
5519 * @elt: HTML element
5520 *
5521 * Checks whether an HTML element may be a direct child of a parent element.
5522 * Note - doesn't check for deprecated elements
5523 *
5524 * Returns 1 if allowed; 0 otherwise.
5525 */
5526int
5527htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5528 const char** p ;
5529
5530 if ( ! elt || ! parent || ! parent->subelts )
5531 return 0 ;
5532
5533 for ( p = parent->subelts; *p; ++p )
5534 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5535 return 1 ;
5536
5537 return 0 ;
5538}
5539/**
5540 * htmlElementStatusHere:
5541 * @parent: HTML parent element
5542 * @elt: HTML element
5543 *
5544 * Checks whether an HTML element may be a direct child of a parent element.
5545 * and if so whether it is valid or deprecated.
5546 *
5547 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5548 */
5549htmlStatus
5550htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5551 if ( ! parent || ! elt )
5552 return HTML_INVALID ;
5553 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5554 return HTML_INVALID ;
5555
5556 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5557}
5558/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005559 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005560 * @elt: HTML element
5561 * @attr: HTML attribute
5562 * @legacy: whether to allow deprecated attributes
5563 *
5564 * Checks whether an attribute is valid for an element
5565 * Has full knowledge of Required and Deprecated attributes
5566 *
5567 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5568 */
5569htmlStatus
5570htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5571 const char** p ;
5572
5573 if ( !elt || ! attr )
5574 return HTML_INVALID ;
5575
5576 if ( elt->attrs_req )
5577 for ( p = elt->attrs_req; *p; ++p)
5578 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5579 return HTML_REQUIRED ;
5580
5581 if ( elt->attrs_opt )
5582 for ( p = elt->attrs_opt; *p; ++p)
5583 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5584 return HTML_VALID ;
5585
5586 if ( legacy && elt->attrs_depr )
5587 for ( p = elt->attrs_depr; *p; ++p)
5588 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5589 return HTML_DEPRECATED ;
5590
5591 return HTML_INVALID ;
5592}
5593/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005594 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005595 * @node: an htmlNodePtr in a tree
5596 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005597 * for Element nodes)
5598 *
5599 * Checks whether the tree node is valid. Experimental (the author
5600 * only uses the HTML enhancements in a SAX parser)
5601 *
5602 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5603 * legacy allowed) or htmlElementStatusHere (otherwise).
5604 * for Attribute nodes, a return from htmlAttrAllowed
5605 * for other nodes, HTML_NA (no checks performed)
5606 */
5607htmlStatus
5608htmlNodeStatus(const htmlNodePtr node, int legacy) {
5609 if ( ! node )
5610 return HTML_INVALID ;
5611
5612 switch ( node->type ) {
5613 case XML_ELEMENT_NODE:
5614 return legacy
5615 ? ( htmlElementAllowedHere (
5616 htmlTagLookup(node->parent->name) , node->name
5617 ) ? HTML_VALID : HTML_INVALID )
5618 : htmlElementStatusHere(
5619 htmlTagLookup(node->parent->name) ,
5620 htmlTagLookup(node->name) )
5621 ;
5622 case XML_ATTRIBUTE_NODE:
5623 return htmlAttrAllowed(
5624 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5625 default: return HTML_NA ;
5626 }
5627}
Daniel Veillard9475a352003-09-26 12:47:50 +00005628/************************************************************************
5629 * *
5630 * New set (2.6.0) of simpler and more flexible APIs *
5631 * *
5632 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible where the
 * macro is used.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an if/else; the
 * invocation must be followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5644
5645/**
5646 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005647 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005648 *
5649 * Reset a parser context
5650 */
5651void
5652htmlCtxtReset(htmlParserCtxtPtr ctxt)
5653{
5654 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005655 xmlDictPtr dict;
5656
5657 if (ctxt == NULL)
5658 return;
5659
5660 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005661
5662 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5663 xmlFreeInputStream(input);
5664 }
5665 ctxt->inputNr = 0;
5666 ctxt->input = NULL;
5667
5668 ctxt->spaceNr = 0;
5669 ctxt->spaceTab[0] = -1;
5670 ctxt->space = &ctxt->spaceTab[0];
5671
5672
5673 ctxt->nodeNr = 0;
5674 ctxt->node = NULL;
5675
5676 ctxt->nameNr = 0;
5677 ctxt->name = NULL;
5678
5679 DICT_FREE(ctxt->version);
5680 ctxt->version = NULL;
5681 DICT_FREE(ctxt->encoding);
5682 ctxt->encoding = NULL;
5683 DICT_FREE(ctxt->directory);
5684 ctxt->directory = NULL;
5685 DICT_FREE(ctxt->extSubURI);
5686 ctxt->extSubURI = NULL;
5687 DICT_FREE(ctxt->extSubSystem);
5688 ctxt->extSubSystem = NULL;
5689 if (ctxt->myDoc != NULL)
5690 xmlFreeDoc(ctxt->myDoc);
5691 ctxt->myDoc = NULL;
5692
5693 ctxt->standalone = -1;
5694 ctxt->hasExternalSubset = 0;
5695 ctxt->hasPErefs = 0;
5696 ctxt->html = 1;
5697 ctxt->external = 0;
5698 ctxt->instate = XML_PARSER_START;
5699 ctxt->token = 0;
5700
5701 ctxt->wellFormed = 1;
5702 ctxt->nsWellFormed = 1;
5703 ctxt->valid = 1;
5704 ctxt->vctxt.userData = ctxt;
5705 ctxt->vctxt.error = xmlParserValidityError;
5706 ctxt->vctxt.warning = xmlParserValidityWarning;
5707 ctxt->record_info = 0;
5708 ctxt->nbChars = 0;
5709 ctxt->checkIndex = 0;
5710 ctxt->inSubset = 0;
5711 ctxt->errNo = XML_ERR_OK;
5712 ctxt->depth = 0;
5713 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5714 ctxt->catalogs = NULL;
5715 xmlInitNodeInfoSeq(&ctxt->node_seq);
5716
5717 if (ctxt->attsDefault != NULL) {
5718 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5719 ctxt->attsDefault = NULL;
5720 }
5721 if (ctxt->attsSpecial != NULL) {
5722 xmlHashFree(ctxt->attsSpecial, NULL);
5723 ctxt->attsSpecial = NULL;
5724 }
5725}
5726
5727/**
5728 * htmlCtxtUseOptions:
5729 * @ctxt: an HTML parser context
5730 * @options: a combination of htmlParserOption(s)
5731 *
5732 * Applies the options to the parser context
5733 *
5734 * Returns 0 in case of success, the set of unknown or unimplemented options
5735 * in case of error.
5736 */
5737int
5738htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5739{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005740 if (ctxt == NULL)
5741 return(-1);
5742
Daniel Veillard9475a352003-09-26 12:47:50 +00005743 if (options & HTML_PARSE_NOWARNING) {
5744 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005745 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005746 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005747 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005748 }
5749 if (options & HTML_PARSE_NOERROR) {
5750 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005751 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005752 ctxt->sax->fatalError = NULL;
5753 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005754 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005755 }
5756 if (options & HTML_PARSE_PEDANTIC) {
5757 ctxt->pedantic = 1;
5758 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005759 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005760 } else
5761 ctxt->pedantic = 0;
5762 if (options & XML_PARSE_NOBLANKS) {
5763 ctxt->keepBlanks = 0;
5764 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5765 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005766 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005767 } else
5768 ctxt->keepBlanks = 1;
5769 ctxt->dictNames = 0;
5770 return (options);
5771}
5772
5773/**
5774 * htmlDoRead:
5775 * @ctxt: an HTML parser context
5776 * @URL: the base URL to use for the document
5777 * @encoding: the document encoding, or NULL
5778 * @options: a combination of htmlParserOption(s)
5779 * @reuse: keep the context for reuse
5780 *
5781 * Common front-end for the htmlRead functions
5782 *
5783 * Returns the resulting document tree or NULL
5784 */
5785static htmlDocPtr
5786htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5787 int options, int reuse)
5788{
5789 htmlDocPtr ret;
5790
5791 htmlCtxtUseOptions(ctxt, options);
5792 ctxt->html = 1;
5793 if (encoding != NULL) {
5794 xmlCharEncodingHandlerPtr hdlr;
5795
5796 hdlr = xmlFindCharEncodingHandler(encoding);
5797 if (hdlr != NULL)
5798 xmlSwitchToEncoding(ctxt, hdlr);
5799 }
5800 if ((URL != NULL) && (ctxt->input != NULL) &&
5801 (ctxt->input->filename == NULL))
5802 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5803 htmlParseDocument(ctxt);
5804 ret = ctxt->myDoc;
5805 ctxt->myDoc = NULL;
5806 if (!reuse) {
5807 if ((ctxt->dictNames) &&
5808 (ret != NULL) &&
5809 (ret->dict == ctxt->dict))
5810 ctxt->dict = NULL;
5811 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005812 }
5813 return (ret);
5814}
5815
5816/**
5817 * htmlReadDoc:
5818 * @cur: a pointer to a zero terminated string
5819 * @URL: the base URL to use for the document
5820 * @encoding: the document encoding, or NULL
5821 * @options: a combination of htmlParserOption(s)
5822 *
5823 * parse an XML in-memory document and build a tree.
5824 *
5825 * Returns the resulting document tree
5826 */
5827htmlDocPtr
5828htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5829{
5830 htmlParserCtxtPtr ctxt;
5831
5832 if (cur == NULL)
5833 return (NULL);
5834
5835 ctxt = xmlCreateDocParserCtxt(cur);
5836 if (ctxt == NULL)
5837 return (NULL);
5838 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5839}
5840
5841/**
5842 * htmlReadFile:
5843 * @filename: a file or URL
5844 * @encoding: the document encoding, or NULL
5845 * @options: a combination of htmlParserOption(s)
5846 *
5847 * parse an XML file from the filesystem or the network.
5848 *
5849 * Returns the resulting document tree
5850 */
5851htmlDocPtr
5852htmlReadFile(const char *filename, const char *encoding, int options)
5853{
5854 htmlParserCtxtPtr ctxt;
5855
5856 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5857 if (ctxt == NULL)
5858 return (NULL);
5859 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5860}
5861
5862/**
5863 * htmlReadMemory:
5864 * @buffer: a pointer to a char array
5865 * @size: the size of the array
5866 * @URL: the base URL to use for the document
5867 * @encoding: the document encoding, or NULL
5868 * @options: a combination of htmlParserOption(s)
5869 *
5870 * parse an XML in-memory document and build a tree.
5871 *
5872 * Returns the resulting document tree
5873 */
5874htmlDocPtr
5875htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5876{
5877 htmlParserCtxtPtr ctxt;
5878
5879 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5880 if (ctxt == NULL)
5881 return (NULL);
William M. Brackd43cdcd2004-08-03 15:13:29 +00005882 if (ctxt->sax != NULL)
5883 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00005884 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5885}
5886
5887/**
5888 * htmlReadFd:
5889 * @fd: an open file descriptor
5890 * @URL: the base URL to use for the document
5891 * @encoding: the document encoding, or NULL
5892 * @options: a combination of htmlParserOption(s)
5893 *
5894 * parse an XML from a file descriptor and build a tree.
5895 *
5896 * Returns the resulting document tree
5897 */
5898htmlDocPtr
5899htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5900{
5901 htmlParserCtxtPtr ctxt;
5902 xmlParserInputBufferPtr input;
5903 xmlParserInputPtr stream;
5904
5905 if (fd < 0)
5906 return (NULL);
5907
5908 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5909 if (input == NULL)
5910 return (NULL);
5911 ctxt = xmlNewParserCtxt();
5912 if (ctxt == NULL) {
5913 xmlFreeParserInputBuffer(input);
5914 return (NULL);
5915 }
5916 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5917 if (stream == NULL) {
5918 xmlFreeParserInputBuffer(input);
5919 xmlFreeParserCtxt(ctxt);
5920 return (NULL);
5921 }
5922 inputPush(ctxt, stream);
5923 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5924}
5925
5926/**
5927 * htmlReadIO:
5928 * @ioread: an I/O read function
5929 * @ioclose: an I/O close function
5930 * @ioctx: an I/O handler
5931 * @URL: the base URL to use for the document
5932 * @encoding: the document encoding, or NULL
5933 * @options: a combination of htmlParserOption(s)
5934 *
5935 * parse an HTML document from I/O functions and source and build a tree.
5936 *
5937 * Returns the resulting document tree
5938 */
5939htmlDocPtr
5940htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5941 void *ioctx, const char *URL, const char *encoding, int options)
5942{
5943 htmlParserCtxtPtr ctxt;
5944 xmlParserInputBufferPtr input;
5945 xmlParserInputPtr stream;
5946
5947 if (ioread == NULL)
5948 return (NULL);
5949
5950 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5951 XML_CHAR_ENCODING_NONE);
5952 if (input == NULL)
5953 return (NULL);
5954 ctxt = xmlNewParserCtxt();
5955 if (ctxt == NULL) {
5956 xmlFreeParserInputBuffer(input);
5957 return (NULL);
5958 }
5959 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5960 if (stream == NULL) {
5961 xmlFreeParserInputBuffer(input);
5962 xmlFreeParserCtxt(ctxt);
5963 return (NULL);
5964 }
5965 inputPush(ctxt, stream);
5966 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5967}
5968
5969/**
5970 * htmlCtxtReadDoc:
5971 * @ctxt: an HTML parser context
5972 * @cur: a pointer to a zero terminated string
5973 * @URL: the base URL to use for the document
5974 * @encoding: the document encoding, or NULL
5975 * @options: a combination of htmlParserOption(s)
5976 *
5977 * parse an XML in-memory document and build a tree.
5978 * This reuses the existing @ctxt parser context
5979 *
5980 * Returns the resulting document tree
5981 */
5982htmlDocPtr
5983htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5984 const char *URL, const char *encoding, int options)
5985{
5986 xmlParserInputPtr stream;
5987
5988 if (cur == NULL)
5989 return (NULL);
5990 if (ctxt == NULL)
5991 return (NULL);
5992
5993 htmlCtxtReset(ctxt);
5994
5995 stream = xmlNewStringInputStream(ctxt, cur);
5996 if (stream == NULL) {
5997 return (NULL);
5998 }
5999 inputPush(ctxt, stream);
6000 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6001}
6002
6003/**
6004 * htmlCtxtReadFile:
6005 * @ctxt: an HTML parser context
6006 * @filename: a file or URL
6007 * @encoding: the document encoding, or NULL
6008 * @options: a combination of htmlParserOption(s)
6009 *
6010 * parse an XML file from the filesystem or the network.
6011 * This reuses the existing @ctxt parser context
6012 *
6013 * Returns the resulting document tree
6014 */
6015htmlDocPtr
6016htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6017 const char *encoding, int options)
6018{
6019 xmlParserInputPtr stream;
6020
6021 if (filename == NULL)
6022 return (NULL);
6023 if (ctxt == NULL)
6024 return (NULL);
6025
6026 htmlCtxtReset(ctxt);
6027
6028 stream = xmlNewInputFromFile(ctxt, filename);
6029 if (stream == NULL) {
6030 return (NULL);
6031 }
6032 inputPush(ctxt, stream);
6033 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6034}
6035
6036/**
6037 * htmlCtxtReadMemory:
6038 * @ctxt: an HTML parser context
6039 * @buffer: a pointer to a char array
6040 * @size: the size of the array
6041 * @URL: the base URL to use for the document
6042 * @encoding: the document encoding, or NULL
6043 * @options: a combination of htmlParserOption(s)
6044 *
6045 * parse an XML in-memory document and build a tree.
6046 * This reuses the existing @ctxt parser context
6047 *
6048 * Returns the resulting document tree
6049 */
6050htmlDocPtr
6051htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6052 const char *URL, const char *encoding, int options)
6053{
6054 xmlParserInputBufferPtr input;
6055 xmlParserInputPtr stream;
6056
6057 if (ctxt == NULL)
6058 return (NULL);
6059 if (buffer == NULL)
6060 return (NULL);
6061
6062 htmlCtxtReset(ctxt);
6063
6064 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6065 if (input == NULL) {
6066 return(NULL);
6067 }
6068
6069 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6070 if (stream == NULL) {
6071 xmlFreeParserInputBuffer(input);
6072 return(NULL);
6073 }
6074
6075 inputPush(ctxt, stream);
6076 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6077}
6078
6079/**
6080 * htmlCtxtReadFd:
6081 * @ctxt: an HTML parser context
6082 * @fd: an open file descriptor
6083 * @URL: the base URL to use for the document
6084 * @encoding: the document encoding, or NULL
6085 * @options: a combination of htmlParserOption(s)
6086 *
6087 * parse an XML from a file descriptor and build a tree.
6088 * This reuses the existing @ctxt parser context
6089 *
6090 * Returns the resulting document tree
6091 */
6092htmlDocPtr
6093htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6094 const char *URL, const char *encoding, int options)
6095{
6096 xmlParserInputBufferPtr input;
6097 xmlParserInputPtr stream;
6098
6099 if (fd < 0)
6100 return (NULL);
6101 if (ctxt == NULL)
6102 return (NULL);
6103
6104 htmlCtxtReset(ctxt);
6105
6106
6107 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6108 if (input == NULL)
6109 return (NULL);
6110 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6111 if (stream == NULL) {
6112 xmlFreeParserInputBuffer(input);
6113 return (NULL);
6114 }
6115 inputPush(ctxt, stream);
6116 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6117}
6118
6119/**
6120 * htmlCtxtReadIO:
6121 * @ctxt: an HTML parser context
6122 * @ioread: an I/O read function
6123 * @ioclose: an I/O close function
6124 * @ioctx: an I/O handler
6125 * @URL: the base URL to use for the document
6126 * @encoding: the document encoding, or NULL
6127 * @options: a combination of htmlParserOption(s)
6128 *
6129 * parse an HTML document from I/O functions and source and build a tree.
6130 * This reuses the existing @ctxt parser context
6131 *
6132 * Returns the resulting document tree
6133 */
6134htmlDocPtr
6135htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6136 xmlInputCloseCallback ioclose, void *ioctx,
6137 const char *URL,
6138 const char *encoding, int options)
6139{
6140 xmlParserInputBufferPtr input;
6141 xmlParserInputPtr stream;
6142
6143 if (ioread == NULL)
6144 return (NULL);
6145 if (ctxt == NULL)
6146 return (NULL);
6147
6148 htmlCtxtReset(ctxt);
6149
6150 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6151 XML_CHAR_ENCODING_NONE);
6152 if (input == NULL)
6153 return (NULL);
6154 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6155 if (stream == NULL) {
6156 xmlFreeParserInputBuffer(input);
6157 return (NULL);
6158 }
6159 inputPush(ctxt, stream);
6160 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6161}
6162
Owen Taylor3473f882001-02-23 17:55:21 +00006163#endif /* LIBXML_HTML_ENABLED */