/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
195 return (0);
196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
198 return (0);
199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
204 ctxt->nameTab[ctxt->nameNr] = 0;
205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. They are private to the parser and
 * all of them expect a parser context variable named ctxt in scope.
 *
 * Raw byte access (no decoding, compare against ASCII only):
 *   CUR_PTR     the current input pointer, ctxt->input->cur.
 *   CUR         the byte at the current position, as an int.
 *   CURRENT     same expansion as CUR.
 *   RAW         -1 while ctxt->token is set, else the current byte.
 *   NXT(n)      the byte n positions after the current one.
 *   UPPER       toupper() of the current byte.
 *   UPP(n)      toupper() of the byte n positions ahead.
 *   SKIP(n)     advance the input pointer, nbChars and the column by n;
 *               the line counter is not touched, so do not skip over
 *               newlines with it.
 *
 * Character level:
 *   NEXT        xmlNextChar(ctxt).
 *   NEXTL(l)    advance the input pointer by l bytes, counting one
 *               character; updates line/column and clears ctxt->token.
 *   CUR_CHAR(l) htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar(ctxt, s, &l).
 *   COPY_BUF(l,b,i,v)  store character v of byte length l at b[i] and
 *               advance i.
 *   SKIP_BLANKS htmlSkipBlankChars(ctxt).
 *
 * Buffer management:
 *   SHRINK      call xmlParserInputShrink() when more than 2*INPUT_CHUNK
 *               bytes are behind the cursor and fewer than 2*INPUT_CHUNK
 *               remain ahead.
 *   GROW        call xmlParserInputGrow() when not in progressive mode
 *               and fewer than INPUT_CHUNK bytes remain ahead.
 *
 * NOTE: SKIP is an unparenthesized comma expression, and SHRINK, GROW
 * and COPY_BUF expand to a bare if / if-else. Use them only as complete
 * statements, never directly in front of an else.
 */

/* Inported from XML */

#define CUR_PTR ctxt->input->cur

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NEXT xmlNextChar(ctxt)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Owen Taylor3473f882001-02-23 17:55:21 +0000405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000410 }
411
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
413 *len = 1;
414 return((int) *ctxt->input->cur);
415}
416
417/**
Owen Taylor3473f882001-02-23 17:55:21 +0000418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000426static int
Owen Taylor3473f882001-02-23 17:55:21 +0000427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428 int res = 0;
429
William M. Brack76e95df2003-10-18 16:20:14 +0000430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433 xmlPopInput(ctxt);
434 } else {
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
438 ctxt->input->cur++;
439 ctxt->nbChars++;
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442 }
443 res++;
444 }
445 return(res);
446}
447
448
449
450/************************************************************************
451 * *
452 * The list of HTML elements and their properties *
453 * *
454 ************************************************************************/
455
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000468
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of element names,
 * and the matching NB_ macro gives the number of names in that list.
 *
 * Groups must be joined with explicit commas: two string literals that
 * end up side by side are concatenated by the compiler into one bogus
 * name (e.g. "small" "em" becomes "smallem"). PCDATA and MODIFIER expand
 * to nothing, so they contribute neither names nor commas.
 *
 * The NB_ sums are parenthesized so they can be used inside larger
 * expressions.
 */
#define PCDATA
#define NB_PCDATA	0
#define MODIFIER
#define NB_MODIFIER	0
#define FONTSTYLE	"tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE	8
#define PHRASE		"em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE	10
#define SPECIAL		"a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL	15
#define FORMCTRL	"input", "select", "textarea", "label", "button"
#define NB_FORMCTRL	5
#define HEADING		"h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING	6
#define LIST		"ul", "ol", "dir", "menu"
#define NB_LIST		4
#define INLINE		FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE	(NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK		HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK	(NB_HEADING + NB_LIST + 14)
#define FLOW		BLOCK, INLINE
#define NB_FLOW		(NB_BLOCK + NB_INLINE)
#define EMPTY		NULL


/* NULL-terminated name lists built from the groups above */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
502
503
/* ... and for HTML Attributes */

/*
 * Attribute group macros: each expands to a comma-separated list of
 * attribute names, and the matching NB_ macro gives the number of names.
 */
#define COREATTRS	"id", "class", "style", "title"
#define NB_COREATTRS	4
#define I18N		"lang", "dir"
#define NB_I18N		2
#define EVENTS		"onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS	9
#define ATTRS		COREATTRS,I18N,EVENTS
#define NB_ATTRS	(NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN	"align", "char", "charoff"
#define NB_CELLHALIGN	3
#define CELLVALIGN	"valign"
#define NB_CELLVALIGN	1

/* NULL-terminated attribute lists built from the groups above */
static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;
523
524
525/* Other declarations that should go inline ... */
526static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528 "tabindex", "onfocus", "onblur", NULL } ;
529static const char* target_attr[] = { "target", NULL } ;
530static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* alt_attr[] = { "alt", NULL } ;
532static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* href_attrs[] = { "href", NULL } ;
534static const char* clear_attrs[] = { "clear", NULL } ;
535static const char* inline_p[] = { INLINE, "p", NULL } ;
536static const char* flow_param[] = { FLOW, "param", NULL } ;
537static const char* applet_attrs[] = { COREATTRS , "codebase",
538 "archive", "alt", "name", "height", "width", "align",
539 "hspace", "vspace", NULL } ;
540static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
541 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
542static const char* basefont_attrs[] =
543 { "id", "size", "color", "face", NULL } ;
544static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
545static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
546static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
547static const char* body_depr[] = { "background", "bgcolor", "text",
548 "link", "vlink", "alink", NULL } ;
549static const char* button_attrs[] = { ATTRS, "name", "value", "type",
550 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
551
552
553static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
554static const char* col_elt[] = { "col", NULL } ;
555static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
556static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
557static const char* dl_contents[] = { "dt", "dd", NULL } ;
558static const char* compact_attr[] = { "compact", NULL } ;
559static const char* label_attr[] = { "label", NULL } ;
560static const char* fieldset_contents[] = { FLOW, "legend" } ;
561static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
562static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
563static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
564static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
565static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
566static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
567static const char* head_attrs[] = { I18N, "profile", NULL } ;
568static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
569static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
570static const char* version_attr[] = { "version", NULL } ;
571static const char* html_content[] = { "head", "body", "frameset", NULL } ;
572static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
573static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
574static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
575static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
576static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
577static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
578static const char* align_attr[] = { "align", NULL } ;
579static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
580static const char* map_contents[] = { BLOCK, "area", NULL } ;
581static const char* name_attr[] = { "name", NULL } ;
582static const char* action_attr[] = { "action", NULL } ;
583static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
584static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
585static const char* content_attr[] = { "content", NULL } ;
586static const char* type_attr[] = { "type", NULL } ;
587static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
588static const char* object_contents[] = { FLOW, "param", NULL } ;
589static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
590static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
591static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
592static const char* option_elt[] = { "option", NULL } ;
593static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
594static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
595static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
596static const char* width_attr[] = { "width", NULL } ;
597static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
598static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
599static const char* language_attr[] = { "language", NULL } ;
600static const char* select_content[] = { "optgroup", "option", NULL } ;
601static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
602static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
603static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
604static const char* table_depr[] = { "align", "bgcolor", NULL } ;
605static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
606static const char* tr_elt[] = { "tr", NULL } ;
607static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
608static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
609static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
610static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
611static const char* tr_contents[] = { "th", "td", NULL } ;
612static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
613static const char* li_elt[] = { "li", NULL } ;
614static const char* ul_depr[] = { "type", "compact", NULL} ;
615static const char* dir_attr[] = { "dir", NULL} ;
616
617#define DECL (const char**)
618
Daniel Veillard22090732001-07-16 00:06:07 +0000619static const htmlElemDesc
620html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000621{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
622 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
623},
624{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
625 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
626},
627{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
628 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
629},
630{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
631 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
632},
633{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
634 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
635},
636{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
637 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
638},
639{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
640 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
641},
642{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
643 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
644},
645{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
646 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
647},
648{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
649 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
650},
651{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
652 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
653},
654{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
655 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
656},
657{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
658 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
659},
660{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
661 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
662},
663{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
664 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
665},
666{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
667 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
668},
669{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
670 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
671},
672{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
676 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
677},
678{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
679 EMPTY , NULL , DECL col_attrs , NULL, NULL
680},
681{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
682 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
683},
684{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
685 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
686},
687{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
688 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
689},
690{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
691 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
692},
693{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
694 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
695},
696{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
697 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
698},
699{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
700 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
701},
702{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
703 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
704},
705{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
706 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
707},
708{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
709 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
710},
711{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
712 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
713},
714{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
715 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
716},
717{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
718 EMPTY, NULL, NULL, DECL frame_attrs, NULL
719},
720{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
721 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
722},
723{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
724 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
725},
726{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
727 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
728},
729{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
730 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
731},
732{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
742 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
743},
744{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
745 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
746},
747{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
748 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
749},
750{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
751 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
752},
753{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
754 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
755},
756{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
757 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
758},
759{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
760 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
761},
762{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
763 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
764},
765{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
766 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
767},
768{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
769 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
770},
771{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
772 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
773},
774{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
775 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
776},
777{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
778 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
779},
780{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
781 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
782},
783{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
784 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
785},
786{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
787 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
788},
789{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
790 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
791},
792{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
793 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
794},
795{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
796 DECL html_flow, "div", DECL html_attrs, NULL, NULL
797},
798{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
799 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
800},
801{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
802 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
803},
804{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
805 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
806},
807{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
808 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
809},
810{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
814 EMPTY, NULL, DECL param_attrs, NULL, name_attr
815},
816{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
817 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
818},
819{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
820 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
821},
822{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
823 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
824},
825{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
826 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
827},
828{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
829 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
830},
831{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
832 DECL select_content, NULL, DECL select_attrs, NULL, NULL
833},
834{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
838 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
839},
840{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
841 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
842},
843{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
847 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
848},
849{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "table", 0, 0, 0, 0, 0, 0, 0, "",
856 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
857},
858{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
859 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
860},
861{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
862 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
863},
864{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
865 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
866},
867{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
874 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
875},
876{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
877 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
878},
879{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
880 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
881},
882{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
883 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
884},
885{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
886 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
887},
888{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
889 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
890},
891{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893}
Owen Taylor3473f882001-02-23 17:55:21 +0000894};
895
/*
 * Auto-close table: start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups.  The first string of
 * a group is a start tag name; the strings after it are the names of the
 * open elements which that start tag implicitly closes.  One additional
 * NULL after the last group terminates the whole table.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "frameset",   "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
956
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold character data directly; when text shows up while one of them is
 * the current element, a p element gets implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
970
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array has no terminating NULL: users must derive the number of
 * entries from sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",          /* was misspelled "onrest", so never matched */
    "onchange",
    "onselect"
};
996
/*
 * End tag priorities, used by the HTML parser to cope with broken pages.
 *
 * Each element name is associated with a priority, and the parser uses
 * these to decide what to do with an unexpected end tag: an end tag may
 * only close elements whose priority is lower than or equal to its own.
 * The entry with a NULL name terminates the table and carries the
 * priority used for every element not listed explicitly.
 */
typedef struct {
    const char *name;           /* element name, NULL for the last entry */
    int priority;               /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }            /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001024
/*
 * Lookup index over htmlStartClose: htmlInitAutoClose() stores in each
 * used slot the address of the first string of one group, the remaining
 * slots are NULL.  The flag records whether the index was built already.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1027
1028/************************************************************************
1029 * *
1030 * functions to handle HTML specific data *
1031 * *
1032 ************************************************************************/
1033
1034/**
1035 * htmlInitAutoClose:
1036 *
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1040 */
1041void
1042htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001043 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001044
1045 if (htmlStartCloseIndexinitialized) return;
1046
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048 indx = 0;
1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1050 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001051 while (htmlStartClose[i] != NULL) i++;
1052 i++;
1053 }
1054 htmlStartCloseIndexinitialized = 1;
1055}
1056
1057/**
1058 * htmlTagLookup:
1059 * @tag: The tag name in lowercase
1060 *
1061 * Lookup the HTML tag in the ElementTable
1062 *
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1064 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001065const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001066htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001067 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001068
1069 for (i = 0; i < (sizeof(html40ElementTable) /
1070 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001072 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001073 }
1074 return(NULL);
1075}
1076
1077/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1080 *
1081 * Return value: The "endtag" priority.
1082 **/
1083static int
1084htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001087 while ((htmlEndPriority[i].name != NULL) &&
1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001090
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001092}
1093
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001094
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001095/**
Owen Taylor3473f882001-02-23 17:55:21 +00001096 * htmlCheckAutoClose:
1097 * @newtag: The new tag name
1098 * @oldtag: The old tag name
1099 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001100 * Checks whether the new tag is one of the registered valid tags for
1101 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103 *
1104 * Returns 0 if no, 1 if yes.
1105 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001106static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001109 int i, indx;
1110 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001111
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 if (htmlStartCloseIndexinitialized == 0)
1113 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001114
1115 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 if (closed == NULL)
1119 return (0);
1120 if (xmlStrEqual(BAD_CAST * closed, newtag))
1121 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001122 }
1123
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001124 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001125 i++;
1126 while (htmlStartClose[i] != NULL) {
1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 return (1);
1129 }
1130 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001131 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001132 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001133}
1134
1135/**
1136 * htmlAutoCloseOnClose:
1137 * @ctxt: an HTML parser context
1138 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001139 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001140 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001141 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001142 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001143static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145{
1146 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001147 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001149 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001150
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001151 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001152
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001153 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154 break;
1155 /*
1156 * A missplaced endtag can only close elements with lower
1157 * or equal priority, so if we find an element with higher
1158 * priority before we find an element with
1159 * matching name, we just ignore this endtag
1160 */
1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001163 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164 if (i < 0)
1165 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001166
1167 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001168 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001169 if ((info != NULL) && (info->endTag == 3)) {
1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001172 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001173 }
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001176 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
1178}
1179
1180/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001181 * htmlAutoCloseOnEnd:
1182 * @ctxt: an HTML parser context
1183 *
1184 * Close all remaining tags at the end of the stream
1185 */
1186static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001190
William M. Brack899e64a2003-09-26 18:03:42 +00001191 if (ctxt->nameNr == 0)
1192 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 }
1198}
1199
1200/**
Owen Taylor3473f882001-02-23 17:55:21 +00001201 * htmlAutoClose:
1202 * @ctxt: an HTML parser context
1203 * @newtag: The new tag name or NULL
1204 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001205 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriates closes if possible/needed.
1209 * If newtag is NULL this mean we are at the end of the resource
1210 * and we should check
1211 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001212static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001216 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001220 }
1221 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001222 htmlAutoCloseOnEnd(ctxt);
1223 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001231 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001232 }
Owen Taylor3473f882001-02-23 17:55:21 +00001233}
1234
1235/**
1236 * htmlAutoCloseTag:
1237 * @doc: the HTML document
1238 * @name: The tag name
1239 * @elem: the HTML element
1240 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001241 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of it's children would autoclose the
1244 * given tag.
1245 *
1246 * Returns 1 if autoclose, 0 otherwise
1247 */
1248int
1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 if (xmlStrEqual(name, elem->name)) return(0);
1254 if (htmlCheckAutoClose(elem->name, name)) return(1);
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, name, child)) return(1);
1258 child = child->next;
1259 }
1260 return(0);
1261}
1262
1263/**
1264 * htmlIsAutoClosed:
1265 * @doc: the HTML document
1266 * @elem: the HTML element
1267 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001268 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of it's child
1271 *
1272 * Returns 1 if autoclosed, 0 otherwise
1273 */
1274int
1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276 htmlNodePtr child;
1277
1278 if (elem == NULL) return(1);
1279 child = elem->children;
1280 while (child != NULL) {
1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282 child = child->next;
1283 }
1284 return(0);
1285}
1286
1287/**
1288 * htmlCheckImplied:
1289 * @ctxt: an HTML parser context
1290 * @newtag: The new tag name
1291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001293 * called when a new tag has been detected and generates the
1294 * appropriates implicit tags if missing
1295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296static void
Owen Taylor3473f882001-02-23 17:55:21 +00001297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298 if (!htmlOmittedDefaultValue)
1299 return;
1300 if (xmlStrEqual(newtag, BAD_CAST"html"))
1301 return;
1302 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001303 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306 }
1307 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308 return;
1309 if ((ctxt->nameNr <= 1) &&
1310 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316 /*
1317 * dropped OBJECT ... i you put it first BODY will be
1318 * assumed !
1319 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326 int i;
1327 for (i = 0;i < ctxt->nameNr;i++) {
1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329 return;
1330 }
1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332 return;
1333 }
1334 }
1335
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001336 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339 }
1340}
1341
1342/**
1343 * htmlCheckParagraph
1344 * @ctxt: an HTML parser context
1345 *
1346 * Check whether a p element need to be implied before inserting
1347 * characters in the current element.
1348 *
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350 * in case of error.
1351 */
1352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001353static int
Owen Taylor3473f882001-02-23 17:55:21 +00001354htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355 const xmlChar *tag;
1356 int i;
1357
1358 if (ctxt == NULL)
1359 return(-1);
1360 tag = ctxt->name;
1361 if (tag == NULL) {
1362 htmlAutoClose(ctxt, BAD_CAST"p");
1363 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367 return(1);
1368 }
1369 if (!htmlOmittedDefaultValue)
1370 return(0);
1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001373 htmlAutoClose(ctxt, BAD_CAST"p");
1374 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001375 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378 return(1);
1379 }
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlIsScriptAttribute:
1386 * @name: an attribute name
1387 *
1388 * Check if an attribute is of content type Script
1389 *
1390 * Returns 1 is the attribute is a script 0 otherwise
1391 */
1392int
1393htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001394 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001395
1396 if (name == NULL)
1397 return(0);
1398 /*
1399 * all script attributes start with 'on'
1400 */
1401 if ((name[0] != 'o') || (name[1] != 'n'))
1402 return(0);
1403 for (i = 0;
1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405 i++) {
1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407 return(1);
1408 }
1409 return(0);
1410}
1411
1412/************************************************************************
1413 * *
1414 * The list of HTML predefined entities *
1415 * *
1416 ************************************************************************/
1417
1418
/*
 * Table of the predefined HTML 4.0 character entities. Each entry gives
 * the numeric value of the character, the entity name and a descriptive
 * string.
 *
 * The entries must stay sorted by ascending value:
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is larger than the one searched, so an out-of-order entry would not be
 * found by value.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * The 4 absolute ones (quot, amp, lt, gt), plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references.
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1701
1702/************************************************************************
1703 * *
1704 * Commodity functions to handle entities *
1705 * *
1706 ************************************************************************/
1707
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates buffer to that new size (in
 * xmlChar units). A variable named ctxt must be in scope at the point of
 * use since it is handed to htmlErrMemory() for error reporting.
 *
 * If the reallocation fails the error is reported, the old buffer is
 * freed and the macro makes the *enclosing function* return NULL, so it
 * must only be expanded in functions for which returning NULL is valid.
 */
#define growBuffer(buffer) { \
 xmlChar *tmp; \
 buffer##_size *= 2; \
 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
 if (tmp == NULL) { \
 htmlErrMemory(ctxt, "growing buffer\n"); \
 xmlFree(buffer); \
 return(NULL); \
 } \
 buffer = tmp; \
}
1722
1723/**
1724 * htmlEntityLookup:
1725 * @name: the entity name
1726 *
1727 * Lookup the given entity in EntitiesTable
1728 *
1729 * TODO: the linear scan is really ugly, an hash table is really needed.
1730 *
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001733const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001734htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001735 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001736
1737 for (i = 0;i < (sizeof(html40EntitiesTable)/
1738 sizeof(html40EntitiesTable[0]));i++) {
1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001740 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 }
1743 return(NULL);
1744}
1745
1746/**
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1749 *
1750 * Lookup the given entity in EntitiesTable
1751 *
1752 * TODO: the linear scan is really ugly, an hash table is really needed.
1753 *
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001756const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001757htmlEntityValueLookup(unsigned int value) {
1758 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001759
1760 for (i = 0;i < (sizeof(html40EntitiesTable)/
1761 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001762 if (html40EntitiesTable[i].value >= value) {
1763 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001764 break;
William M. Brack78637da2003-07-31 14:47:38 +00001765 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001766 }
Owen Taylor3473f882001-02-23 17:55:21 +00001767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
Daniel Veillardce682bc2004-11-05 17:22:25 +00001797 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 if (in == NULL) {
1799 /*
1800 * initialization nothing to do
1801 */
1802 *outlen = 0;
1803 *inlen = 0;
1804 return(0);
1805 }
1806 inend = in + (*inlen);
1807 outend = out + (*outlen);
1808 while (in < inend) {
1809 d = *in++;
1810 if (d < 0x80) { c= d; trailing= 0; }
1811 else if (d < 0xC0) {
1812 /* trailing byte in leading position */
1813 *outlen = out - outstart;
1814 *inlen = processed - instart;
1815 return(-2);
1816 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1817 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1818 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1819 else {
1820 /* no chance for this in Ascii */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 }
1825
1826 if (inend - in < trailing) {
1827 break;
1828 }
1829
1830 for ( ; trailing; trailing--) {
1831 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1832 break;
1833 c <<= 6;
1834 c |= d & 0x3F;
1835 }
1836
1837 /* assertion: c is a single UTF-4 value */
1838 if (c < 0x80) {
1839 if (out + 1 >= outend)
1840 break;
1841 *out++ = c;
1842 } else {
1843 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001844 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001845
1846 /*
1847 * Try to lookup a predefined HTML entity for it
1848 */
1849
1850 ent = htmlEntityValueLookup(c);
1851 if (ent == NULL) {
1852 /* no chance for this in Ascii */
1853 *outlen = out - outstart;
1854 *inlen = processed - instart;
1855 return(-2);
1856 }
1857 len = strlen(ent->name);
1858 if (out + 2 + len >= outend)
1859 break;
1860 *out++ = '&';
1861 memcpy(out, ent->name, len);
1862 out += len;
1863 *out++ = ';';
1864 }
1865 processed = in;
1866 }
1867 *outlen = out - outstart;
1868 *inlen = processed - instart;
1869 return(0);
1870}
1871
1872/**
1873 * htmlEncodeEntities:
1874 * @out: a pointer to an array of bytes to store the result
1875 * @outlen: the length of @out
1876 * @in: a pointer to an array of UTF-8 chars
1877 * @inlen: the length of @in
1878 * @quoteChar: the quote character to escape (' or ") or zero.
1879 *
1880 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1881 * plus HTML entities block of chars out.
1882 *
1883 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1884 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001885 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001886 * The value of @outlen after return is the number of octets consumed.
1887 */
1888int
1889htmlEncodeEntities(unsigned char* out, int *outlen,
1890 const unsigned char* in, int *inlen, int quoteChar) {
1891 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001892 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001893 const unsigned char* outstart = out;
1894 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001895 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001896 unsigned int c, d;
1897 int trailing;
1898
Daniel Veillardce682bc2004-11-05 17:22:25 +00001899 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1900 return(-1);
1901 outend = out + (*outlen);
1902 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001903 while (in < inend) {
1904 d = *in++;
1905 if (d < 0x80) { c= d; trailing= 0; }
1906 else if (d < 0xC0) {
1907 /* trailing byte in leading position */
1908 *outlen = out - outstart;
1909 *inlen = processed - instart;
1910 return(-2);
1911 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1912 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1913 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1914 else {
1915 /* no chance for this in Ascii */
1916 *outlen = out - outstart;
1917 *inlen = processed - instart;
1918 return(-2);
1919 }
1920
1921 if (inend - in < trailing)
1922 break;
1923
1924 while (trailing--) {
1925 if (((d= *in++) & 0xC0) != 0x80) {
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(-2);
1929 }
1930 c <<= 6;
1931 c |= d & 0x3F;
1932 }
1933
1934 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001935 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1936 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001937 if (out >= outend)
1938 break;
1939 *out++ = c;
1940 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001941 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001942 const char *cp;
1943 char nbuf[16];
1944 int len;
1945
1946 /*
1947 * Try to lookup a predefined HTML entity for it
1948 */
1949 ent = htmlEntityValueLookup(c);
1950 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001951 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001952 cp = nbuf;
1953 }
1954 else
1955 cp = ent->name;
1956 len = strlen(cp);
1957 if (out + 2 + len > outend)
1958 break;
1959 *out++ = '&';
1960 memcpy(out, cp, len);
1961 out += len;
1962 *out++ = ';';
1963 }
1964 processed = in;
1965 }
1966 *outlen = out - outstart;
1967 *inlen = processed - instart;
1968 return(0);
1969}
1970
Owen Taylor3473f882001-02-23 17:55:21 +00001971/************************************************************************
1972 * *
1973 * Commodity functions to handle streams *
1974 * *
1975 ************************************************************************/
1976
1977/**
Owen Taylor3473f882001-02-23 17:55:21 +00001978 * htmlNewInputStream:
1979 * @ctxt: an HTML parser context
1980 *
1981 * Create a new input stream structure
1982 * Returns the new input stream or NULL
1983 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001985htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1986 htmlParserInputPtr input;
1987
1988 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1989 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001990 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001991 return(NULL);
1992 }
1993 memset(input, 0, sizeof(htmlParserInput));
1994 input->filename = NULL;
1995 input->directory = NULL;
1996 input->base = NULL;
1997 input->cur = NULL;
1998 input->buf = NULL;
1999 input->line = 1;
2000 input->col = 1;
2001 input->buf = NULL;
2002 input->free = NULL;
2003 input->version = NULL;
2004 input->consumed = 0;
2005 input->length = 0;
2006 return(input);
2007}
2008
2009
2010/************************************************************************
2011 * *
2012 * Commodity functions, cleanup needed ? *
2013 * *
2014 ************************************************************************/
/*
 * All the tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002029
2030/**
2031 * areBlanks:
2032 * @ctxt: an HTML parser context
2033 * @str: a xmlChar *
2034 * @len: the size of @str
2035 *
2036 * Is this a sequence of blank chars that one can ignore ?
2037 *
2038 * Returns 1 if ignorable 0 otherwise.
2039 */
2040
2041static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002042 unsigned int i;
2043 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002044 xmlNodePtr lastChild;
2045
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002046 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002047 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002048
2049 if (CUR == 0) return(1);
2050 if (CUR != '<') return(0);
2051 if (ctxt->name == NULL)
2052 return(1);
2053 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2054 return(1);
2055 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2056 return(1);
2057 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2058 return(1);
2059 if (ctxt->node == NULL) return(0);
2060 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002061 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2062 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002063 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002064 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2065 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002066 /* keep ws in constructs like ...<b> </b>...
2067 for all tags "b" allowing PCDATA */
2068 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2069 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2070 return(0);
2071 }
2072 }
Owen Taylor3473f882001-02-23 17:55:21 +00002073 } else if (xmlNodeIsText(lastChild)) {
2074 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002075 } else {
2076 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2077 for all tags "p" allowing PCDATA */
2078 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2079 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2080 return(0);
2081 }
2082 }
Owen Taylor3473f882001-02-23 17:55:21 +00002083 }
2084 return(1);
2085}
2086
2087/**
Owen Taylor3473f882001-02-23 17:55:21 +00002088 * htmlNewDocNoDtD:
2089 * @URI: URI for the dtd, or NULL
2090 * @ExternalID: the external ID of the DTD, or NULL
2091 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002092 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2093 * are NULL
2094 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002095 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002096 */
2097htmlDocPtr
2098htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2099 xmlDocPtr cur;
2100
2101 /*
2102 * Allocate a new document and fill the fields.
2103 */
2104 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2105 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002106 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002107 return(NULL);
2108 }
2109 memset(cur, 0, sizeof(xmlDoc));
2110
2111 cur->type = XML_HTML_DOCUMENT_NODE;
2112 cur->version = NULL;
2113 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002114 cur->doc = cur;
2115 cur->name = NULL;
2116 cur->children = NULL;
2117 cur->extSubset = NULL;
2118 cur->oldNs = NULL;
2119 cur->encoding = NULL;
2120 cur->standalone = 1;
2121 cur->compression = 0;
2122 cur->ids = NULL;
2123 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002124 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002125 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002126 if ((ExternalID != NULL) ||
2127 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002128 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002129 return(cur);
2130}
2131
2132/**
2133 * htmlNewDoc:
2134 * @URI: URI for the dtd, or NULL
2135 * @ExternalID: the external ID of the DTD, or NULL
2136 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002137 * Creates a new HTML document
2138 *
Owen Taylor3473f882001-02-23 17:55:21 +00002139 * Returns a new document
2140 */
2141htmlDocPtr
2142htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2143 if ((URI == NULL) && (ExternalID == NULL))
2144 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002145 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2146 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002147
2148 return(htmlNewDocNoDtD(URI, ExternalID));
2149}
2150
2151
2152/************************************************************************
2153 * *
2154 * The parser itself *
2155 * Relates to http://www.w3.org/TR/html40 *
2156 * *
2157 ************************************************************************/
2158
2159/************************************************************************
2160 * *
2161 * The parser itself *
2162 * *
2163 ************************************************************************/
2164
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002165static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002166
Owen Taylor3473f882001-02-23 17:55:21 +00002167/**
2168 * htmlParseHTMLName:
2169 * @ctxt: an HTML parser context
2170 *
2171 * parse an HTML tag or attribute name, note that we convert it to lowercase
2172 * since HTML names are not case-sensitive.
2173 *
2174 * Returns the Tag Name parsed or NULL
2175 */
2176
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002177static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002178htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002179 int i = 0;
2180 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2181
William M. Brackd1757ab2004-10-02 22:07:48 +00002182 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002183 (CUR != ':')) return(NULL);
2184
2185 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002186 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002187 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2188 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2189 else loc[i] = CUR;
2190 i++;
2191
2192 NEXT;
2193 }
2194
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002195 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002196}
2197
2198/**
2199 * htmlParseName:
2200 * @ctxt: an HTML parser context
2201 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002202 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002203 *
2204 * Returns the Name parsed or NULL
2205 */
2206
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002207static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002208htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002209 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002210 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002211 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002212
2213 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002214
2215 /*
2216 * Accelerator for simple ASCII names
2217 */
2218 in = ctxt->input->cur;
2219 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2220 ((*in >= 0x41) && (*in <= 0x5A)) ||
2221 (*in == '_') || (*in == ':')) {
2222 in++;
2223 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2224 ((*in >= 0x41) && (*in <= 0x5A)) ||
2225 ((*in >= 0x30) && (*in <= 0x39)) ||
2226 (*in == '_') || (*in == '-') ||
2227 (*in == ':') || (*in == '.'))
2228 in++;
2229 if ((*in > 0) && (*in < 0x80)) {
2230 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002231 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002232 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002233 ctxt->nbChars += count;
2234 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002235 return(ret);
2236 }
2237 }
2238 return(htmlParseNameComplex(ctxt));
2239}
2240
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002241static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002242htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002243 int len = 0, l;
2244 int c;
2245 int count = 0;
2246
2247 /*
2248 * Handler for more complex cases
2249 */
2250 GROW;
2251 c = CUR_CHAR(l);
2252 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2253 (!IS_LETTER(c) && (c != '_') &&
2254 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002255 return(NULL);
2256 }
2257
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002258 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2259 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2260 (c == '.') || (c == '-') ||
2261 (c == '_') || (c == ':') ||
2262 (IS_COMBINING(c)) ||
2263 (IS_EXTENDER(c)))) {
2264 if (count++ > 100) {
2265 count = 0;
2266 GROW;
2267 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002268 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002269 NEXTL(l);
2270 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002271 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002272 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002273}
2274
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002275
Owen Taylor3473f882001-02-23 17:55:21 +00002276/**
2277 * htmlParseHTMLAttribute:
2278 * @ctxt: an HTML parser context
2279 * @stop: a char stop value
2280 *
2281 * parse an HTML attribute value till the stop (quote), if
2282 * stop is 0 then it stops at the first space
2283 *
2284 * Returns the attribute parsed or NULL
2285 */
2286
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002287static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002288htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2289 xmlChar *buffer = NULL;
2290 int buffer_size = 0;
2291 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002292 const xmlChar *name = NULL;
2293 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002294 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002295
2296 /*
2297 * allocate a translation buffer.
2298 */
2299 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002300 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002301 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002302 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002303 return(NULL);
2304 }
2305 out = buffer;
2306
2307 /*
2308 * Ok loop until we reach one of the ending chars
2309 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002310 while ((CUR != 0) && (CUR != stop)) {
2311 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002312 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002313 if (CUR == '&') {
2314 if (NXT(1) == '#') {
2315 unsigned int c;
2316 int bits;
2317
2318 c = htmlParseCharRef(ctxt);
2319 if (c < 0x80)
2320 { *out++ = c; bits= -6; }
2321 else if (c < 0x800)
2322 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2323 else if (c < 0x10000)
2324 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2325 else
2326 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2327
2328 for ( ; bits >= 0; bits-= 6) {
2329 *out++ = ((c >> bits) & 0x3F) | 0x80;
2330 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002331
2332 if (out - buffer > buffer_size - 100) {
2333 int indx = out - buffer;
2334
2335 growBuffer(buffer);
2336 out = &buffer[indx];
2337 }
Owen Taylor3473f882001-02-23 17:55:21 +00002338 } else {
2339 ent = htmlParseEntityRef(ctxt, &name);
2340 if (name == NULL) {
2341 *out++ = '&';
2342 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002343 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002344
2345 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002346 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002347 }
2348 } else if (ent == NULL) {
2349 *out++ = '&';
2350 cur = name;
2351 while (*cur != 0) {
2352 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002354
2355 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002356 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002357 }
2358 *out++ = *cur++;
2359 }
Owen Taylor3473f882001-02-23 17:55:21 +00002360 } else {
2361 unsigned int c;
2362 int bits;
2363
2364 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002365 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002366
2367 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002368 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002369 }
2370 c = (xmlChar)ent->value;
2371 if (c < 0x80)
2372 { *out++ = c; bits= -6; }
2373 else if (c < 0x800)
2374 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2375 else if (c < 0x10000)
2376 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2377 else
2378 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2379
2380 for ( ; bits >= 0; bits-= 6) {
2381 *out++ = ((c >> bits) & 0x3F) | 0x80;
2382 }
Owen Taylor3473f882001-02-23 17:55:21 +00002383 }
2384 }
2385 } else {
2386 unsigned int c;
2387 int bits, l;
2388
2389 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002390 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002391
2392 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002393 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002394 }
2395 c = CUR_CHAR(l);
2396 if (c < 0x80)
2397 { *out++ = c; bits= -6; }
2398 else if (c < 0x800)
2399 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2400 else if (c < 0x10000)
2401 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2402 else
2403 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2404
2405 for ( ; bits >= 0; bits-= 6) {
2406 *out++ = ((c >> bits) & 0x3F) | 0x80;
2407 }
2408 NEXT;
2409 }
2410 }
2411 *out++ = 0;
2412 return(buffer);
2413}
2414
2415/**
Owen Taylor3473f882001-02-23 17:55:21 +00002416 * htmlParseEntityRef:
2417 * @ctxt: an HTML parser context
2418 * @str: location to store the entity name
2419 *
2420 * parse an HTML ENTITY references
2421 *
2422 * [68] EntityRef ::= '&' Name ';'
2423 *
2424 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2425 * if non-NULL *str will have to be freed by the caller.
2426 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002427const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002428htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2429 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002430 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002431 *str = NULL;
2432
2433 if (CUR == '&') {
2434 NEXT;
2435 name = htmlParseName(ctxt);
2436 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002437 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2438 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002439 } else {
2440 GROW;
2441 if (CUR == ';') {
2442 *str = name;
2443
2444 /*
2445 * Lookup the entity in the table.
2446 */
2447 ent = htmlEntityLookup(name);
2448 if (ent != NULL) /* OK that's ugly !!! */
2449 NEXT;
2450 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002451 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2452 "htmlParseEntityRef: expecting ';'\n",
2453 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002454 *str = name;
2455 }
2456 }
2457 }
2458 return(ent);
2459}
2460
2461/**
2462 * htmlParseAttValue:
2463 * @ctxt: an HTML parser context
2464 *
2465 * parse a value for an attribute
2466 * Note: the parser won't do substitution of entities here, this
2467 * will be handled later in xmlStringGetNodeList, unless it was
2468 * asked for ctxt->replaceEntities != 0
2469 *
2470 * Returns the AttValue parsed or NULL.
2471 */
2472
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002473static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002474htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2475 xmlChar *ret = NULL;
2476
2477 if (CUR == '"') {
2478 NEXT;
2479 ret = htmlParseHTMLAttribute(ctxt, '"');
2480 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002481 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2482 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002483 } else
2484 NEXT;
2485 } else if (CUR == '\'') {
2486 NEXT;
2487 ret = htmlParseHTMLAttribute(ctxt, '\'');
2488 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002489 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2490 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002491 } else
2492 NEXT;
2493 } else {
2494 /*
2495 * That's an HTMLism, the attribute value may not be quoted
2496 */
2497 ret = htmlParseHTMLAttribute(ctxt, 0);
2498 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002499 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2500 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002501 }
2502 }
2503 return(ret);
2504}
2505
2506/**
2507 * htmlParseSystemLiteral:
2508 * @ctxt: an HTML parser context
2509 *
2510 * parse an HTML Literal
2511 *
2512 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2513 *
2514 * Returns the SystemLiteral parsed or NULL
2515 */
2516
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002517static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002518htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2519 const xmlChar *q;
2520 xmlChar *ret = NULL;
2521
2522 if (CUR == '"') {
2523 NEXT;
2524 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002525 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002526 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002527 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002528 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2529 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002530 } else {
2531 ret = xmlStrndup(q, CUR_PTR - q);
2532 NEXT;
2533 }
2534 } else if (CUR == '\'') {
2535 NEXT;
2536 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002537 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002538 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002539 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002540 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2541 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002542 } else {
2543 ret = xmlStrndup(q, CUR_PTR - q);
2544 NEXT;
2545 }
2546 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002547 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2548 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002549 }
2550
2551 return(ret);
2552}
2553
2554/**
2555 * htmlParsePubidLiteral:
2556 * @ctxt: an HTML parser context
2557 *
2558 * parse an HTML public literal
2559 *
2560 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2561 *
2562 * Returns the PubidLiteral parsed or NULL.
2563 */
2564
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002565static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002566htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2567 const xmlChar *q;
2568 xmlChar *ret = NULL;
2569 /*
2570 * Name ::= (Letter | '_') (NameChar)*
2571 */
2572 if (CUR == '"') {
2573 NEXT;
2574 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002575 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002576 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002577 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2578 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002579 } else {
2580 ret = xmlStrndup(q, CUR_PTR - q);
2581 NEXT;
2582 }
2583 } else if (CUR == '\'') {
2584 NEXT;
2585 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002586 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002587 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002588 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002589 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2590 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002591 } else {
2592 ret = xmlStrndup(q, CUR_PTR - q);
2593 NEXT;
2594 }
2595 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002596 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2597 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002598 }
2599
2600 return(ret);
2601}
2602
2603/**
2604 * htmlParseScript:
2605 * @ctxt: an HTML parser context
2606 *
2607 * parse the content of an HTML SCRIPT or STYLE element
2608 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2609 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2610 * http://www.w3.org/TR/html4/types.html#type-script
2611 * http://www.w3.org/TR/html4/types.html#h-6.15
2612 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2613 *
2614 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2615 * element and the value of intrinsic event attributes. User agents must
2616 * not evaluate script data as HTML markup but instead must pass it on as
2617 * data to a script engine.
2618 * NOTES:
2619 * - The content is passed like CDATA
2620 * - the attributes for style and scripting "onXXX" are also described
2621 * as CDATA but SGML allows entities references in attributes so their
2622 * processing is identical as other attributes
2623 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002624static void
Owen Taylor3473f882001-02-23 17:55:21 +00002625htmlParseScript(htmlParserCtxtPtr ctxt) {
2626 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2627 int nbchar = 0;
2628 xmlChar cur;
2629
2630 SHRINK;
2631 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002632 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002633 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2634 (NXT(3) == '-')) {
2635 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2636 if (ctxt->sax->cdataBlock!= NULL) {
2637 /*
2638 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2639 */
2640 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002641 } else if (ctxt->sax->characters != NULL) {
2642 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002643 }
2644 }
2645 nbchar = 0;
2646 htmlParseComment(ctxt);
2647 cur = CUR;
2648 continue;
2649 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002650 /*
2651 * One should break here, the specification is clear:
2652 * Authors should therefore escape "</" within the content.
2653 * Escape mechanisms are specific to each scripting or
2654 * style sheet language.
2655 */
2656 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2657 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2658 break; /* while */
2659 }
2660 buf[nbchar++] = cur;
2661 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2662 if (ctxt->sax->cdataBlock!= NULL) {
2663 /*
2664 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2665 */
2666 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002667 } else if (ctxt->sax->characters != NULL) {
2668 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002669 }
2670 nbchar = 0;
2671 }
2672 NEXT;
2673 cur = CUR;
2674 }
William M. Brack76e95df2003-10-18 16:20:14 +00002675 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002676 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2677 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002678 NEXT;
2679 }
2680
2681 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2682 if (ctxt->sax->cdataBlock!= NULL) {
2683 /*
2684 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2685 */
2686 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002687 } else if (ctxt->sax->characters != NULL) {
2688 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002689 }
2690 }
2691}
2692
2693
2694/**
2695 * htmlParseCharData:
2696 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002697 *
2698 * parse a CharData section.
2699 * if we are within a CDATA section ']]>' marks an end of section.
2700 *
2701 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2702 */
2703
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002704static void
2705htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002706 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2707 int nbchar = 0;
2708 int cur, l;
2709
2710 SHRINK;
2711 cur = CUR_CHAR(l);
2712 while (((cur != '<') || (ctxt->token == '<')) &&
2713 ((cur != '&') || (ctxt->token == '&')) &&
2714 (IS_CHAR(cur))) {
2715 COPY_BUF(l,buf,nbchar,cur);
2716 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2717 /*
2718 * Ok the segment is to be consumed as chars.
2719 */
2720 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2721 if (areBlanks(ctxt, buf, nbchar)) {
2722 if (ctxt->sax->ignorableWhitespace != NULL)
2723 ctxt->sax->ignorableWhitespace(ctxt->userData,
2724 buf, nbchar);
2725 } else {
2726 htmlCheckParagraph(ctxt);
2727 if (ctxt->sax->characters != NULL)
2728 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2729 }
2730 }
2731 nbchar = 0;
2732 }
2733 NEXTL(l);
2734 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002735 if (cur == 0) {
2736 SHRINK;
2737 GROW;
2738 cur = CUR_CHAR(l);
2739 }
Owen Taylor3473f882001-02-23 17:55:21 +00002740 }
2741 if (nbchar != 0) {
2742 /*
2743 * Ok the segment is to be consumed as chars.
2744 */
2745 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2746 if (areBlanks(ctxt, buf, nbchar)) {
2747 if (ctxt->sax->ignorableWhitespace != NULL)
2748 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2749 } else {
2750 htmlCheckParagraph(ctxt);
2751 if (ctxt->sax->characters != NULL)
2752 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2753 }
2754 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002755 } else {
2756 /*
2757 * Loop detection
2758 */
2759 if (cur == 0)
2760 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002761 }
2762}
2763
2764/**
2765 * htmlParseExternalID:
2766 * @ctxt: an HTML parser context
2767 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002768 *
2769 * Parse an External ID or a Public ID
2770 *
Owen Taylor3473f882001-02-23 17:55:21 +00002771 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2772 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2773 *
2774 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2775 *
2776 * Returns the function returns SystemLiteral and in the second
2777 * case publicID receives PubidLiteral, is strict is off
2778 * it is possible to return NULL and have publicID set.
2779 */
2780
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002781static xmlChar *
2782htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002783 xmlChar *URI = NULL;
2784
2785 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2786 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2787 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2788 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002789 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002790 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2791 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002792 }
2793 SKIP_BLANKS;
2794 URI = htmlParseSystemLiteral(ctxt);
2795 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002796 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2797 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002798 }
2799 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2800 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2801 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2802 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002803 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002804 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2805 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002806 }
2807 SKIP_BLANKS;
2808 *publicID = htmlParsePubidLiteral(ctxt);
2809 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002810 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2811 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2812 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002813 }
2814 SKIP_BLANKS;
2815 if ((CUR == '"') || (CUR == '\'')) {
2816 URI = htmlParseSystemLiteral(ctxt);
2817 }
2818 }
2819 return(URI);
2820}
2821
2822/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002823 * xmlParsePI:
2824 * @ctxt: an XML parser context
2825 *
2826 * parse an XML Processing Instruction.
2827 *
2828 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2829 */
2830static void
2831htmlParsePI(htmlParserCtxtPtr ctxt) {
2832 xmlChar *buf = NULL;
2833 int len = 0;
2834 int size = HTML_PARSER_BUFFER_SIZE;
2835 int cur, l;
2836 const xmlChar *target;
2837 xmlParserInputState state;
2838 int count = 0;
2839
2840 if ((RAW == '<') && (NXT(1) == '?')) {
2841 state = ctxt->instate;
2842 ctxt->instate = XML_PARSER_PI;
2843 /*
2844 * this is a Processing Instruction.
2845 */
2846 SKIP(2);
2847 SHRINK;
2848
2849 /*
2850 * Parse the target name and check for special support like
2851 * namespace.
2852 */
2853 target = htmlParseName(ctxt);
2854 if (target != NULL) {
2855 if (RAW == '>') {
2856 SKIP(1);
2857
2858 /*
2859 * SAX: PI detected.
2860 */
2861 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2862 (ctxt->sax->processingInstruction != NULL))
2863 ctxt->sax->processingInstruction(ctxt->userData,
2864 target, NULL);
2865 ctxt->instate = state;
2866 return;
2867 }
2868 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2869 if (buf == NULL) {
2870 htmlErrMemory(ctxt, NULL);
2871 ctxt->instate = state;
2872 return;
2873 }
2874 cur = CUR;
2875 if (!IS_BLANK(cur)) {
2876 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2877 "ParsePI: PI %s space expected\n", target, NULL);
2878 }
2879 SKIP_BLANKS;
2880 cur = CUR_CHAR(l);
2881 while (IS_CHAR(cur) && (cur != '>')) {
2882 if (len + 5 >= size) {
2883 xmlChar *tmp;
2884
2885 size *= 2;
2886 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2887 if (tmp == NULL) {
2888 htmlErrMemory(ctxt, NULL);
2889 xmlFree(buf);
2890 ctxt->instate = state;
2891 return;
2892 }
2893 buf = tmp;
2894 }
2895 count++;
2896 if (count > 50) {
2897 GROW;
2898 count = 0;
2899 }
2900 COPY_BUF(l,buf,len,cur);
2901 NEXTL(l);
2902 cur = CUR_CHAR(l);
2903 if (cur == 0) {
2904 SHRINK;
2905 GROW;
2906 cur = CUR_CHAR(l);
2907 }
2908 }
2909 buf[len] = 0;
2910 if (cur != '>') {
2911 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2912 "ParsePI: PI %s never end ...\n", target, NULL);
2913 } else {
2914 SKIP(1);
2915
2916 /*
2917 * SAX: PI detected.
2918 */
2919 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2920 (ctxt->sax->processingInstruction != NULL))
2921 ctxt->sax->processingInstruction(ctxt->userData,
2922 target, buf);
2923 }
2924 xmlFree(buf);
2925 } else {
2926 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2927 "PI is not started correctly", NULL, NULL);
2928 }
2929 ctxt->instate = state;
2930 }
2931}
2932
2933/**
Owen Taylor3473f882001-02-23 17:55:21 +00002934 * htmlParseComment:
2935 * @ctxt: an HTML parser context
2936 *
2937 * Parse an XML (SGML) comment <!-- .... -->
2938 *
2939 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2940 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002941static void
Owen Taylor3473f882001-02-23 17:55:21 +00002942htmlParseComment(htmlParserCtxtPtr ctxt) {
2943 xmlChar *buf = NULL;
2944 int len;
2945 int size = HTML_PARSER_BUFFER_SIZE;
2946 int q, ql;
2947 int r, rl;
2948 int cur, l;
2949 xmlParserInputState state;
2950
2951 /*
2952 * Check that there is a comment right here.
2953 */
2954 if ((RAW != '<') || (NXT(1) != '!') ||
2955 (NXT(2) != '-') || (NXT(3) != '-')) return;
2956
2957 state = ctxt->instate;
2958 ctxt->instate = XML_PARSER_COMMENT;
2959 SHRINK;
2960 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002961 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002962 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002963 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002964 ctxt->instate = state;
2965 return;
2966 }
2967 q = CUR_CHAR(ql);
2968 NEXTL(ql);
2969 r = CUR_CHAR(rl);
2970 NEXTL(rl);
2971 cur = CUR_CHAR(l);
2972 len = 0;
2973 while (IS_CHAR(cur) &&
2974 ((cur != '>') ||
2975 (r != '-') || (q != '-'))) {
2976 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00002977 xmlChar *tmp;
2978
Owen Taylor3473f882001-02-23 17:55:21 +00002979 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00002980 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2981 if (tmp == NULL) {
2982 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00002983 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002984 ctxt->instate = state;
2985 return;
2986 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00002987 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00002988 }
2989 COPY_BUF(ql,buf,len,q);
2990 q = r;
2991 ql = rl;
2992 r = cur;
2993 rl = l;
2994 NEXTL(l);
2995 cur = CUR_CHAR(l);
2996 if (cur == 0) {
2997 SHRINK;
2998 GROW;
2999 cur = CUR_CHAR(l);
3000 }
3001 }
3002 buf[len] = 0;
3003 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003004 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3005 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003006 xmlFree(buf);
3007 } else {
3008 NEXT;
3009 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3010 (!ctxt->disableSAX))
3011 ctxt->sax->comment(ctxt->userData, buf);
3012 xmlFree(buf);
3013 }
3014 ctxt->instate = state;
3015}
3016
3017/**
3018 * htmlParseCharRef:
3019 * @ctxt: an HTML parser context
3020 *
3021 * parse Reference declarations
3022 *
3023 * [66] CharRef ::= '&#' [0-9]+ ';' |
3024 * '&#x' [0-9a-fA-F]+ ';'
3025 *
3026 * Returns the value parsed (as an int)
3027 */
3028int
3029htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3030 int val = 0;
3031
Daniel Veillarda03e3652004-11-02 18:45:30 +00003032 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3033 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3034 "htmlParseCharRef: context error\n",
3035 NULL, NULL);
3036 return(0);
3037 }
Owen Taylor3473f882001-02-23 17:55:21 +00003038 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003039 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003040 SKIP(3);
3041 while (CUR != ';') {
3042 if ((CUR >= '0') && (CUR <= '9'))
3043 val = val * 16 + (CUR - '0');
3044 else if ((CUR >= 'a') && (CUR <= 'f'))
3045 val = val * 16 + (CUR - 'a') + 10;
3046 else if ((CUR >= 'A') && (CUR <= 'F'))
3047 val = val * 16 + (CUR - 'A') + 10;
3048 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003049 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3050 "htmlParseCharRef: invalid hexadecimal value\n",
3051 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003052 return(0);
3053 }
3054 NEXT;
3055 }
3056 if (CUR == ';')
3057 NEXT;
3058 } else if ((CUR == '&') && (NXT(1) == '#')) {
3059 SKIP(2);
3060 while (CUR != ';') {
3061 if ((CUR >= '0') && (CUR <= '9'))
3062 val = val * 10 + (CUR - '0');
3063 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003064 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3065 "htmlParseCharRef: invalid decimal value\n",
3066 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003067 return(0);
3068 }
3069 NEXT;
3070 }
3071 if (CUR == ';')
3072 NEXT;
3073 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003074 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3075 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003076 }
3077 /*
3078 * Check the value IS_CHAR ...
3079 */
3080 if (IS_CHAR(val)) {
3081 return(val);
3082 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003083 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3084 "htmlParseCharRef: invalid xmlChar value %d\n",
3085 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003086 }
3087 return(0);
3088}
3089
3090
3091/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003092 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003093 * @ctxt: an HTML parser context
3094 *
3095 * parse a DOCTYPE declaration
3096 *
3097 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3098 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3099 */
3100
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003101static void
Owen Taylor3473f882001-02-23 17:55:21 +00003102htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003103 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003104 xmlChar *ExternalID = NULL;
3105 xmlChar *URI = NULL;
3106
3107 /*
3108 * We know that '<!DOCTYPE' has been detected.
3109 */
3110 SKIP(9);
3111
3112 SKIP_BLANKS;
3113
3114 /*
3115 * Parse the DOCTYPE name.
3116 */
3117 name = htmlParseName(ctxt);
3118 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003119 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3120 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3121 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003122 }
3123 /*
3124 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3125 */
3126
3127 SKIP_BLANKS;
3128
3129 /*
3130 * Check for SystemID and ExternalID
3131 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003132 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003133 SKIP_BLANKS;
3134
3135 /*
3136 * We should be at the end of the DOCTYPE declaration.
3137 */
3138 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003139 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3140 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003141 /* We shouldn't try to resynchronize ... */
3142 }
3143 NEXT;
3144
3145 /*
3146 * Create or update the document accordingly to the DOCTYPE
3147 */
3148 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3149 (!ctxt->disableSAX))
3150 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3151
3152 /*
3153 * Cleanup, since we don't use all those identifiers
3154 */
3155 if (URI != NULL) xmlFree(URI);
3156 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003157}
3158
3159/**
3160 * htmlParseAttribute:
3161 * @ctxt: an HTML parser context
3162 * @value: a xmlChar ** used to store the value of the attribute
3163 *
3164 * parse an attribute
3165 *
3166 * [41] Attribute ::= Name Eq AttValue
3167 *
3168 * [25] Eq ::= S? '=' S?
3169 *
3170 * With namespace:
3171 *
3172 * [NS 11] Attribute ::= QName Eq AttValue
3173 *
3174 * Also the case QName == xmlns:??? is handled independently as a namespace
3175 * definition.
3176 *
3177 * Returns the attribute name, and the value in *value.
3178 */
3179
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003180static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003181htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003182 const xmlChar *name;
3183 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003184
3185 *value = NULL;
3186 name = htmlParseHTMLName(ctxt);
3187 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003188 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3189 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003190 return(NULL);
3191 }
3192
3193 /*
3194 * read the value
3195 */
3196 SKIP_BLANKS;
3197 if (CUR == '=') {
3198 NEXT;
3199 SKIP_BLANKS;
3200 val = htmlParseAttValue(ctxt);
3201 /******
3202 } else {
3203 * TODO : some attribute must have values, some may not
3204 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3205 ctxt->sax->warning(ctxt->userData,
3206 "No value for attribute %s\n", name); */
3207 }
3208
3209 *value = val;
3210 return(name);
3211}
3212
3213/**
3214 * htmlCheckEncoding:
3215 * @ctxt: an HTML parser context
3216 * @attvalue: the attribute value
3217 *
3218 * Checks an http-equiv attribute from a Meta tag to detect
3219 * the encoding
3220 * If a new encoding is detected the parser is switched to decode
3221 * it and pass UTF8
3222 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003223static void
Owen Taylor3473f882001-02-23 17:55:21 +00003224htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3225 const xmlChar *encoding;
3226
3227 if ((ctxt == NULL) || (attvalue == NULL))
3228 return;
3229
3230 /* do not change encoding */
3231 if (ctxt->input->encoding != NULL)
3232 return;
3233
3234 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3235 if (encoding != NULL) {
3236 encoding += 8;
3237 } else {
3238 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3239 if (encoding != NULL)
3240 encoding += 9;
3241 }
3242 if (encoding != NULL) {
3243 xmlCharEncoding enc;
3244 xmlCharEncodingHandlerPtr handler;
3245
3246 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3247
3248 if (ctxt->input->encoding != NULL)
3249 xmlFree((xmlChar *) ctxt->input->encoding);
3250 ctxt->input->encoding = xmlStrdup(encoding);
3251
3252 enc = xmlParseCharEncoding((const char *) encoding);
3253 /*
3254 * registered set of known encodings
3255 */
3256 if (enc != XML_CHAR_ENCODING_ERROR) {
3257 xmlSwitchEncoding(ctxt, enc);
3258 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3259 } else {
3260 /*
3261 * fallback for unknown encodings
3262 */
3263 handler = xmlFindCharEncodingHandler((const char *) encoding);
3264 if (handler != NULL) {
3265 xmlSwitchToEncoding(ctxt, handler);
3266 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3267 } else {
3268 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3269 }
3270 }
3271
3272 if ((ctxt->input->buf != NULL) &&
3273 (ctxt->input->buf->encoder != NULL) &&
3274 (ctxt->input->buf->raw != NULL) &&
3275 (ctxt->input->buf->buffer != NULL)) {
3276 int nbchars;
3277 int processed;
3278
3279 /*
3280 * convert as much as possible to the parser reading buffer.
3281 */
3282 processed = ctxt->input->cur - ctxt->input->base;
3283 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3284 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3285 ctxt->input->buf->buffer,
3286 ctxt->input->buf->raw);
3287 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003288 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3289 "htmlCheckEncoding: encoder error\n",
3290 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003291 }
3292 ctxt->input->base =
3293 ctxt->input->cur = ctxt->input->buf->buffer->content;
3294 }
3295 }
3296}
3297
3298/**
3299 * htmlCheckMeta:
3300 * @ctxt: an HTML parser context
3301 * @atts: the attributes values
3302 *
3303 * Checks an attributes from a Meta tag
3304 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003305static void
Owen Taylor3473f882001-02-23 17:55:21 +00003306htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3307 int i;
3308 const xmlChar *att, *value;
3309 int http = 0;
3310 const xmlChar *content = NULL;
3311
3312 if ((ctxt == NULL) || (atts == NULL))
3313 return;
3314
3315 i = 0;
3316 att = atts[i++];
3317 while (att != NULL) {
3318 value = atts[i++];
3319 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3320 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3321 http = 1;
3322 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3323 content = value;
3324 att = atts[i++];
3325 }
3326 if ((http) && (content != NULL))
3327 htmlCheckEncoding(ctxt, content);
3328
3329}
3330
3331/**
3332 * htmlParseStartTag:
3333 * @ctxt: an HTML parser context
3334 *
3335 * parse a start of tag either for rule element or
3336 * EmptyElement. In both case we don't parse the tag closing chars.
3337 *
3338 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3339 *
3340 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3341 *
3342 * With namespace:
3343 *
3344 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3345 *
3346 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3347 *
3348 */
3349
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003350static void
Owen Taylor3473f882001-02-23 17:55:21 +00003351htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003352 const xmlChar *name;
3353 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003354 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003355 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003356 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003357 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003358 int meta = 0;
3359 int i;
3360
Daniel Veillarda03e3652004-11-02 18:45:30 +00003361 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3362 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3363 "htmlParseStartTag: context error\n", NULL, NULL);
3364 return;
3365 }
Owen Taylor3473f882001-02-23 17:55:21 +00003366 if (CUR != '<') return;
3367 NEXT;
3368
3369 GROW;
3370 name = htmlParseHTMLName(ctxt);
3371 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003372 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3373 "htmlParseStartTag: invalid element name\n",
3374 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003375 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003376 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003377 NEXT;
3378 return;
3379 }
3380 if (xmlStrEqual(name, BAD_CAST"meta"))
3381 meta = 1;
3382
3383 /*
3384 * Check for auto-closure of HTML elements.
3385 */
3386 htmlAutoClose(ctxt, name);
3387
3388 /*
3389 * Check for implied HTML elements.
3390 */
3391 htmlCheckImplied(ctxt, name);
3392
3393 /*
3394 * Avoid html at any level > 0, head at any level != 1
3395 * or any attempt to recurse body
3396 */
3397 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003398 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3399 "htmlParseStartTag: misplaced <html> tag\n",
3400 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003401 return;
3402 }
3403 if ((ctxt->nameNr != 1) &&
3404 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003405 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3406 "htmlParseStartTag: misplaced <head> tag\n",
3407 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003408 return;
3409 }
3410 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003411 int indx;
3412 for (indx = 0;indx < ctxt->nameNr;indx++) {
3413 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003414 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3415 "htmlParseStartTag: misplaced <body> tag\n",
3416 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003417 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3418 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003419 return;
3420 }
3421 }
3422 }
3423
3424 /*
3425 * Now parse the attributes, it ends up with the ending
3426 *
3427 * (S Attribute)* S?
3428 */
3429 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003430 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003431 (CUR != '>') &&
3432 ((CUR != '/') || (NXT(1) != '>'))) {
3433 long cons = ctxt->nbChars;
3434
3435 GROW;
3436 attname = htmlParseAttribute(ctxt, &attvalue);
3437 if (attname != NULL) {
3438
3439 /*
3440 * Well formedness requires at most one declaration of an attribute
3441 */
3442 for (i = 0; i < nbatts;i += 2) {
3443 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003444 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3445 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003446 if (attvalue != NULL)
3447 xmlFree(attvalue);
3448 goto failed;
3449 }
3450 }
3451
3452 /*
3453 * Add the pair to atts
3454 */
3455 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003456 maxatts = 22; /* allow for 10 attrs by default */
3457 atts = (const xmlChar **)
3458 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003459 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003460 htmlErrMemory(ctxt, NULL);
3461 if (attvalue != NULL)
3462 xmlFree(attvalue);
3463 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003464 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003465 ctxt->atts = atts;
3466 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003467 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003468 const xmlChar **n;
3469
Owen Taylor3473f882001-02-23 17:55:21 +00003470 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003471 n = (const xmlChar **) xmlRealloc((void *) atts,
3472 maxatts * sizeof(const xmlChar *));
3473 if (n == NULL) {
3474 htmlErrMemory(ctxt, NULL);
3475 if (attvalue != NULL)
3476 xmlFree(attvalue);
3477 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003478 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003479 atts = n;
3480 ctxt->atts = atts;
3481 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003482 }
3483 atts[nbatts++] = attname;
3484 atts[nbatts++] = attvalue;
3485 atts[nbatts] = NULL;
3486 atts[nbatts + 1] = NULL;
3487 }
3488 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003489 if (attvalue != NULL)
3490 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003491 /* Dump the bogus attribute string up to the next blank or
3492 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003493 while ((IS_CHAR_CH(CUR)) &&
3494 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003495 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003496 NEXT;
3497 }
3498
3499failed:
3500 SKIP_BLANKS;
3501 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003502 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3503 "htmlParseStartTag: problem parsing attributes\n",
3504 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003505 break;
3506 }
3507 }
3508
3509 /*
3510 * Handle specific association to the META tag
3511 */
3512 if (meta)
3513 htmlCheckMeta(ctxt, atts);
3514
3515 /*
3516 * SAX: Start of Element !
3517 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003518 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003519 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3520 if (nbatts != 0)
3521 ctxt->sax->startElement(ctxt->userData, name, atts);
3522 else
3523 ctxt->sax->startElement(ctxt->userData, name, NULL);
3524 }
Owen Taylor3473f882001-02-23 17:55:21 +00003525
3526 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003527 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003528 if (atts[i] != NULL)
3529 xmlFree((xmlChar *) atts[i]);
3530 }
Owen Taylor3473f882001-02-23 17:55:21 +00003531 }
Owen Taylor3473f882001-02-23 17:55:21 +00003532}
3533
3534/**
3535 * htmlParseEndTag:
3536 * @ctxt: an HTML parser context
3537 *
3538 * parse an end of tag
3539 *
3540 * [42] ETag ::= '</' Name S? '>'
3541 *
3542 * With namespace
3543 *
3544 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003545 *
3546 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003547 */
3548
Daniel Veillardf420ac52001-07-04 16:04:09 +00003549static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003550htmlParseEndTag(htmlParserCtxtPtr ctxt)
3551{
3552 const xmlChar *name;
3553 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003554 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003555
3556 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003557 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3558 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003559 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003560 }
3561 SKIP(2);
3562
3563 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003564 if (name == NULL)
3565 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003566
3567 /*
3568 * We should definitely be at the ending "S? '>'" part
3569 */
3570 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003571 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003572 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3573 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003574 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003575 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003576
3577 /*
3578 * If the name read is not one of the element in the parsing stack
3579 * then return, it's just an error.
3580 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003581 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3582 if (xmlStrEqual(name, ctxt->nameTab[i]))
3583 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003584 }
3585 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003586 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3587 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003588 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003589 }
3590
3591
3592 /*
3593 * Check for auto-closure of HTML elements.
3594 */
3595
3596 htmlAutoCloseOnClose(ctxt, name);
3597
3598 /*
3599 * Well formedness constraints, opening and closing must match.
3600 * With the exception that the autoclose may have popped stuff out
3601 * of the stack.
3602 */
3603 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003604 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003605 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3606 "Opening and ending tag mismatch: %s and %s\n",
3607 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003608 }
3609 }
3610
3611 /*
3612 * SAX: End of Tag
3613 */
3614 oldname = ctxt->name;
3615 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003616 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3617 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003618 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003619 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003620 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003621 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003622 }
3623
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003624 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003625}
3626
3627
3628/**
3629 * htmlParseReference:
3630 * @ctxt: an HTML parser context
3631 *
3632 * parse and handle entity references in content,
3633 * this will end-up in a call to character() since this is either a
3634 * CharRef, or a predefined entity.
3635 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003636static void
Owen Taylor3473f882001-02-23 17:55:21 +00003637htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003638 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003639 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003640 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003641 if (CUR != '&') return;
3642
3643 if (NXT(1) == '#') {
3644 unsigned int c;
3645 int bits, i = 0;
3646
3647 c = htmlParseCharRef(ctxt);
3648 if (c == 0)
3649 return;
3650
3651 if (c < 0x80) { out[i++]= c; bits= -6; }
3652 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3653 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3654 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3655
3656 for ( ; bits >= 0; bits-= 6) {
3657 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3658 }
3659 out[i] = 0;
3660
3661 htmlCheckParagraph(ctxt);
3662 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3663 ctxt->sax->characters(ctxt->userData, out, i);
3664 } else {
3665 ent = htmlParseEntityRef(ctxt, &name);
3666 if (name == NULL) {
3667 htmlCheckParagraph(ctxt);
3668 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3669 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3670 return;
3671 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003672 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003673 htmlCheckParagraph(ctxt);
3674 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3675 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3676 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3677 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3678 }
3679 } else {
3680 unsigned int c;
3681 int bits, i = 0;
3682
3683 c = ent->value;
3684 if (c < 0x80)
3685 { out[i++]= c; bits= -6; }
3686 else if (c < 0x800)
3687 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3688 else if (c < 0x10000)
3689 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3690 else
3691 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3692
3693 for ( ; bits >= 0; bits-= 6) {
3694 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3695 }
3696 out[i] = 0;
3697
3698 htmlCheckParagraph(ctxt);
3699 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3700 ctxt->sax->characters(ctxt->userData, out, i);
3701 }
Owen Taylor3473f882001-02-23 17:55:21 +00003702 }
3703}
3704
3705/**
3706 * htmlParseContent:
3707 * @ctxt: an HTML parser context
3708 * @name: the node name
3709 *
3710 * Parse a content: comment, sub-element, reference or text.
3711 *
3712 */
3713
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003714static void
Owen Taylor3473f882001-02-23 17:55:21 +00003715htmlParseContent(htmlParserCtxtPtr ctxt) {
3716 xmlChar *currentNode;
3717 int depth;
3718
3719 currentNode = xmlStrdup(ctxt->name);
3720 depth = ctxt->nameNr;
3721 while (1) {
3722 long cons = ctxt->nbChars;
3723
3724 GROW;
3725 /*
3726 * Our tag or one of it's parent or children is ending.
3727 */
3728 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003729 if (htmlParseEndTag(ctxt) &&
3730 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3731 if (currentNode != NULL)
3732 xmlFree(currentNode);
3733 return;
3734 }
3735 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003736 }
3737
3738 /*
3739 * Has this node been popped out during parsing of
3740 * the next element
3741 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003742 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3743 (!xmlStrEqual(currentNode, ctxt->name)))
3744 {
Owen Taylor3473f882001-02-23 17:55:21 +00003745 if (currentNode != NULL) xmlFree(currentNode);
3746 return;
3747 }
3748
Daniel Veillardf9533d12001-03-03 10:04:57 +00003749 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3750 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003751 /*
3752 * Handle SCRIPT/STYLE separately
3753 */
3754 htmlParseScript(ctxt);
3755 } else {
3756 /*
3757 * Sometimes DOCTYPE arrives in the middle of the document
3758 */
3759 if ((CUR == '<') && (NXT(1) == '!') &&
3760 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3761 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3762 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3763 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003764 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3765 "Misplaced DOCTYPE declaration\n",
3766 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003767 htmlParseDocTypeDecl(ctxt);
3768 }
3769
3770 /*
3771 * First case : a comment
3772 */
3773 if ((CUR == '<') && (NXT(1) == '!') &&
3774 (NXT(2) == '-') && (NXT(3) == '-')) {
3775 htmlParseComment(ctxt);
3776 }
3777
3778 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003779 * Second case : a Processing Instruction.
3780 */
3781 else if ((CUR == '<') && (NXT(1) == '?')) {
3782 htmlParsePI(ctxt);
3783 }
3784
3785 /*
3786 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003787 */
3788 else if (CUR == '<') {
3789 htmlParseElement(ctxt);
3790 }
3791
3792 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003793 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003794 * parsing returns it's Name, create the node
3795 */
3796 else if (CUR == '&') {
3797 htmlParseReference(ctxt);
3798 }
3799
3800 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003801 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003802 */
3803 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003804 htmlAutoCloseOnEnd(ctxt);
3805 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003806 }
3807
3808 /*
3809 * Last case, text. Note that References are handled directly.
3810 */
3811 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003812 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003813 }
3814
3815 if (cons == ctxt->nbChars) {
3816 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003817 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3818 "detected an error in element content\n",
3819 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003820 }
3821 break;
3822 }
3823 }
3824 GROW;
3825 }
3826 if (currentNode != NULL) xmlFree(currentNode);
3827}
3828
3829/**
3830 * htmlParseElement:
3831 * @ctxt: an HTML parser context
3832 *
3833 * parse an HTML element, this is highly recursive
3834 *
3835 * [39] element ::= EmptyElemTag | STag content ETag
3836 *
3837 * [41] Attribute ::= Name Eq AttValue
3838 */
3839
3840void
3841htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003842 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003843 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003844 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003845 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003846 const xmlChar *oldname;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003847 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003848 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003849
Daniel Veillarda03e3652004-11-02 18:45:30 +00003850 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3851 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3852 "htmlParseStartTag: context error\n", NULL, NULL);
3853 return;
3854 }
3855 depth = ctxt->nameNr;
Owen Taylor3473f882001-02-23 17:55:21 +00003856 /* Capture start position */
3857 if (ctxt->record_info) {
3858 node_info.begin_pos = ctxt->input->consumed +
3859 (CUR_PTR - ctxt->input->base);
3860 node_info.begin_line = ctxt->input->line;
3861 }
3862
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003863 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003864 htmlParseStartTag(ctxt);
3865 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003866 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3867 (name == NULL)) {
3868 if (CUR == '>')
3869 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003870 return;
3871 }
Owen Taylor3473f882001-02-23 17:55:21 +00003872
3873 /*
3874 * Lookup the info for that element.
3875 */
3876 info = htmlTagLookup(name);
3877 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003878 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3879 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003880 }
3881
3882 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003883 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003884 */
3885 if ((CUR == '/') && (NXT(1) == '>')) {
3886 SKIP(2);
3887 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3888 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003889 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003890 return;
3891 }
3892
3893 if (CUR == '>') {
3894 NEXT;
3895 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003896 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3897 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003898
3899 /*
3900 * end of parsing of this node.
3901 */
3902 if (xmlStrEqual(name, ctxt->name)) {
3903 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003904 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003905 }
3906
3907 /*
3908 * Capture end position and add node
3909 */
3910 if ( currentNode != NULL && ctxt->record_info ) {
3911 node_info.end_pos = ctxt->input->consumed +
3912 (CUR_PTR - ctxt->input->base);
3913 node_info.end_line = ctxt->input->line;
3914 node_info.node = ctxt->node;
3915 xmlParserAddNodeInfo(ctxt, &node_info);
3916 }
3917 return;
3918 }
3919
3920 /*
3921 * Check for an Empty Element from DTD definition
3922 */
3923 if ((info != NULL) && (info->empty)) {
3924 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3925 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003926 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003927 return;
3928 }
3929
3930 /*
3931 * Parse the content of the element:
3932 */
3933 currentNode = xmlStrdup(ctxt->name);
3934 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003935 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003936 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003937 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003938 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003939 if (ctxt->nameNr < depth) break;
3940 }
3941
Owen Taylor3473f882001-02-23 17:55:21 +00003942 /*
3943 * Capture end position and add node
3944 */
3945 if ( currentNode != NULL && ctxt->record_info ) {
3946 node_info.end_pos = ctxt->input->consumed +
3947 (CUR_PTR - ctxt->input->base);
3948 node_info.end_line = ctxt->input->line;
3949 node_info.node = ctxt->node;
3950 xmlParserAddNodeInfo(ctxt, &node_info);
3951 }
William M. Brack76e95df2003-10-18 16:20:14 +00003952 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003953 htmlAutoCloseOnEnd(ctxt);
3954 }
3955
Owen Taylor3473f882001-02-23 17:55:21 +00003956 if (currentNode != NULL)
3957 xmlFree(currentNode);
3958}
3959
3960/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003961 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003962 * @ctxt: an HTML parser context
3963 *
3964 * parse an HTML document (and build a tree if using the standard SAX
3965 * interface).
3966 *
3967 * Returns 0, -1 in case of error. the parser context is augmented
3968 * as a result of the parsing.
3969 */
3970
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003971int
Owen Taylor3473f882001-02-23 17:55:21 +00003972htmlParseDocument(htmlParserCtxtPtr ctxt) {
3973 xmlDtdPtr dtd;
3974
Daniel Veillardd0463562001-10-13 09:15:48 +00003975 xmlInitParser();
3976
Owen Taylor3473f882001-02-23 17:55:21 +00003977 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00003978
Daniel Veillarda03e3652004-11-02 18:45:30 +00003979 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3980 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3981 "htmlParseDocument: context error\n", NULL, NULL);
3982 return(XML_ERR_INTERNAL_ERROR);
3983 }
3984 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00003985 GROW;
3986 /*
3987 * SAX: beginning of the document processing.
3988 */
3989 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3990 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3991
3992 /*
3993 * Wipe out everything which is before the first '<'
3994 */
3995 SKIP_BLANKS;
3996 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003997 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3998 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003999 }
4000
4001 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4002 ctxt->sax->startDocument(ctxt->userData);
4003
4004
4005 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004006 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004007 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004008 while (((CUR == '<') && (NXT(1) == '!') &&
4009 (NXT(2) == '-') && (NXT(3) == '-')) ||
4010 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004011 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004012 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004013 SKIP_BLANKS;
4014 }
4015
4016
4017 /*
4018 * Then possibly doc type declaration(s) and more Misc
4019 * (doctypedecl Misc*)?
4020 */
4021 if ((CUR == '<') && (NXT(1) == '!') &&
4022 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4023 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4024 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4025 (UPP(8) == 'E')) {
4026 htmlParseDocTypeDecl(ctxt);
4027 }
4028 SKIP_BLANKS;
4029
4030 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004031 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004032 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004033 while (((CUR == '<') && (NXT(1) == '!') &&
4034 (NXT(2) == '-') && (NXT(3) == '-')) ||
4035 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004036 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004037 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004038 SKIP_BLANKS;
4039 }
4040
4041 /*
4042 * Time to start parsing the tree itself
4043 */
4044 htmlParseContent(ctxt);
4045
4046 /*
4047 * autoclose
4048 */
4049 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004050 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004051
4052
4053 /*
4054 * SAX: end of the document processing.
4055 */
4056 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4057 ctxt->sax->endDocument(ctxt->userData);
4058
4059 if (ctxt->myDoc != NULL) {
4060 dtd = xmlGetIntSubset(ctxt->myDoc);
4061 if (dtd == NULL)
4062 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004063 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004064 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4065 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4066 }
4067 if (! ctxt->wellFormed) return(-1);
4068 return(0);
4069}
4070
4071
4072/************************************************************************
4073 * *
4074 * Parser contexts handling *
4075 * *
4076 ************************************************************************/
4077
4078/**
William M. Brackedb65a72004-02-06 07:36:04 +00004079 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004080 * @ctxt: an HTML parser context
4081 *
4082 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004083 *
4084 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004085 */
4086
Daniel Veillardf403d292003-10-05 13:51:35 +00004087static int
Owen Taylor3473f882001-02-23 17:55:21 +00004088htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4089{
4090 htmlSAXHandler *sax;
4091
Daniel Veillardf403d292003-10-05 13:51:35 +00004092 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004093 memset(ctxt, 0, sizeof(htmlParserCtxt));
4094
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004095 ctxt->dict = xmlDictCreate();
4096 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004097 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4098 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004099 }
Owen Taylor3473f882001-02-23 17:55:21 +00004100 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4101 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004102 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4103 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004104 }
4105 else
4106 memset(sax, 0, sizeof(htmlSAXHandler));
4107
4108 /* Allocate the Input stack */
4109 ctxt->inputTab = (htmlParserInputPtr *)
4110 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4111 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004112 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004113 ctxt->inputNr = 0;
4114 ctxt->inputMax = 0;
4115 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004116 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004117 }
4118 ctxt->inputNr = 0;
4119 ctxt->inputMax = 5;
4120 ctxt->input = NULL;
4121 ctxt->version = NULL;
4122 ctxt->encoding = NULL;
4123 ctxt->standalone = -1;
4124 ctxt->instate = XML_PARSER_START;
4125
4126 /* Allocate the Node stack */
4127 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4128 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004129 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004130 ctxt->nodeNr = 0;
4131 ctxt->nodeMax = 0;
4132 ctxt->node = NULL;
4133 ctxt->inputNr = 0;
4134 ctxt->inputMax = 0;
4135 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004136 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004137 }
4138 ctxt->nodeNr = 0;
4139 ctxt->nodeMax = 10;
4140 ctxt->node = NULL;
4141
4142 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004143 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004144 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004145 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004146 ctxt->nameNr = 0;
4147 ctxt->nameMax = 10;
4148 ctxt->name = NULL;
4149 ctxt->nodeNr = 0;
4150 ctxt->nodeMax = 0;
4151 ctxt->node = NULL;
4152 ctxt->inputNr = 0;
4153 ctxt->inputMax = 0;
4154 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004155 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004156 }
4157 ctxt->nameNr = 0;
4158 ctxt->nameMax = 10;
4159 ctxt->name = NULL;
4160
Daniel Veillard092643b2003-09-25 14:29:29 +00004161 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004162 else {
4163 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004164 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004165 }
4166 ctxt->userData = ctxt;
4167 ctxt->myDoc = NULL;
4168 ctxt->wellFormed = 1;
4169 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004170 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004171 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004172 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004173 ctxt->vctxt.userData = ctxt;
4174 ctxt->vctxt.error = xmlParserValidityError;
4175 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004176 ctxt->record_info = 0;
4177 ctxt->validate = 0;
4178 ctxt->nbChars = 0;
4179 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004180 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004181 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004182 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004183}
4184
4185/**
4186 * htmlFreeParserCtxt:
4187 * @ctxt: an HTML parser context
4188 *
4189 * Free all the memory used by a parser context. However the parsed
4190 * document in ctxt->myDoc is not freed.
4191 */
4192
4193void
4194htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4195{
4196 xmlFreeParserCtxt(ctxt);
4197}
4198
4199/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004200 * htmlNewParserCtxt:
4201 *
4202 * Allocate and initialize a new parser context.
4203 *
4204 * Returns the xmlParserCtxtPtr or NULL
4205 */
4206
4207static htmlParserCtxtPtr
4208htmlNewParserCtxt(void)
4209{
4210 xmlParserCtxtPtr ctxt;
4211
4212 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4213 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004214 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004215 return(NULL);
4216 }
4217 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004218 if (htmlInitParserCtxt(ctxt) < 0) {
4219 htmlFreeParserCtxt(ctxt);
4220 return(NULL);
4221 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004222 return(ctxt);
4223}
4224
4225/**
4226 * htmlCreateMemoryParserCtxt:
4227 * @buffer: a pointer to a char array
4228 * @size: the size of the array
4229 *
4230 * Create a parser context for an HTML in-memory document.
4231 *
4232 * Returns the new parser context or NULL
4233 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004234htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004235htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4236 xmlParserCtxtPtr ctxt;
4237 xmlParserInputPtr input;
4238 xmlParserInputBufferPtr buf;
4239
4240 if (buffer == NULL)
4241 return(NULL);
4242 if (size <= 0)
4243 return(NULL);
4244
4245 ctxt = htmlNewParserCtxt();
4246 if (ctxt == NULL)
4247 return(NULL);
4248
4249 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4250 if (buf == NULL) return(NULL);
4251
4252 input = xmlNewInputStream(ctxt);
4253 if (input == NULL) {
4254 xmlFreeParserCtxt(ctxt);
4255 return(NULL);
4256 }
4257
4258 input->filename = NULL;
4259 input->buf = buf;
4260 input->base = input->buf->buffer->content;
4261 input->cur = input->buf->buffer->content;
4262 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4263
4264 inputPush(ctxt, input);
4265 return(ctxt);
4266}
4267
4268/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004269 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004270 * @cur: a pointer to an array of xmlChar
4271 * @encoding: a free form C string describing the HTML document encoding, or NULL
4272 *
4273 * Create a parser context for an HTML document.
4274 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004275 * TODO: check the need to add encoding handling there
4276 *
Owen Taylor3473f882001-02-23 17:55:21 +00004277 * Returns the new parser context or NULL
4278 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004279static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004280htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004281 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004282 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004283
Daniel Veillard1d995272002-07-22 16:43:32 +00004284 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004285 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004286 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004287 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4288
4289 if (encoding != NULL) {
4290 xmlCharEncoding enc;
4291 xmlCharEncodingHandlerPtr handler;
4292
4293 if (ctxt->input->encoding != NULL)
4294 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004295 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004296
4297 enc = xmlParseCharEncoding(encoding);
4298 /*
4299 * registered set of known encodings
4300 */
4301 if (enc != XML_CHAR_ENCODING_ERROR) {
4302 xmlSwitchEncoding(ctxt, enc);
4303 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004304 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4305 "Unsupported encoding %s\n",
4306 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004307 }
4308 } else {
4309 /*
4310 * fallback for unknown encodings
4311 */
4312 handler = xmlFindCharEncodingHandler((const char *) encoding);
4313 if (handler != NULL) {
4314 xmlSwitchToEncoding(ctxt, handler);
4315 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004316 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4317 "Unsupported encoding %s\n",
4318 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004319 }
4320 }
4321 }
4322 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004323}
4324
Daniel Veillard73b013f2003-09-30 12:36:01 +00004325#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004326/************************************************************************
4327 * *
4328 * Progressive parsing interfaces *
4329 * *
4330 ************************************************************************/
4331
4332/**
4333 * htmlParseLookupSequence:
4334 * @ctxt: an HTML parser context
4335 * @first: the first char to lookup
4336 * @next: the next char to lookup or zero
4337 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004338 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004339 *
4340 * Try to find if a sequence (first, next, third) or just (first next) or
4341 * (first) is available in the input stream.
4342 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4343 * to avoid rescanning sequences of bytes, it DOES change the state of the
4344 * parser, do not use liberally.
4345 * This is basically similar to xmlParseLookupSequence()
4346 *
4347 * Returns the index to the current parsing point if the full sequence
4348 * is available, -1 otherwise.
4349 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004350static int
Owen Taylor3473f882001-02-23 17:55:21 +00004351htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004352 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004353 int base, len;
4354 htmlParserInputPtr in;
4355 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004356 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004357
4358 in = ctxt->input;
4359 if (in == NULL) return(-1);
4360 base = in->cur - in->base;
4361 if (base < 0) return(-1);
4362 if (ctxt->checkIndex > base)
4363 base = ctxt->checkIndex;
4364 if (in->buf == NULL) {
4365 buf = in->base;
4366 len = in->length;
4367 } else {
4368 buf = in->buf->buffer->content;
4369 len = in->buf->buffer->use;
4370 }
4371 /* take into account the sequence length */
4372 if (third) len -= 2;
4373 else if (next) len --;
4374 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004375 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004376 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4377 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4378 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004379 /* do not increment past <! - some people use <!--> */
4380 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004381 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004382 }
4383 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004384 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004385 return(-1);
4386 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4387 (buf[base + 2] == '>')) {
4388 incomment = 0;
4389 base += 2;
4390 }
4391 continue;
4392 }
Owen Taylor3473f882001-02-23 17:55:21 +00004393 if (buf[base] == first) {
4394 if (third != 0) {
4395 if ((buf[base + 1] != next) ||
4396 (buf[base + 2] != third)) continue;
4397 } else if (next != 0) {
4398 if (buf[base + 1] != next) continue;
4399 }
4400 ctxt->checkIndex = 0;
4401#ifdef DEBUG_PUSH
4402 if (next == 0)
4403 xmlGenericError(xmlGenericErrorContext,
4404 "HPP: lookup '%c' found at %d\n",
4405 first, base);
4406 else if (third == 0)
4407 xmlGenericError(xmlGenericErrorContext,
4408 "HPP: lookup '%c%c' found at %d\n",
4409 first, next, base);
4410 else
4411 xmlGenericError(xmlGenericErrorContext,
4412 "HPP: lookup '%c%c%c' found at %d\n",
4413 first, next, third, base);
4414#endif
4415 return(base - (in->cur - in->base));
4416 }
4417 }
4418 ctxt->checkIndex = base;
4419#ifdef DEBUG_PUSH
4420 if (next == 0)
4421 xmlGenericError(xmlGenericErrorContext,
4422 "HPP: lookup '%c' failed\n", first);
4423 else if (third == 0)
4424 xmlGenericError(xmlGenericErrorContext,
4425 "HPP: lookup '%c%c' failed\n", first, next);
4426 else
4427 xmlGenericError(xmlGenericErrorContext,
4428 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4429#endif
4430 return(-1);
4431}
4432
4433/**
4434 * htmlParseTryOrFinish:
4435 * @ctxt: an HTML parser context
4436 * @terminate: last chunk indicator
4437 *
4438 * Try to progress on parsing
4439 *
4440 * Returns zero if no parsing was possible
4441 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004442static int
Owen Taylor3473f882001-02-23 17:55:21 +00004443htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4444 int ret = 0;
4445 htmlParserInputPtr in;
4446 int avail = 0;
4447 xmlChar cur, next;
4448
4449#ifdef DEBUG_PUSH
4450 switch (ctxt->instate) {
4451 case XML_PARSER_EOF:
4452 xmlGenericError(xmlGenericErrorContext,
4453 "HPP: try EOF\n"); break;
4454 case XML_PARSER_START:
4455 xmlGenericError(xmlGenericErrorContext,
4456 "HPP: try START\n"); break;
4457 case XML_PARSER_MISC:
4458 xmlGenericError(xmlGenericErrorContext,
4459 "HPP: try MISC\n");break;
4460 case XML_PARSER_COMMENT:
4461 xmlGenericError(xmlGenericErrorContext,
4462 "HPP: try COMMENT\n");break;
4463 case XML_PARSER_PROLOG:
4464 xmlGenericError(xmlGenericErrorContext,
4465 "HPP: try PROLOG\n");break;
4466 case XML_PARSER_START_TAG:
4467 xmlGenericError(xmlGenericErrorContext,
4468 "HPP: try START_TAG\n");break;
4469 case XML_PARSER_CONTENT:
4470 xmlGenericError(xmlGenericErrorContext,
4471 "HPP: try CONTENT\n");break;
4472 case XML_PARSER_CDATA_SECTION:
4473 xmlGenericError(xmlGenericErrorContext,
4474 "HPP: try CDATA_SECTION\n");break;
4475 case XML_PARSER_END_TAG:
4476 xmlGenericError(xmlGenericErrorContext,
4477 "HPP: try END_TAG\n");break;
4478 case XML_PARSER_ENTITY_DECL:
4479 xmlGenericError(xmlGenericErrorContext,
4480 "HPP: try ENTITY_DECL\n");break;
4481 case XML_PARSER_ENTITY_VALUE:
4482 xmlGenericError(xmlGenericErrorContext,
4483 "HPP: try ENTITY_VALUE\n");break;
4484 case XML_PARSER_ATTRIBUTE_VALUE:
4485 xmlGenericError(xmlGenericErrorContext,
4486 "HPP: try ATTRIBUTE_VALUE\n");break;
4487 case XML_PARSER_DTD:
4488 xmlGenericError(xmlGenericErrorContext,
4489 "HPP: try DTD\n");break;
4490 case XML_PARSER_EPILOG:
4491 xmlGenericError(xmlGenericErrorContext,
4492 "HPP: try EPILOG\n");break;
4493 case XML_PARSER_PI:
4494 xmlGenericError(xmlGenericErrorContext,
4495 "HPP: try PI\n");break;
4496 case XML_PARSER_SYSTEM_LITERAL:
4497 xmlGenericError(xmlGenericErrorContext,
4498 "HPP: try SYSTEM_LITERAL\n");break;
4499 }
4500#endif
4501
4502 while (1) {
4503
4504 in = ctxt->input;
4505 if (in == NULL) break;
4506 if (in->buf == NULL)
4507 avail = in->length - (in->cur - in->base);
4508 else
4509 avail = in->buf->buffer->use - (in->cur - in->base);
4510 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004511 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004512 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4513 /*
4514 * SAX: end of the document processing.
4515 */
4516 ctxt->instate = XML_PARSER_EOF;
4517 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4518 ctxt->sax->endDocument(ctxt->userData);
4519 }
4520 }
4521 if (avail < 1)
4522 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004523 cur = in->cur[0];
4524 if (cur == 0) {
4525 SKIP(1);
4526 continue;
4527 }
4528
Owen Taylor3473f882001-02-23 17:55:21 +00004529 switch (ctxt->instate) {
4530 case XML_PARSER_EOF:
4531 /*
4532 * Document parsing is done !
4533 */
4534 goto done;
4535 case XML_PARSER_START:
4536 /*
4537 * Very first chars read from the document flow.
4538 */
4539 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004540 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004541 SKIP_BLANKS;
4542 if (in->buf == NULL)
4543 avail = in->length - (in->cur - in->base);
4544 else
4545 avail = in->buf->buffer->use - (in->cur - in->base);
4546 }
4547 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4548 ctxt->sax->setDocumentLocator(ctxt->userData,
4549 &xmlDefaultSAXLocator);
4550 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4551 (!ctxt->disableSAX))
4552 ctxt->sax->startDocument(ctxt->userData);
4553
4554 cur = in->cur[0];
4555 next = in->cur[1];
4556 if ((cur == '<') && (next == '!') &&
4557 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4558 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4559 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4560 (UPP(8) == 'E')) {
4561 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004562 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004563 goto done;
4564#ifdef DEBUG_PUSH
4565 xmlGenericError(xmlGenericErrorContext,
4566 "HPP: Parsing internal subset\n");
4567#endif
4568 htmlParseDocTypeDecl(ctxt);
4569 ctxt->instate = XML_PARSER_PROLOG;
4570#ifdef DEBUG_PUSH
4571 xmlGenericError(xmlGenericErrorContext,
4572 "HPP: entering PROLOG\n");
4573#endif
4574 } else {
4575 ctxt->instate = XML_PARSER_MISC;
4576 }
4577#ifdef DEBUG_PUSH
4578 xmlGenericError(xmlGenericErrorContext,
4579 "HPP: entering MISC\n");
4580#endif
4581 break;
4582 case XML_PARSER_MISC:
4583 SKIP_BLANKS;
4584 if (in->buf == NULL)
4585 avail = in->length - (in->cur - in->base);
4586 else
4587 avail = in->buf->buffer->use - (in->cur - in->base);
4588 if (avail < 2)
4589 goto done;
4590 cur = in->cur[0];
4591 next = in->cur[1];
4592 if ((cur == '<') && (next == '!') &&
4593 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4594 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004595 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004596 goto done;
4597#ifdef DEBUG_PUSH
4598 xmlGenericError(xmlGenericErrorContext,
4599 "HPP: Parsing Comment\n");
4600#endif
4601 htmlParseComment(ctxt);
4602 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004603 } else if ((cur == '<') && (next == '?')) {
4604 if ((!terminate) &&
4605 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4606 goto done;
4607#ifdef DEBUG_PUSH
4608 xmlGenericError(xmlGenericErrorContext,
4609 "HPP: Parsing PI\n");
4610#endif
4611 htmlParsePI(ctxt);
4612 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004613 } else if ((cur == '<') && (next == '!') &&
4614 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4615 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4616 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4617 (UPP(8) == 'E')) {
4618 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004619 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004620 goto done;
4621#ifdef DEBUG_PUSH
4622 xmlGenericError(xmlGenericErrorContext,
4623 "HPP: Parsing internal subset\n");
4624#endif
4625 htmlParseDocTypeDecl(ctxt);
4626 ctxt->instate = XML_PARSER_PROLOG;
4627#ifdef DEBUG_PUSH
4628 xmlGenericError(xmlGenericErrorContext,
4629 "HPP: entering PROLOG\n");
4630#endif
4631 } else if ((cur == '<') && (next == '!') &&
4632 (avail < 9)) {
4633 goto done;
4634 } else {
4635 ctxt->instate = XML_PARSER_START_TAG;
4636#ifdef DEBUG_PUSH
4637 xmlGenericError(xmlGenericErrorContext,
4638 "HPP: entering START_TAG\n");
4639#endif
4640 }
4641 break;
4642 case XML_PARSER_PROLOG:
4643 SKIP_BLANKS;
4644 if (in->buf == NULL)
4645 avail = in->length - (in->cur - in->base);
4646 else
4647 avail = in->buf->buffer->use - (in->cur - in->base);
4648 if (avail < 2)
4649 goto done;
4650 cur = in->cur[0];
4651 next = in->cur[1];
4652 if ((cur == '<') && (next == '!') &&
4653 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4654 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004655 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004656 goto done;
4657#ifdef DEBUG_PUSH
4658 xmlGenericError(xmlGenericErrorContext,
4659 "HPP: Parsing Comment\n");
4660#endif
4661 htmlParseComment(ctxt);
4662 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004663 } else if ((cur == '<') && (next == '?')) {
4664 if ((!terminate) &&
4665 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4666 goto done;
4667#ifdef DEBUG_PUSH
4668 xmlGenericError(xmlGenericErrorContext,
4669 "HPP: Parsing PI\n");
4670#endif
4671 htmlParsePI(ctxt);
4672 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004673 } else if ((cur == '<') && (next == '!') &&
4674 (avail < 4)) {
4675 goto done;
4676 } else {
4677 ctxt->instate = XML_PARSER_START_TAG;
4678#ifdef DEBUG_PUSH
4679 xmlGenericError(xmlGenericErrorContext,
4680 "HPP: entering START_TAG\n");
4681#endif
4682 }
4683 break;
4684 case XML_PARSER_EPILOG:
4685 if (in->buf == NULL)
4686 avail = in->length - (in->cur - in->base);
4687 else
4688 avail = in->buf->buffer->use - (in->cur - in->base);
4689 if (avail < 1)
4690 goto done;
4691 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004692 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004693 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004694 goto done;
4695 }
4696 if (avail < 2)
4697 goto done;
4698 next = in->cur[1];
4699 if ((cur == '<') && (next == '!') &&
4700 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4701 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004702 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004703 goto done;
4704#ifdef DEBUG_PUSH
4705 xmlGenericError(xmlGenericErrorContext,
4706 "HPP: Parsing Comment\n");
4707#endif
4708 htmlParseComment(ctxt);
4709 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004710 } else if ((cur == '<') && (next == '?')) {
4711 if ((!terminate) &&
4712 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4713 goto done;
4714#ifdef DEBUG_PUSH
4715 xmlGenericError(xmlGenericErrorContext,
4716 "HPP: Parsing PI\n");
4717#endif
4718 htmlParsePI(ctxt);
4719 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004720 } else if ((cur == '<') && (next == '!') &&
4721 (avail < 4)) {
4722 goto done;
4723 } else {
4724 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004725 ctxt->wellFormed = 0;
4726 ctxt->instate = XML_PARSER_EOF;
4727#ifdef DEBUG_PUSH
4728 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: entering EOF\n");
4730#endif
4731 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4732 ctxt->sax->endDocument(ctxt->userData);
4733 goto done;
4734 }
4735 break;
4736 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004737 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004738 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004739 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004740
4741 if (avail < 2)
4742 goto done;
4743 cur = in->cur[0];
4744 if (cur != '<') {
4745 ctxt->instate = XML_PARSER_CONTENT;
4746#ifdef DEBUG_PUSH
4747 xmlGenericError(xmlGenericErrorContext,
4748 "HPP: entering CONTENT\n");
4749#endif
4750 break;
4751 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004752 if (in->cur[1] == '/') {
4753 ctxt->instate = XML_PARSER_END_TAG;
4754 ctxt->checkIndex = 0;
4755#ifdef DEBUG_PUSH
4756 xmlGenericError(xmlGenericErrorContext,
4757 "HPP: entering END_TAG\n");
4758#endif
4759 break;
4760 }
Owen Taylor3473f882001-02-23 17:55:21 +00004761 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004762 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004763 goto done;
4764
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004765 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004766 htmlParseStartTag(ctxt);
4767 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004768 if (((depth == ctxt->nameNr) &&
4769 (xmlStrEqual(oldname, ctxt->name))) ||
4770 (name == NULL)) {
4771 if (CUR == '>')
4772 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004773 break;
4774 }
Owen Taylor3473f882001-02-23 17:55:21 +00004775
4776 /*
4777 * Lookup the info for that element.
4778 */
4779 info = htmlTagLookup(name);
4780 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004781 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4782 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004783 }
4784
4785 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004786 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004787 */
4788 if ((CUR == '/') && (NXT(1) == '>')) {
4789 SKIP(2);
4790 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4791 ctxt->sax->endElement(ctxt->userData, name);
4792 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004793 ctxt->instate = XML_PARSER_CONTENT;
4794#ifdef DEBUG_PUSH
4795 xmlGenericError(xmlGenericErrorContext,
4796 "HPP: entering CONTENT\n");
4797#endif
4798 break;
4799 }
4800
4801 if (CUR == '>') {
4802 NEXT;
4803 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004804 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4805 "Couldn't find end of Start Tag %s\n",
4806 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004807
4808 /*
4809 * end of parsing of this node.
4810 */
4811 if (xmlStrEqual(name, ctxt->name)) {
4812 nodePop(ctxt);
4813 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004814 }
4815
4816 ctxt->instate = XML_PARSER_CONTENT;
4817#ifdef DEBUG_PUSH
4818 xmlGenericError(xmlGenericErrorContext,
4819 "HPP: entering CONTENT\n");
4820#endif
4821 break;
4822 }
4823
4824 /*
4825 * Check for an Empty Element from DTD definition
4826 */
4827 if ((info != NULL) && (info->empty)) {
4828 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4829 ctxt->sax->endElement(ctxt->userData, name);
4830 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004831 }
4832 ctxt->instate = XML_PARSER_CONTENT;
4833#ifdef DEBUG_PUSH
4834 xmlGenericError(xmlGenericErrorContext,
4835 "HPP: entering CONTENT\n");
4836#endif
4837 break;
4838 }
4839 case XML_PARSER_CONTENT: {
4840 long cons;
4841 /*
4842 * Handle preparsed entities and charRef
4843 */
4844 if (ctxt->token != 0) {
4845 xmlChar chr[2] = { 0 , 0 } ;
4846
4847 chr[0] = (xmlChar) ctxt->token;
4848 htmlCheckParagraph(ctxt);
4849 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4850 ctxt->sax->characters(ctxt->userData, chr, 1);
4851 ctxt->token = 0;
4852 ctxt->checkIndex = 0;
4853 }
4854 if ((avail == 1) && (terminate)) {
4855 cur = in->cur[0];
4856 if ((cur != '<') && (cur != '&')) {
4857 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004858 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004859 if (ctxt->sax->ignorableWhitespace != NULL)
4860 ctxt->sax->ignorableWhitespace(
4861 ctxt->userData, &cur, 1);
4862 } else {
4863 htmlCheckParagraph(ctxt);
4864 if (ctxt->sax->characters != NULL)
4865 ctxt->sax->characters(
4866 ctxt->userData, &cur, 1);
4867 }
4868 }
4869 ctxt->token = 0;
4870 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004871 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004872 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004873 }
Owen Taylor3473f882001-02-23 17:55:21 +00004874 }
4875 if (avail < 2)
4876 goto done;
4877 cur = in->cur[0];
4878 next = in->cur[1];
4879 cons = ctxt->nbChars;
4880 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4881 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4882 /*
4883 * Handle SCRIPT/STYLE separately
4884 */
4885 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004886 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004887 goto done;
4888 htmlParseScript(ctxt);
4889 if ((cur == '<') && (next == '/')) {
4890 ctxt->instate = XML_PARSER_END_TAG;
4891 ctxt->checkIndex = 0;
4892#ifdef DEBUG_PUSH
4893 xmlGenericError(xmlGenericErrorContext,
4894 "HPP: entering END_TAG\n");
4895#endif
4896 break;
4897 }
4898 } else {
4899 /*
4900 * Sometimes DOCTYPE arrives in the middle of the document
4901 */
4902 if ((cur == '<') && (next == '!') &&
4903 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4904 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4905 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4906 (UPP(8) == 'E')) {
4907 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004908 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004909 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004910 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4911 "Misplaced DOCTYPE declaration\n",
4912 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004913 htmlParseDocTypeDecl(ctxt);
4914 } else if ((cur == '<') && (next == '!') &&
4915 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4916 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004917 (htmlParseLookupSequence(
4918 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004919 goto done;
4920#ifdef DEBUG_PUSH
4921 xmlGenericError(xmlGenericErrorContext,
4922 "HPP: Parsing Comment\n");
4923#endif
4924 htmlParseComment(ctxt);
4925 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004926 } else if ((cur == '<') && (next == '?')) {
4927 if ((!terminate) &&
4928 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4929 goto done;
4930#ifdef DEBUG_PUSH
4931 xmlGenericError(xmlGenericErrorContext,
4932 "HPP: Parsing PI\n");
4933#endif
4934 htmlParsePI(ctxt);
4935 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00004936 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4937 goto done;
4938 } else if ((cur == '<') && (next == '/')) {
4939 ctxt->instate = XML_PARSER_END_TAG;
4940 ctxt->checkIndex = 0;
4941#ifdef DEBUG_PUSH
4942 xmlGenericError(xmlGenericErrorContext,
4943 "HPP: entering END_TAG\n");
4944#endif
4945 break;
4946 } else if (cur == '<') {
4947 ctxt->instate = XML_PARSER_START_TAG;
4948 ctxt->checkIndex = 0;
4949#ifdef DEBUG_PUSH
4950 xmlGenericError(xmlGenericErrorContext,
4951 "HPP: entering START_TAG\n");
4952#endif
4953 break;
4954 } else if (cur == '&') {
4955 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004956 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004957 goto done;
4958#ifdef DEBUG_PUSH
4959 xmlGenericError(xmlGenericErrorContext,
4960 "HPP: Parsing Reference\n");
4961#endif
4962 /* TODO: check generation of subtrees if noent !!! */
4963 htmlParseReference(ctxt);
4964 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004965 /*
4966 * check that the text sequence is complete
4967 * before handing out the data to the parser
4968 * to avoid problems with erroneous end of
4969 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004970 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004971 if ((!terminate) &&
4972 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4973 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004974 ctxt->checkIndex = 0;
4975#ifdef DEBUG_PUSH
4976 xmlGenericError(xmlGenericErrorContext,
4977 "HPP: Parsing char data\n");
4978#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004979 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004980 }
4981 }
4982 if (cons == ctxt->nbChars) {
4983 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004984 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4985 "detected an error in element content\n",
4986 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004987 }
4988 NEXT;
4989 break;
4990 }
4991
4992 break;
4993 }
4994 case XML_PARSER_END_TAG:
4995 if (avail < 2)
4996 goto done;
4997 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004998 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004999 goto done;
5000 htmlParseEndTag(ctxt);
5001 if (ctxt->nameNr == 0) {
5002 ctxt->instate = XML_PARSER_EPILOG;
5003 } else {
5004 ctxt->instate = XML_PARSER_CONTENT;
5005 }
5006 ctxt->checkIndex = 0;
5007#ifdef DEBUG_PUSH
5008 xmlGenericError(xmlGenericErrorContext,
5009 "HPP: entering CONTENT\n");
5010#endif
5011 break;
5012 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005013 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5014 "HPP: internal error, state == CDATA\n",
5015 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005016 ctxt->instate = XML_PARSER_CONTENT;
5017 ctxt->checkIndex = 0;
5018#ifdef DEBUG_PUSH
5019 xmlGenericError(xmlGenericErrorContext,
5020 "HPP: entering CONTENT\n");
5021#endif
5022 break;
5023 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005024 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5025 "HPP: internal error, state == DTD\n",
5026 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005027 ctxt->instate = XML_PARSER_CONTENT;
5028 ctxt->checkIndex = 0;
5029#ifdef DEBUG_PUSH
5030 xmlGenericError(xmlGenericErrorContext,
5031 "HPP: entering CONTENT\n");
5032#endif
5033 break;
5034 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005035 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5036 "HPP: internal error, state == COMMENT\n",
5037 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005038 ctxt->instate = XML_PARSER_CONTENT;
5039 ctxt->checkIndex = 0;
5040#ifdef DEBUG_PUSH
5041 xmlGenericError(xmlGenericErrorContext,
5042 "HPP: entering CONTENT\n");
5043#endif
5044 break;
5045 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005046 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5047 "HPP: internal error, state == PI\n",
5048 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005049 ctxt->instate = XML_PARSER_CONTENT;
5050 ctxt->checkIndex = 0;
5051#ifdef DEBUG_PUSH
5052 xmlGenericError(xmlGenericErrorContext,
5053 "HPP: entering CONTENT\n");
5054#endif
5055 break;
5056 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005057 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5058 "HPP: internal error, state == ENTITY_DECL\n",
5059 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005060 ctxt->instate = XML_PARSER_CONTENT;
5061 ctxt->checkIndex = 0;
5062#ifdef DEBUG_PUSH
5063 xmlGenericError(xmlGenericErrorContext,
5064 "HPP: entering CONTENT\n");
5065#endif
5066 break;
5067 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005068 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5069 "HPP: internal error, state == ENTITY_VALUE\n",
5070 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005071 ctxt->instate = XML_PARSER_CONTENT;
5072 ctxt->checkIndex = 0;
5073#ifdef DEBUG_PUSH
5074 xmlGenericError(xmlGenericErrorContext,
5075 "HPP: entering DTD\n");
5076#endif
5077 break;
5078 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005079 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5080 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5081 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005082 ctxt->instate = XML_PARSER_START_TAG;
5083 ctxt->checkIndex = 0;
5084#ifdef DEBUG_PUSH
5085 xmlGenericError(xmlGenericErrorContext,
5086 "HPP: entering START_TAG\n");
5087#endif
5088 break;
5089 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005090 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5091 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5092 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005093 ctxt->instate = XML_PARSER_CONTENT;
5094 ctxt->checkIndex = 0;
5095#ifdef DEBUG_PUSH
5096 xmlGenericError(xmlGenericErrorContext,
5097 "HPP: entering CONTENT\n");
5098#endif
5099 break;
5100 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005101 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5102 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5103 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005104 ctxt->instate = XML_PARSER_CONTENT;
5105 ctxt->checkIndex = 0;
5106#ifdef DEBUG_PUSH
5107 xmlGenericError(xmlGenericErrorContext,
5108 "HPP: entering CONTENT\n");
5109#endif
5110 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005111 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005112 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5113 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5114 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005115 ctxt->instate = XML_PARSER_CONTENT;
5116 ctxt->checkIndex = 0;
5117#ifdef DEBUG_PUSH
5118 xmlGenericError(xmlGenericErrorContext,
5119 "HPP: entering CONTENT\n");
5120#endif
5121 break;
5122
Owen Taylor3473f882001-02-23 17:55:21 +00005123 }
5124 }
5125done:
5126 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005127 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005128 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5129 /*
5130 * SAX: end of the document processing.
5131 */
5132 ctxt->instate = XML_PARSER_EOF;
5133 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5134 ctxt->sax->endDocument(ctxt->userData);
5135 }
5136 }
5137 if ((ctxt->myDoc != NULL) &&
5138 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5139 (ctxt->instate == XML_PARSER_EPILOG))) {
5140 xmlDtdPtr dtd;
5141 dtd = xmlGetIntSubset(ctxt->myDoc);
5142 if (dtd == NULL)
5143 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005144 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005145 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5146 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5147 }
5148#ifdef DEBUG_PUSH
5149 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5150#endif
5151 return(ret);
5152}
5153
5154/**
Owen Taylor3473f882001-02-23 17:55:21 +00005155 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005156 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005157 * @chunk: an char array
5158 * @size: the size in byte of the chunk
5159 * @terminate: last chunk indicator
5160 *
5161 * Parse a Chunk of memory
5162 *
5163 * Returns zero if no error, the xmlParserErrors otherwise.
5164 */
5165int
5166htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5167 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005168 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5169 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5170 "htmlParseChunk: context error\n", NULL, NULL);
5171 return(XML_ERR_INTERNAL_ERROR);
5172 }
Owen Taylor3473f882001-02-23 17:55:21 +00005173 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5174 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5175 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5176 int cur = ctxt->input->cur - ctxt->input->base;
5177
5178 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5179 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5180 ctxt->input->cur = ctxt->input->base + cur;
5181#ifdef DEBUG_PUSH
5182 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5183#endif
5184
Daniel Veillard14f752c2003-08-09 11:44:50 +00005185#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005186 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5187 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005188#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005189 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005190 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5191 xmlParserInputBufferPtr in = ctxt->input->buf;
5192 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5193 (in->raw != NULL)) {
5194 int nbchars;
5195
5196 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5197 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005198 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5199 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005200 return(XML_ERR_INVALID_ENCODING);
5201 }
5202 }
5203 }
Owen Taylor3473f882001-02-23 17:55:21 +00005204 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005205 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005206 if (terminate) {
5207 if ((ctxt->instate != XML_PARSER_EOF) &&
5208 (ctxt->instate != XML_PARSER_EPILOG) &&
5209 (ctxt->instate != XML_PARSER_MISC)) {
5210 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005211 ctxt->wellFormed = 0;
5212 }
5213 if (ctxt->instate != XML_PARSER_EOF) {
5214 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5215 ctxt->sax->endDocument(ctxt->userData);
5216 }
5217 ctxt->instate = XML_PARSER_EOF;
5218 }
5219 return((xmlParserErrors) ctxt->errNo);
5220}
Daniel Veillard73b013f2003-09-30 12:36:01 +00005221#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005222
5223/************************************************************************
5224 * *
5225 * User entry points *
5226 * *
5227 ************************************************************************/
5228
5229/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005230 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005231 * @sax: a SAX handler
5232 * @user_data: The user data returned on SAX callbacks
5233 * @chunk: a pointer to an array of chars
5234 * @size: number of chars in the array
5235 * @filename: an optional file name or URI
5236 * @enc: an optional encoding
5237 *
5238 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005239 * The value of @filename is used for fetching external entities
5240 * and error/warning reports.
5241 *
5242 * Returns the new parser context or NULL
5243 */
5244htmlParserCtxtPtr
5245htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5246 const char *chunk, int size, const char *filename,
5247 xmlCharEncoding enc) {
5248 htmlParserCtxtPtr ctxt;
5249 htmlParserInputPtr inputStream;
5250 xmlParserInputBufferPtr buf;
5251
Daniel Veillardd0463562001-10-13 09:15:48 +00005252 xmlInitParser();
5253
Owen Taylor3473f882001-02-23 17:55:21 +00005254 buf = xmlAllocParserInputBuffer(enc);
5255 if (buf == NULL) return(NULL);
5256
Daniel Veillardf403d292003-10-05 13:51:35 +00005257 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005258 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005259 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005260 return(NULL);
5261 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005262 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5263 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005264 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005265 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005266 xmlFree(ctxt->sax);
5267 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5268 if (ctxt->sax == NULL) {
5269 xmlFree(buf);
5270 xmlFree(ctxt);
5271 return(NULL);
5272 }
5273 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5274 if (user_data != NULL)
5275 ctxt->userData = user_data;
5276 }
5277 if (filename == NULL) {
5278 ctxt->directory = NULL;
5279 } else {
5280 ctxt->directory = xmlParserGetDirectory(filename);
5281 }
5282
5283 inputStream = htmlNewInputStream(ctxt);
5284 if (inputStream == NULL) {
5285 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005286 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005287 return(NULL);
5288 }
5289
5290 if (filename == NULL)
5291 inputStream->filename = NULL;
5292 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005293 inputStream->filename = (char *)
5294 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005295 inputStream->buf = buf;
5296 inputStream->base = inputStream->buf->buffer->content;
5297 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005298 inputStream->end =
5299 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005300
5301 inputPush(ctxt, inputStream);
5302
5303 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5304 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005305 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5306 int cur = ctxt->input->cur - ctxt->input->base;
5307
Owen Taylor3473f882001-02-23 17:55:21 +00005308 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005309
5310 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5311 ctxt->input->cur = ctxt->input->base + cur;
5312 ctxt->input->end =
5313 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005314#ifdef DEBUG_PUSH
5315 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5316#endif
5317 }
5318
5319 return(ctxt);
5320}
5321
5322/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005323 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005324 * @cur: a pointer to an array of xmlChar
5325 * @encoding: a free form C string describing the HTML document encoding, or NULL
5326 * @sax: the SAX handler block
5327 * @userData: if using SAX, this pointer will be provided on callbacks.
5328 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005329 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5330 * to handle parse events. If sax is NULL, fallback to the default DOM
5331 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005332 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005333 * Returns the resulting document tree unless SAX is NULL or the document is
5334 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005335 */
5336
5337htmlDocPtr
5338htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5339 htmlDocPtr ret;
5340 htmlParserCtxtPtr ctxt;
5341
Daniel Veillardd0463562001-10-13 09:15:48 +00005342 xmlInitParser();
5343
Owen Taylor3473f882001-02-23 17:55:21 +00005344 if (cur == NULL) return(NULL);
5345
5346
5347 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5348 if (ctxt == NULL) return(NULL);
5349 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005350 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005351 ctxt->sax = sax;
5352 ctxt->userData = userData;
5353 }
5354
5355 htmlParseDocument(ctxt);
5356 ret = ctxt->myDoc;
5357 if (sax != NULL) {
5358 ctxt->sax = NULL;
5359 ctxt->userData = NULL;
5360 }
5361 htmlFreeParserCtxt(ctxt);
5362
5363 return(ret);
5364}
5365
5366/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005367 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005368 * @cur: a pointer to an array of xmlChar
5369 * @encoding: a free form C string describing the HTML document encoding, or NULL
5370 *
5371 * parse an HTML in-memory document and build a tree.
5372 *
5373 * Returns the resulting document tree
5374 */
5375
5376htmlDocPtr
5377htmlParseDoc(xmlChar *cur, const char *encoding) {
5378 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5379}
5380
5381
5382/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005383 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005384 * @filename: the filename
5385 * @encoding: a free form C string describing the HTML document encoding, or NULL
5386 *
5387 * Create a parser context for a file content.
5388 * Automatic support for ZLIB/Compress compressed document is provided
5389 * by default if found at compile-time.
5390 *
5391 * Returns the new parser context or NULL
5392 */
5393htmlParserCtxtPtr
5394htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5395{
5396 htmlParserCtxtPtr ctxt;
5397 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005398 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005399 /* htmlCharEncoding enc; */
5400 xmlChar *content, *content_line = (xmlChar *) "charset=";
5401
Daniel Veillarda03e3652004-11-02 18:45:30 +00005402 if (filename == NULL)
5403 return(NULL);
5404
Daniel Veillardf403d292003-10-05 13:51:35 +00005405 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005406 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005407 return(NULL);
5408 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005409 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5410 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005411#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005412 if (xmlDefaultSAXHandler.error != NULL) {
5413 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5414 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005415#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005416 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005417 return(NULL);
5418 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005419
5420 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5421 xmlFree(canonicFilename);
5422 if (inputStream == NULL) {
5423 xmlFreeParserCtxt(ctxt);
5424 return(NULL);
5425 }
Owen Taylor3473f882001-02-23 17:55:21 +00005426
5427 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005428
Owen Taylor3473f882001-02-23 17:55:21 +00005429 /* set encoding */
5430 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005431 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005432 if (content) {
5433 strcpy ((char *)content, (char *)content_line);
5434 strcat ((char *)content, (char *)encoding);
5435 htmlCheckEncoding (ctxt, content);
5436 xmlFree (content);
5437 }
5438 }
5439
5440 return(ctxt);
5441}
5442
5443/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005444 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005445 * @filename: the filename
5446 * @encoding: a free form C string describing the HTML document encoding, or NULL
5447 * @sax: the SAX handler block
5448 * @userData: if using SAX, this pointer will be provided on callbacks.
5449 *
5450 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5451 * compressed document is provided by default if found at compile-time.
5452 * It use the given SAX function block to handle the parsing callback.
5453 * If sax is NULL, fallback to the default DOM tree building routines.
5454 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005455 * Returns the resulting document tree unless SAX is NULL or the document is
5456 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005457 */
5458
5459htmlDocPtr
5460htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5461 void *userData) {
5462 htmlDocPtr ret;
5463 htmlParserCtxtPtr ctxt;
5464 htmlSAXHandlerPtr oldsax = NULL;
5465
Daniel Veillardd0463562001-10-13 09:15:48 +00005466 xmlInitParser();
5467
Owen Taylor3473f882001-02-23 17:55:21 +00005468 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5469 if (ctxt == NULL) return(NULL);
5470 if (sax != NULL) {
5471 oldsax = ctxt->sax;
5472 ctxt->sax = sax;
5473 ctxt->userData = userData;
5474 }
5475
5476 htmlParseDocument(ctxt);
5477
5478 ret = ctxt->myDoc;
5479 if (sax != NULL) {
5480 ctxt->sax = oldsax;
5481 ctxt->userData = NULL;
5482 }
5483 htmlFreeParserCtxt(ctxt);
5484
5485 return(ret);
5486}
5487
5488/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005489 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005490 * @filename: the filename
5491 * @encoding: a free form C string describing the HTML document encoding, or NULL
5492 *
5493 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5494 * compressed document is provided by default if found at compile-time.
5495 *
5496 * Returns the resulting document tree
5497 */
5498
5499htmlDocPtr
5500htmlParseFile(const char *filename, const char *encoding) {
5501 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5502}
5503
5504/**
5505 * htmlHandleOmittedElem:
5506 * @val: int 0 or 1
5507 *
5508 * Set and return the previous value for handling HTML omitted tags.
5509 *
5510 * Returns the last value for 0 for no handling, 1 for auto insertion.
5511 */
5512
5513int
5514htmlHandleOmittedElem(int val) {
5515 int old = htmlOmittedDefaultValue;
5516
5517 htmlOmittedDefaultValue = val;
5518 return(old);
5519}
5520
Daniel Veillard930dfb62003-02-05 10:17:38 +00005521/**
5522 * htmlElementAllowedHere:
5523 * @parent: HTML parent element
5524 * @elt: HTML element
5525 *
5526 * Checks whether an HTML element may be a direct child of a parent element.
5527 * Note - doesn't check for deprecated elements
5528 *
5529 * Returns 1 if allowed; 0 otherwise.
5530 */
5531int
5532htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5533 const char** p ;
5534
5535 if ( ! elt || ! parent || ! parent->subelts )
5536 return 0 ;
5537
5538 for ( p = parent->subelts; *p; ++p )
5539 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5540 return 1 ;
5541
5542 return 0 ;
5543}
5544/**
5545 * htmlElementStatusHere:
5546 * @parent: HTML parent element
5547 * @elt: HTML element
5548 *
5549 * Checks whether an HTML element may be a direct child of a parent element.
5550 * and if so whether it is valid or deprecated.
5551 *
5552 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5553 */
5554htmlStatus
5555htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5556 if ( ! parent || ! elt )
5557 return HTML_INVALID ;
5558 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5559 return HTML_INVALID ;
5560
5561 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5562}
5563/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005564 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005565 * @elt: HTML element
5566 * @attr: HTML attribute
5567 * @legacy: whether to allow deprecated attributes
5568 *
5569 * Checks whether an attribute is valid for an element
5570 * Has full knowledge of Required and Deprecated attributes
5571 *
5572 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5573 */
5574htmlStatus
5575htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5576 const char** p ;
5577
5578 if ( !elt || ! attr )
5579 return HTML_INVALID ;
5580
5581 if ( elt->attrs_req )
5582 for ( p = elt->attrs_req; *p; ++p)
5583 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5584 return HTML_REQUIRED ;
5585
5586 if ( elt->attrs_opt )
5587 for ( p = elt->attrs_opt; *p; ++p)
5588 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5589 return HTML_VALID ;
5590
5591 if ( legacy && elt->attrs_depr )
5592 for ( p = elt->attrs_depr; *p; ++p)
5593 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5594 return HTML_DEPRECATED ;
5595
5596 return HTML_INVALID ;
5597}
5598/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005599 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005600 * @node: an htmlNodePtr in a tree
5601 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005602 * for Element nodes)
5603 *
5604 * Checks whether the tree node is valid. Experimental (the author
5605 * only uses the HTML enhancements in a SAX parser)
5606 *
5607 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5608 * legacy allowed) or htmlElementStatusHere (otherwise).
5609 * for Attribute nodes, a return from htmlAttrAllowed
5610 * for other nodes, HTML_NA (no checks performed)
5611 */
5612htmlStatus
5613htmlNodeStatus(const htmlNodePtr node, int legacy) {
5614 if ( ! node )
5615 return HTML_INVALID ;
5616
5617 switch ( node->type ) {
5618 case XML_ELEMENT_NODE:
5619 return legacy
5620 ? ( htmlElementAllowedHere (
5621 htmlTagLookup(node->parent->name) , node->name
5622 ) ? HTML_VALID : HTML_INVALID )
5623 : htmlElementStatusHere(
5624 htmlTagLookup(node->parent->name) ,
5625 htmlTagLookup(node->name) )
5626 ;
5627 case XML_ATTRIBUTE_NODE:
5628 return htmlAttrAllowed(
5629 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5630 default: return HTML_NA ;
5631 }
5632}
Daniel Veillard9475a352003-09-26 12:47:50 +00005633/************************************************************************
5634 * *
5635 * New set (2.6.0) of simpler and more flexible APIs *
5636 * *
5637 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string unless it is owned by the "dict" dictionary of the
 * current scope. A variable named dict must be visible where the macro
 * is expanded; a NULL dict means the string is always freed.
 */
#define DICT_FREE(str)                                              \
    if (((str) != NULL) &&                                          \
        ((dict == NULL) ||                                          \
         (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))         \
        xmlFree((char *)(str));
5649
5650/**
5651 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005652 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005653 *
5654 * Reset a parser context
5655 */
5656void
5657htmlCtxtReset(htmlParserCtxtPtr ctxt)
5658{
5659 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005660 xmlDictPtr dict;
5661
5662 if (ctxt == NULL)
5663 return;
5664
5665 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005666
5667 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5668 xmlFreeInputStream(input);
5669 }
5670 ctxt->inputNr = 0;
5671 ctxt->input = NULL;
5672
5673 ctxt->spaceNr = 0;
5674 ctxt->spaceTab[0] = -1;
5675 ctxt->space = &ctxt->spaceTab[0];
5676
5677
5678 ctxt->nodeNr = 0;
5679 ctxt->node = NULL;
5680
5681 ctxt->nameNr = 0;
5682 ctxt->name = NULL;
5683
5684 DICT_FREE(ctxt->version);
5685 ctxt->version = NULL;
5686 DICT_FREE(ctxt->encoding);
5687 ctxt->encoding = NULL;
5688 DICT_FREE(ctxt->directory);
5689 ctxt->directory = NULL;
5690 DICT_FREE(ctxt->extSubURI);
5691 ctxt->extSubURI = NULL;
5692 DICT_FREE(ctxt->extSubSystem);
5693 ctxt->extSubSystem = NULL;
5694 if (ctxt->myDoc != NULL)
5695 xmlFreeDoc(ctxt->myDoc);
5696 ctxt->myDoc = NULL;
5697
5698 ctxt->standalone = -1;
5699 ctxt->hasExternalSubset = 0;
5700 ctxt->hasPErefs = 0;
5701 ctxt->html = 1;
5702 ctxt->external = 0;
5703 ctxt->instate = XML_PARSER_START;
5704 ctxt->token = 0;
5705
5706 ctxt->wellFormed = 1;
5707 ctxt->nsWellFormed = 1;
5708 ctxt->valid = 1;
5709 ctxt->vctxt.userData = ctxt;
5710 ctxt->vctxt.error = xmlParserValidityError;
5711 ctxt->vctxt.warning = xmlParserValidityWarning;
5712 ctxt->record_info = 0;
5713 ctxt->nbChars = 0;
5714 ctxt->checkIndex = 0;
5715 ctxt->inSubset = 0;
5716 ctxt->errNo = XML_ERR_OK;
5717 ctxt->depth = 0;
5718 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5719 ctxt->catalogs = NULL;
5720 xmlInitNodeInfoSeq(&ctxt->node_seq);
5721
5722 if (ctxt->attsDefault != NULL) {
5723 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5724 ctxt->attsDefault = NULL;
5725 }
5726 if (ctxt->attsSpecial != NULL) {
5727 xmlHashFree(ctxt->attsSpecial, NULL);
5728 ctxt->attsSpecial = NULL;
5729 }
5730}
5731
5732/**
5733 * htmlCtxtUseOptions:
5734 * @ctxt: an HTML parser context
5735 * @options: a combination of htmlParserOption(s)
5736 *
5737 * Applies the options to the parser context
5738 *
5739 * Returns 0 in case of success, the set of unknown or unimplemented options
5740 * in case of error.
5741 */
5742int
5743htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5744{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005745 if (ctxt == NULL)
5746 return(-1);
5747
Daniel Veillard9475a352003-09-26 12:47:50 +00005748 if (options & HTML_PARSE_NOWARNING) {
5749 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005750 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005751 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005752 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005753 }
5754 if (options & HTML_PARSE_NOERROR) {
5755 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005756 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005757 ctxt->sax->fatalError = NULL;
5758 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005759 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005760 }
5761 if (options & HTML_PARSE_PEDANTIC) {
5762 ctxt->pedantic = 1;
5763 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005764 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005765 } else
5766 ctxt->pedantic = 0;
5767 if (options & XML_PARSE_NOBLANKS) {
5768 ctxt->keepBlanks = 0;
5769 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5770 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005771 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005772 } else
5773 ctxt->keepBlanks = 1;
5774 ctxt->dictNames = 0;
5775 return (options);
5776}
5777
5778/**
5779 * htmlDoRead:
5780 * @ctxt: an HTML parser context
5781 * @URL: the base URL to use for the document
5782 * @encoding: the document encoding, or NULL
5783 * @options: a combination of htmlParserOption(s)
5784 * @reuse: keep the context for reuse
5785 *
5786 * Common front-end for the htmlRead functions
5787 *
5788 * Returns the resulting document tree or NULL
5789 */
5790static htmlDocPtr
5791htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5792 int options, int reuse)
5793{
5794 htmlDocPtr ret;
5795
5796 htmlCtxtUseOptions(ctxt, options);
5797 ctxt->html = 1;
5798 if (encoding != NULL) {
5799 xmlCharEncodingHandlerPtr hdlr;
5800
5801 hdlr = xmlFindCharEncodingHandler(encoding);
5802 if (hdlr != NULL)
5803 xmlSwitchToEncoding(ctxt, hdlr);
5804 }
5805 if ((URL != NULL) && (ctxt->input != NULL) &&
5806 (ctxt->input->filename == NULL))
5807 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5808 htmlParseDocument(ctxt);
5809 ret = ctxt->myDoc;
5810 ctxt->myDoc = NULL;
5811 if (!reuse) {
5812 if ((ctxt->dictNames) &&
5813 (ret != NULL) &&
5814 (ret->dict == ctxt->dict))
5815 ctxt->dict = NULL;
5816 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005817 }
5818 return (ret);
5819}
5820
5821/**
5822 * htmlReadDoc:
5823 * @cur: a pointer to a zero terminated string
5824 * @URL: the base URL to use for the document
5825 * @encoding: the document encoding, or NULL
5826 * @options: a combination of htmlParserOption(s)
5827 *
5828 * parse an XML in-memory document and build a tree.
5829 *
5830 * Returns the resulting document tree
5831 */
5832htmlDocPtr
5833htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5834{
5835 htmlParserCtxtPtr ctxt;
5836
5837 if (cur == NULL)
5838 return (NULL);
5839
5840 ctxt = xmlCreateDocParserCtxt(cur);
5841 if (ctxt == NULL)
5842 return (NULL);
5843 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5844}
5845
5846/**
5847 * htmlReadFile:
5848 * @filename: a file or URL
5849 * @encoding: the document encoding, or NULL
5850 * @options: a combination of htmlParserOption(s)
5851 *
5852 * parse an XML file from the filesystem or the network.
5853 *
5854 * Returns the resulting document tree
5855 */
5856htmlDocPtr
5857htmlReadFile(const char *filename, const char *encoding, int options)
5858{
5859 htmlParserCtxtPtr ctxt;
5860
5861 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5862 if (ctxt == NULL)
5863 return (NULL);
5864 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5865}
5866
5867/**
5868 * htmlReadMemory:
5869 * @buffer: a pointer to a char array
5870 * @size: the size of the array
5871 * @URL: the base URL to use for the document
5872 * @encoding: the document encoding, or NULL
5873 * @options: a combination of htmlParserOption(s)
5874 *
5875 * parse an XML in-memory document and build a tree.
5876 *
5877 * Returns the resulting document tree
5878 */
5879htmlDocPtr
5880htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5881{
5882 htmlParserCtxtPtr ctxt;
5883
5884 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5885 if (ctxt == NULL)
5886 return (NULL);
William M. Brackd43cdcd2004-08-03 15:13:29 +00005887 if (ctxt->sax != NULL)
5888 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00005889 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5890}
5891
5892/**
5893 * htmlReadFd:
5894 * @fd: an open file descriptor
5895 * @URL: the base URL to use for the document
5896 * @encoding: the document encoding, or NULL
5897 * @options: a combination of htmlParserOption(s)
5898 *
5899 * parse an XML from a file descriptor and build a tree.
5900 *
5901 * Returns the resulting document tree
5902 */
5903htmlDocPtr
5904htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5905{
5906 htmlParserCtxtPtr ctxt;
5907 xmlParserInputBufferPtr input;
5908 xmlParserInputPtr stream;
5909
5910 if (fd < 0)
5911 return (NULL);
5912
5913 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5914 if (input == NULL)
5915 return (NULL);
5916 ctxt = xmlNewParserCtxt();
5917 if (ctxt == NULL) {
5918 xmlFreeParserInputBuffer(input);
5919 return (NULL);
5920 }
5921 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5922 if (stream == NULL) {
5923 xmlFreeParserInputBuffer(input);
5924 xmlFreeParserCtxt(ctxt);
5925 return (NULL);
5926 }
5927 inputPush(ctxt, stream);
5928 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5929}
5930
5931/**
5932 * htmlReadIO:
5933 * @ioread: an I/O read function
5934 * @ioclose: an I/O close function
5935 * @ioctx: an I/O handler
5936 * @URL: the base URL to use for the document
5937 * @encoding: the document encoding, or NULL
5938 * @options: a combination of htmlParserOption(s)
5939 *
5940 * parse an HTML document from I/O functions and source and build a tree.
5941 *
5942 * Returns the resulting document tree
5943 */
5944htmlDocPtr
5945htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5946 void *ioctx, const char *URL, const char *encoding, int options)
5947{
5948 htmlParserCtxtPtr ctxt;
5949 xmlParserInputBufferPtr input;
5950 xmlParserInputPtr stream;
5951
5952 if (ioread == NULL)
5953 return (NULL);
5954
5955 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5956 XML_CHAR_ENCODING_NONE);
5957 if (input == NULL)
5958 return (NULL);
5959 ctxt = xmlNewParserCtxt();
5960 if (ctxt == NULL) {
5961 xmlFreeParserInputBuffer(input);
5962 return (NULL);
5963 }
5964 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5965 if (stream == NULL) {
5966 xmlFreeParserInputBuffer(input);
5967 xmlFreeParserCtxt(ctxt);
5968 return (NULL);
5969 }
5970 inputPush(ctxt, stream);
5971 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5972}
5973
5974/**
5975 * htmlCtxtReadDoc:
5976 * @ctxt: an HTML parser context
5977 * @cur: a pointer to a zero terminated string
5978 * @URL: the base URL to use for the document
5979 * @encoding: the document encoding, or NULL
5980 * @options: a combination of htmlParserOption(s)
5981 *
5982 * parse an XML in-memory document and build a tree.
5983 * This reuses the existing @ctxt parser context
5984 *
5985 * Returns the resulting document tree
5986 */
5987htmlDocPtr
5988htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5989 const char *URL, const char *encoding, int options)
5990{
5991 xmlParserInputPtr stream;
5992
5993 if (cur == NULL)
5994 return (NULL);
5995 if (ctxt == NULL)
5996 return (NULL);
5997
5998 htmlCtxtReset(ctxt);
5999
6000 stream = xmlNewStringInputStream(ctxt, cur);
6001 if (stream == NULL) {
6002 return (NULL);
6003 }
6004 inputPush(ctxt, stream);
6005 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6006}
6007
6008/**
6009 * htmlCtxtReadFile:
6010 * @ctxt: an HTML parser context
6011 * @filename: a file or URL
6012 * @encoding: the document encoding, or NULL
6013 * @options: a combination of htmlParserOption(s)
6014 *
6015 * parse an XML file from the filesystem or the network.
6016 * This reuses the existing @ctxt parser context
6017 *
6018 * Returns the resulting document tree
6019 */
6020htmlDocPtr
6021htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6022 const char *encoding, int options)
6023{
6024 xmlParserInputPtr stream;
6025
6026 if (filename == NULL)
6027 return (NULL);
6028 if (ctxt == NULL)
6029 return (NULL);
6030
6031 htmlCtxtReset(ctxt);
6032
6033 stream = xmlNewInputFromFile(ctxt, filename);
6034 if (stream == NULL) {
6035 return (NULL);
6036 }
6037 inputPush(ctxt, stream);
6038 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6039}
6040
6041/**
6042 * htmlCtxtReadMemory:
6043 * @ctxt: an HTML parser context
6044 * @buffer: a pointer to a char array
6045 * @size: the size of the array
6046 * @URL: the base URL to use for the document
6047 * @encoding: the document encoding, or NULL
6048 * @options: a combination of htmlParserOption(s)
6049 *
6050 * parse an XML in-memory document and build a tree.
6051 * This reuses the existing @ctxt parser context
6052 *
6053 * Returns the resulting document tree
6054 */
6055htmlDocPtr
6056htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6057 const char *URL, const char *encoding, int options)
6058{
6059 xmlParserInputBufferPtr input;
6060 xmlParserInputPtr stream;
6061
6062 if (ctxt == NULL)
6063 return (NULL);
6064 if (buffer == NULL)
6065 return (NULL);
6066
6067 htmlCtxtReset(ctxt);
6068
6069 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6070 if (input == NULL) {
6071 return(NULL);
6072 }
6073
6074 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6075 if (stream == NULL) {
6076 xmlFreeParserInputBuffer(input);
6077 return(NULL);
6078 }
6079
6080 inputPush(ctxt, stream);
6081 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6082}
6083
6084/**
6085 * htmlCtxtReadFd:
6086 * @ctxt: an HTML parser context
6087 * @fd: an open file descriptor
6088 * @URL: the base URL to use for the document
6089 * @encoding: the document encoding, or NULL
6090 * @options: a combination of htmlParserOption(s)
6091 *
6092 * parse an XML from a file descriptor and build a tree.
6093 * This reuses the existing @ctxt parser context
6094 *
6095 * Returns the resulting document tree
6096 */
6097htmlDocPtr
6098htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6099 const char *URL, const char *encoding, int options)
6100{
6101 xmlParserInputBufferPtr input;
6102 xmlParserInputPtr stream;
6103
6104 if (fd < 0)
6105 return (NULL);
6106 if (ctxt == NULL)
6107 return (NULL);
6108
6109 htmlCtxtReset(ctxt);
6110
6111
6112 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6113 if (input == NULL)
6114 return (NULL);
6115 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6116 if (stream == NULL) {
6117 xmlFreeParserInputBuffer(input);
6118 return (NULL);
6119 }
6120 inputPush(ctxt, stream);
6121 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6122}
6123
6124/**
6125 * htmlCtxtReadIO:
6126 * @ctxt: an HTML parser context
6127 * @ioread: an I/O read function
6128 * @ioclose: an I/O close function
6129 * @ioctx: an I/O handler
6130 * @URL: the base URL to use for the document
6131 * @encoding: the document encoding, or NULL
6132 * @options: a combination of htmlParserOption(s)
6133 *
6134 * parse an HTML document from I/O functions and source and build a tree.
6135 * This reuses the existing @ctxt parser context
6136 *
6137 * Returns the resulting document tree
6138 */
6139htmlDocPtr
6140htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6141 xmlInputCloseCallback ioclose, void *ioctx,
6142 const char *URL,
6143 const char *encoding, int options)
6144{
6145 xmlParserInputBufferPtr input;
6146 xmlParserInputPtr stream;
6147
6148 if (ioread == NULL)
6149 return (NULL);
6150 if (ctxt == NULL)
6151 return (NULL);
6152
6153 htmlCtxtReset(ctxt);
6154
6155 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6156 XML_CHAR_ENCODING_NONE);
6157 if (input == NULL)
6158 return (NULL);
6159 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6160 if (stream == NULL) {
6161 xmlFreeParserInputBuffer(input);
6162 return (NULL);
6163 }
6164 inputPush(ctxt, stream);
6165 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6166}
6167
Owen Taylor3473f882001-02-23 17:55:21 +00006168#endif /* LIBXML_HTML_ENABLED */