blob: 55557662bc6e471d7f0c6d1dde6ab967c3af1756 [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
/************************************************************************
 *									*
 *		Some factorized error routines				*
 *									*
 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
/************************************************************************
 *									*
 *		Parser stacks related functions and macros		*
 *									*
 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt` to be in scope where they are expanded.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value, as an int. This should be used
 *            internally by the parser only to compare to ASCII values,
 *            otherwise it would break when running with UTF-8 encoding.
 *   RAW      like CUR, but yields -1 while ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChar, and must also be used only to skip ASCII
 *            defined strings without newlines within the parser (the
 *            column is advanced, the line number is not).
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT  the current char value. It yields an int.
 *   NEXT     skip to the next character, through xmlNextChar().
 *   NEXTL(l) skip the current character, which is l xmlChars long, keeping
 *            line and column up to date.
 *   CUR_CHAR(l)       decode the current character with htmlCurrentChar(),
 *                     storing its length in l.
 *   CUR_SCHAR(s, l)   same, from the string s, with xmlStringCurrentChar().
 *   COPY_BUF(l,b,i,v) append the character v (l xmlChars long) to buffer b
 *                     at index i and advance i.
 *
 * Statement-like macros are wrapped in do { } while (0) so they behave as
 * a single statement, in particular in front of an `else`.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

/* Drop consumed input once a lot is behind and little is left ahead. */
#define SHRINK do {                                                         \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&         \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))            \
        xmlParserInputShrink(ctxt->input);                                  \
  } while (0)

/* Pull in more input when little is left, except when ctxt->progressive. */
#define GROW do {                                                           \
    if ((ctxt->progressive == 0) &&                                         \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))                \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                       \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                       \
    if (*(ctxt->input->cur) == '\n') {                                      \
        ctxt->input->line++; ctxt->input->col = 1;                          \
    } else ctxt->input->col++;                                              \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;              \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                              \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                               \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                           \
  } while (0)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Owen Taylor3473f882001-02-23 17:55:21 +0000405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000410 }
411
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
413 *len = 1;
414 return((int) *ctxt->input->cur);
415}
416
417/**
Owen Taylor3473f882001-02-23 17:55:21 +0000418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000426static int
Owen Taylor3473f882001-02-23 17:55:21 +0000427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428 int res = 0;
429
William M. Brack76e95df2003-10-18 16:20:14 +0000430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433 xmlPopInput(ctxt);
434 } else {
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
438 ctxt->input->cur++;
439 ctxt->nbChars++;
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442 }
443 res++;
444 }
445 return(res);
446}
447
448
449
/************************************************************************
 *									*
 *		The list of HTML elements and their properties		*
 *									*
 ************************************************************************/
455
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma separated list of element names and
 * comes with an NB_xxx macro giving the number of names in it. Groups are
 * always separated by an explicit comma: two adjacent string literals
 * would otherwise be concatenated into a single bogus name. PCDATA and
 * MODIFIER expand to nothing and therefore take no comma.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 15
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0

#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the element names of the FLOW and INLINE groups */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
502
503
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma separated list of attribute names
 * and comes with an NB_xxx macro giving the number of names in it.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000523
524
525/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000526static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000529static const char* const target_attr[] = { "target", NULL } ;
530static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* const alt_attr[] = { "alt", NULL } ;
532static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* const href_attrs[] = { "href", NULL } ;
534static const char* const clear_attrs[] = { "clear", NULL } ;
535static const char* const inline_p[] = { INLINE, "p", NULL } ;
536
537static const char* const flow_param[] = { FLOW, "param", NULL } ;
538static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000539 "archive", "alt", "name", "height", "width", "align",
540 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000541static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000542 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000543static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000544 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000545static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
546static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
547static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
548static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000549 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000550static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000551 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
552
553
Daniel Veillard065abe82006-07-03 08:55:04 +0000554static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
555static const char* const col_elt[] = { "col", NULL } ;
556static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
557static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
558static const char* const dl_contents[] = { "dt", "dd", NULL } ;
559static const char* const compact_attr[] = { "compact", NULL } ;
560static const char* const label_attr[] = { "label", NULL } ;
561static const char* const fieldset_contents[] = { FLOW, "legend" } ;
562static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
563static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
564static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
565static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
566static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
567static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
568static const char* const head_attrs[] = { I18N, "profile", NULL } ;
569static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
570static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
571static const char* const version_attr[] = { "version", NULL } ;
572static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
573static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
574static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
575static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
576static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
577static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
578static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
579static const char* const align_attr[] = { "align", NULL } ;
580static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
581static const char* const map_contents[] = { BLOCK, "area", NULL } ;
582static const char* const name_attr[] = { "name", NULL } ;
583static const char* const action_attr[] = { "action", NULL } ;
584static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
585static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
586static const char* const content_attr[] = { "content", NULL } ;
587static const char* const type_attr[] = { "type", NULL } ;
588static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
589static const char* const object_contents[] = { FLOW, "param", NULL } ;
590static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
591static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
592static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
593static const char* const option_elt[] = { "option", NULL } ;
594static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
595static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
596static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
597static const char* const width_attr[] = { "width", NULL } ;
598static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
599static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
600static const char* const language_attr[] = { "language", NULL } ;
601static const char* const select_content[] = { "optgroup", "option", NULL } ;
602static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
603static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
604static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
605static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
606static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
607static const char* const tr_elt[] = { "tr", NULL } ;
608static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
609static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
610static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
611static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
612static const char* const tr_contents[] = { "th", "td", NULL } ;
613static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
614static const char* const li_elt[] = { "li", NULL } ;
615static const char* const ul_depr[] = { "type", "compact", NULL} ;
616static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000617
618#define DECL (const char**)
619
Daniel Veillard22090732001-07-16 00:06:07 +0000620static const htmlElemDesc
621html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000622{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
623 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
624},
625{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
626 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
627},
628{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
630},
631{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
632 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
633},
634{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
635 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
636},
637{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
638 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
639},
640{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
641 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
642},
643{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
644 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
645},
646{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
647 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
648},
649{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
650 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
651},
652{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
653 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
654},
655{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
656 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
657},
658{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
659 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
660},
661{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
662 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
663},
664{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
665 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
666},
667{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
669},
670{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
671 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
672},
673{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
674 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
675},
676{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
677 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
678},
679{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
680 EMPTY , NULL , DECL col_attrs , NULL, NULL
681},
682{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
683 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
684},
685{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
686 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
687},
688{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
689 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
690},
691{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
692 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
693},
694{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
695 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
696},
697{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
698 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
699},
700{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
701 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
702},
703{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
704 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
705},
706{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
707 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
708},
709{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
710 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
711},
712{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
713 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
714},
715{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
716 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
717},
718{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
719 EMPTY, NULL, NULL, DECL frame_attrs, NULL
720},
721{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
722 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
723},
724{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
725 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
726},
727{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
728 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
729},
730{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
731 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
732},
733{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
734 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
735},
736{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
737 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
738},
739{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
740 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
741},
742{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
743 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
744},
745{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
746 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
747},
748{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
749 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
750},
751{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
752 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
753},
754{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
755 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
756},
757{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
758 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
759},
760{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
761 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
762},
763{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
764 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
765},
766{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
767 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
768},
769{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
770 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
771},
772{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
773 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
774},
775{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
776 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
777},
778{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
779 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
780},
781{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
782 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
783},
784{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
785 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
786},
787{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
788 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
789},
790{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
791 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
792},
793{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
794 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
795},
796{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
797 DECL html_flow, "div", DECL html_attrs, NULL, NULL
798},
799{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
800 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
801},
802{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
803 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
804},
805{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
806 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
807},
808{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
809 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
810},
811{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813},
814{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
815 EMPTY, NULL, DECL param_attrs, NULL, name_attr
816},
817{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
818 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
819},
820{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
821 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
822},
823{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
824 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
825},
826{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
828},
829{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
830 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
831},
832{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
833 DECL select_content, NULL, DECL select_attrs, NULL, NULL
834},
835{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
836 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837},
838{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
839 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840},
841{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
842 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
843},
844{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
845 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
846},
847{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
848 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
849},
850{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852},
853{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
854 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
855},
856{ "table", 0, 0, 0, 0, 0, 0, 0, "",
857 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
858},
859{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
860 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
861},
862{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
863 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
864},
865{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
866 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
867},
868{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
869 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
870},
871{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
872 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
873},
874{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
875 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
876},
877{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
878 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
879},
880{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
881 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
882},
883{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
884 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
885},
886{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
887 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
888},
889{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
890 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
891},
892{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
894}
Owen Taylor3473f882001-02-23 17:55:21 +0000895};
896
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups. The first string of
 * a group is a start tag name; the strings after it, up to the NULL, are
 * the names of open elements which that start tag closes. One extra NULL
 * after the last group ends the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "frameset",   "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
957
/*
 * Elements which are not supposed to hold character data directly: when
 * text shows up while one of them is the current element, a "p" element
 * is implied (see htmlCheckParagraph()). The list is NULL terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char * const htmlNoContentElements[] = {
    "html", "head", NULL
};
970
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array has no NULL terminator: htmlIsScriptAttribute() walks it
 * using sizeof().
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
996
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 * Each element name is given a priority so that the parser can decide
 * what to do with an unexpected end tag: an end tag may only close open
 * elements whose priority is lower than or equal to its own.
 *
 * The last entry (NULL name) terminates the table and carries the
 * priority applied to every element not listed explicitly.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001024
/* One pointer per group of htmlStartClose (to the group's first string);
 * unused slots are NULL. Filled in by htmlInitAutoClose(). */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has built htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1027
1028/************************************************************************
1029 * *
1030 * functions to handle HTML specific data *
1031 * *
1032 ************************************************************************/
1033
1034/**
1035 * htmlInitAutoClose:
1036 *
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1040 */
1041void
1042htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001043 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001044
1045 if (htmlStartCloseIndexinitialized) return;
1046
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048 indx = 0;
1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001050 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001051 while (htmlStartClose[i] != NULL) i++;
1052 i++;
1053 }
1054 htmlStartCloseIndexinitialized = 1;
1055}
1056
1057/**
1058 * htmlTagLookup:
1059 * @tag: The tag name in lowercase
1060 *
1061 * Lookup the HTML tag in the ElementTable
1062 *
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1064 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001065const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001066htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001067 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001068
1069 for (i = 0; i < (sizeof(html40ElementTable) /
1070 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001072 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001073 }
1074 return(NULL);
1075}
1076
1077/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1080 *
1081 * Return value: The "endtag" priority.
1082 **/
1083static int
1084htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001087 while ((htmlEndPriority[i].name != NULL) &&
1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001090
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001092}
1093
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001094
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001095/**
Owen Taylor3473f882001-02-23 17:55:21 +00001096 * htmlCheckAutoClose:
1097 * @newtag: The new tag name
1098 * @oldtag: The old tag name
1099 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001100 * Checks whether the new tag is one of the registered valid tags for
1101 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103 *
1104 * Returns 0 if no, 1 if yes.
1105 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001106static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001109 int i, indx;
1110 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001111
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 if (htmlStartCloseIndexinitialized == 0)
1113 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001114
1115 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 if (closed == NULL)
1119 return (0);
1120 if (xmlStrEqual(BAD_CAST * closed, newtag))
1121 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001122 }
1123
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001124 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001125 i++;
1126 while (htmlStartClose[i] != NULL) {
1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 return (1);
1129 }
1130 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001131 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001132 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001133}
1134
1135/**
1136 * htmlAutoCloseOnClose:
1137 * @ctxt: an HTML parser context
1138 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001139 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001140 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001141 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001142 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001143static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145{
1146 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001147 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001149 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001150
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001151 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001152
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001153 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154 break;
1155 /*
1156 * A missplaced endtag can only close elements with lower
1157 * or equal priority, so if we find an element with higher
1158 * priority before we find an element with
1159 * matching name, we just ignore this endtag
1160 */
1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001163 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164 if (i < 0)
1165 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001166
1167 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001168 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001169 if ((info != NULL) && (info->endTag == 3)) {
1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001172 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001173 }
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001176 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
1178}
1179
1180/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001181 * htmlAutoCloseOnEnd:
1182 * @ctxt: an HTML parser context
1183 *
1184 * Close all remaining tags at the end of the stream
1185 */
1186static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001190
William M. Brack899e64a2003-09-26 18:03:42 +00001191 if (ctxt->nameNr == 0)
1192 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 }
1198}
1199
1200/**
Owen Taylor3473f882001-02-23 17:55:21 +00001201 * htmlAutoClose:
1202 * @ctxt: an HTML parser context
1203 * @newtag: The new tag name or NULL
1204 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001205 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriates closes if possible/needed.
1209 * If newtag is NULL this mean we are at the end of the resource
1210 * and we should check
1211 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001212static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001216 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001220 }
1221 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001222 htmlAutoCloseOnEnd(ctxt);
1223 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001231 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001232 }
Owen Taylor3473f882001-02-23 17:55:21 +00001233}
1234
1235/**
1236 * htmlAutoCloseTag:
1237 * @doc: the HTML document
1238 * @name: The tag name
1239 * @elem: the HTML element
1240 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001241 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of it's children would autoclose the
1244 * given tag.
1245 *
1246 * Returns 1 if autoclose, 0 otherwise
1247 */
1248int
1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 if (xmlStrEqual(name, elem->name)) return(0);
1254 if (htmlCheckAutoClose(elem->name, name)) return(1);
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, name, child)) return(1);
1258 child = child->next;
1259 }
1260 return(0);
1261}
1262
1263/**
1264 * htmlIsAutoClosed:
1265 * @doc: the HTML document
1266 * @elem: the HTML element
1267 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001268 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of it's child
1271 *
1272 * Returns 1 if autoclosed, 0 otherwise
1273 */
1274int
1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276 htmlNodePtr child;
1277
1278 if (elem == NULL) return(1);
1279 child = elem->children;
1280 while (child != NULL) {
1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282 child = child->next;
1283 }
1284 return(0);
1285}
1286
1287/**
1288 * htmlCheckImplied:
1289 * @ctxt: an HTML parser context
1290 * @newtag: The new tag name
1291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001293 * called when a new tag has been detected and generates the
1294 * appropriates implicit tags if missing
1295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296static void
Owen Taylor3473f882001-02-23 17:55:21 +00001297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298 if (!htmlOmittedDefaultValue)
1299 return;
1300 if (xmlStrEqual(newtag, BAD_CAST"html"))
1301 return;
1302 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001303 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306 }
1307 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308 return;
1309 if ((ctxt->nameNr <= 1) &&
1310 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316 /*
1317 * dropped OBJECT ... i you put it first BODY will be
1318 * assumed !
1319 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326 int i;
1327 for (i = 0;i < ctxt->nameNr;i++) {
1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329 return;
1330 }
1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332 return;
1333 }
1334 }
1335
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001336 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339 }
1340}
1341
1342/**
1343 * htmlCheckParagraph
1344 * @ctxt: an HTML parser context
1345 *
1346 * Check whether a p element need to be implied before inserting
1347 * characters in the current element.
1348 *
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350 * in case of error.
1351 */
1352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001353static int
Owen Taylor3473f882001-02-23 17:55:21 +00001354htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355 const xmlChar *tag;
1356 int i;
1357
1358 if (ctxt == NULL)
1359 return(-1);
1360 tag = ctxt->name;
1361 if (tag == NULL) {
1362 htmlAutoClose(ctxt, BAD_CAST"p");
1363 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367 return(1);
1368 }
1369 if (!htmlOmittedDefaultValue)
1370 return(0);
1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001373 htmlAutoClose(ctxt, BAD_CAST"p");
1374 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001375 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378 return(1);
1379 }
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlIsScriptAttribute:
1386 * @name: an attribute name
1387 *
1388 * Check if an attribute is of content type Script
1389 *
1390 * Returns 1 is the attribute is a script 0 otherwise
1391 */
1392int
1393htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001394 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001395
1396 if (name == NULL)
1397 return(0);
1398 /*
1399 * all script attributes start with 'on'
1400 */
1401 if ((name[0] != 'o') || (name[1] != 'n'))
1402 return(0);
1403 for (i = 0;
1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405 i++) {
1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407 return(1);
1408 }
1409 return(0);
1410}
1411
1412/************************************************************************
1413 * *
1414 * The list of HTML predefined entities *
1415 * *
1416 ************************************************************************/
1417
1418
Daniel Veillard22090732001-07-16 00:06:07 +00001419static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001420/*
1421 * the 4 absolute ones, plus apostrophe.
1422 */
1423{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1424{ 38, "amp", "ampersand, U+0026 ISOnum" },
1425{ 39, "apos", "single quote" },
1426{ 60, "lt", "less-than sign, U+003C ISOnum" },
1427{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1428
1429/*
1430 * A bunch still in the 128-255 range
1431 * Replacing them depend really on the charset used.
1432 */
1433{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1434{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1435{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1436{ 163, "pound","pound sign, U+00A3 ISOnum" },
1437{ 164, "curren","currency sign, U+00A4 ISOnum" },
1438{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1439{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1440{ 167, "sect", "section sign, U+00A7 ISOnum" },
1441{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1442{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1443{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1444{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1445{ 172, "not", "not sign, U+00AC ISOnum" },
1446{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1447{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1448{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1449{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1450{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1451{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1452{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1453{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1454{ 181, "micro","micro sign, U+00B5 ISOnum" },
1455{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1456{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1457{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1458{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1459{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1460{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1461{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1462{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1463{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1464{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1465{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1466{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1467{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1468{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1469{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1470{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1471{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1472{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1473{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1474{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1475{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1476{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1477{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1478{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1479{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1480{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1481{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1482{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1483{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1484{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1485{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1486{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1487{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1488{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1489{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1490{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1491{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1492{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1493{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1494{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1495{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1496{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1497{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1498{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1499{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1500{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1501{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1502{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1503{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1504{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1505{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1506{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1507{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1508{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1509{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1510{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1511{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1512{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1513{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1514{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1515{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1516{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1517{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1518{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1519{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1520{ 247, "divide","division sign, U+00F7 ISOnum" },
1521{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1522{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1523{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1524{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1525{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1526{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1527{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1528{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1529
1530{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1531{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1532{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1533{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1534{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1535
1536/*
1537 * Anything below should really be kept as entities references
1538 */
1539{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1540
1541{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1542{ 732, "tilde","small tilde, U+02DC ISOdia" },
1543
1544{ 913, "Alpha","greek capital letter alpha, U+0391" },
1545{ 914, "Beta", "greek capital letter beta, U+0392" },
1546{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1547{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1548{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1549{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1550{ 919, "Eta", "greek capital letter eta, U+0397" },
1551{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1552{ 921, "Iota", "greek capital letter iota, U+0399" },
1553{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001554{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001555{ 924, "Mu", "greek capital letter mu, U+039C" },
1556{ 925, "Nu", "greek capital letter nu, U+039D" },
1557{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1558{ 927, "Omicron","greek capital letter omicron, U+039F" },
1559{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1560{ 929, "Rho", "greek capital letter rho, U+03A1" },
1561{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1562{ 932, "Tau", "greek capital letter tau, U+03A4" },
1563{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1564{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1565{ 935, "Chi", "greek capital letter chi, U+03A7" },
1566{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1567{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1568
1569{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1570{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1571{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1572{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1573{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1574{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1575{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1576{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1577{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1578{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1579{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1580{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1581{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1582{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1583{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1584{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1585{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1586{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1587{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1588{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1589{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1590{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1591{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1592{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1593{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1594{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1595{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1596{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1597
1598{ 8194, "ensp", "en space, U+2002 ISOpub" },
1599{ 8195, "emsp", "em space, U+2003 ISOpub" },
1600{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1601{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1602{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1603{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1604{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1605{ 8211, "ndash","en dash, U+2013 ISOpub" },
1606{ 8212, "mdash","em dash, U+2014 ISOpub" },
1607{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1608{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1609{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1610{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1611{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1612{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1613{ 8224, "dagger","dagger, U+2020 ISOpub" },
1614{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1615
1616{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1617{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1618
1619{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1620
1621{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1622{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1623
1624{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1625{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1626
1627{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1628{ 8260, "frasl","fraction slash, U+2044 NEW" },
1629
1630{ 8364, "euro", "euro sign, U+20AC NEW" },
1631
1632{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1633{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1634{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1635{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1636{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1637{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1638{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1639{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1640{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1641{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1642{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1643{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1644{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1645{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1646{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1647{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1648
1649{ 8704, "forall","for all, U+2200 ISOtech" },
1650{ 8706, "part", "partial differential, U+2202 ISOtech" },
1651{ 8707, "exist","there exists, U+2203 ISOtech" },
1652{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1653{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1654{ 8712, "isin", "element of, U+2208 ISOtech" },
1655{ 8713, "notin","not an element of, U+2209 ISOtech" },
1656{ 8715, "ni", "contains as member, U+220B ISOtech" },
1657{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001658{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001659{ 8722, "minus","minus sign, U+2212 ISOtech" },
1660{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1661{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1662{ 8733, "prop", "proportional to, U+221D ISOtech" },
1663{ 8734, "infin","infinity, U+221E ISOtech" },
1664{ 8736, "ang", "angle, U+2220 ISOamso" },
1665{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1666{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1667{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1668{ 8746, "cup", "union = cup, U+222A ISOtech" },
1669{ 8747, "int", "integral, U+222B ISOtech" },
1670{ 8756, "there4","therefore, U+2234 ISOtech" },
1671{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1672{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1673{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1674{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1675{ 8801, "equiv","identical to, U+2261 ISOtech" },
1676{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1677{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1678{ 8834, "sub", "subset of, U+2282 ISOtech" },
1679{ 8835, "sup", "superset of, U+2283 ISOtech" },
1680{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1681{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1682{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1683{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1684{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1685{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1686{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1687{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1688{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1689{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1690{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1691{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1692{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1693{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1694
1695{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1696{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1697{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1698{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1699
1700};
1701
1702/************************************************************************
1703 * *
1704 * Commodity functions to handle entities *
1705 * *
1706 ************************************************************************/
1707
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to that many xmlChar.
 * On allocation failure the error is reported through htmlErrMemory(),
 * the old buffer is freed and the ENCLOSING FUNCTION returns NULL.
 *
 * The expansion site must therefore have `ctxt` and `<buffer>_size`
 * in scope and be inside a function returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                                buffer##_size * sizeof(xmlChar));       \
    if (growBuffer_tmp == NULL) {                                       \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1722
1723/**
1724 * htmlEntityLookup:
1725 * @name: the entity name
1726 *
1727 * Lookup the given entity in EntitiesTable
1728 *
1729 * TODO: the linear scan is really ugly, an hash table is really needed.
1730 *
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001733const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001734htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001735 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001736
1737 for (i = 0;i < (sizeof(html40EntitiesTable)/
1738 sizeof(html40EntitiesTable[0]));i++) {
1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001740 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 }
1743 return(NULL);
1744}
1745
1746/**
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1749 *
1750 * Lookup the given entity in EntitiesTable
1751 *
1752 * TODO: the linear scan is really ugly, an hash table is really needed.
1753 *
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001756const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001757htmlEntityValueLookup(unsigned int value) {
1758 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001759
1760 for (i = 0;i < (sizeof(html40EntitiesTable)/
1761 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001762 if (html40EntitiesTable[i].value >= value) {
1763 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001764 break;
William M. Brack78637da2003-07-31 14:47:38 +00001765 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001766 }
Owen Taylor3473f882001-02-23 17:55:21 +00001767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
Daniel Veillardce682bc2004-11-05 17:22:25 +00001797 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 if (in == NULL) {
1799 /*
1800 * initialization nothing to do
1801 */
1802 *outlen = 0;
1803 *inlen = 0;
1804 return(0);
1805 }
1806 inend = in + (*inlen);
1807 outend = out + (*outlen);
1808 while (in < inend) {
1809 d = *in++;
1810 if (d < 0x80) { c= d; trailing= 0; }
1811 else if (d < 0xC0) {
1812 /* trailing byte in leading position */
1813 *outlen = out - outstart;
1814 *inlen = processed - instart;
1815 return(-2);
1816 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1817 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1818 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1819 else {
1820 /* no chance for this in Ascii */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 }
1825
1826 if (inend - in < trailing) {
1827 break;
1828 }
1829
1830 for ( ; trailing; trailing--) {
1831 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1832 break;
1833 c <<= 6;
1834 c |= d & 0x3F;
1835 }
1836
1837 /* assertion: c is a single UTF-4 value */
1838 if (c < 0x80) {
1839 if (out + 1 >= outend)
1840 break;
1841 *out++ = c;
1842 } else {
1843 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001844 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001845
1846 /*
1847 * Try to lookup a predefined HTML entity for it
1848 */
1849
1850 ent = htmlEntityValueLookup(c);
1851 if (ent == NULL) {
1852 /* no chance for this in Ascii */
1853 *outlen = out - outstart;
1854 *inlen = processed - instart;
1855 return(-2);
1856 }
1857 len = strlen(ent->name);
1858 if (out + 2 + len >= outend)
1859 break;
1860 *out++ = '&';
1861 memcpy(out, ent->name, len);
1862 out += len;
1863 *out++ = ';';
1864 }
1865 processed = in;
1866 }
1867 *outlen = out - outstart;
1868 *inlen = processed - instart;
1869 return(0);
1870}
1871
1872/**
1873 * htmlEncodeEntities:
1874 * @out: a pointer to an array of bytes to store the result
1875 * @outlen: the length of @out
1876 * @in: a pointer to an array of UTF-8 chars
1877 * @inlen: the length of @in
1878 * @quoteChar: the quote character to escape (' or ") or zero.
1879 *
1880 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1881 * plus HTML entities block of chars out.
1882 *
1883 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1884 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001885 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001886 * The value of @outlen after return is the number of octets consumed.
1887 */
1888int
1889htmlEncodeEntities(unsigned char* out, int *outlen,
1890 const unsigned char* in, int *inlen, int quoteChar) {
1891 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001892 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001893 const unsigned char* outstart = out;
1894 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001895 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001896 unsigned int c, d;
1897 int trailing;
1898
Daniel Veillardce682bc2004-11-05 17:22:25 +00001899 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1900 return(-1);
1901 outend = out + (*outlen);
1902 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001903 while (in < inend) {
1904 d = *in++;
1905 if (d < 0x80) { c= d; trailing= 0; }
1906 else if (d < 0xC0) {
1907 /* trailing byte in leading position */
1908 *outlen = out - outstart;
1909 *inlen = processed - instart;
1910 return(-2);
1911 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1912 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1913 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1914 else {
1915 /* no chance for this in Ascii */
1916 *outlen = out - outstart;
1917 *inlen = processed - instart;
1918 return(-2);
1919 }
1920
1921 if (inend - in < trailing)
1922 break;
1923
1924 while (trailing--) {
1925 if (((d= *in++) & 0xC0) != 0x80) {
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(-2);
1929 }
1930 c <<= 6;
1931 c |= d & 0x3F;
1932 }
1933
1934 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001935 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1936 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001937 if (out >= outend)
1938 break;
1939 *out++ = c;
1940 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001941 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001942 const char *cp;
1943 char nbuf[16];
1944 int len;
1945
1946 /*
1947 * Try to lookup a predefined HTML entity for it
1948 */
1949 ent = htmlEntityValueLookup(c);
1950 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001951 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001952 cp = nbuf;
1953 }
1954 else
1955 cp = ent->name;
1956 len = strlen(cp);
1957 if (out + 2 + len > outend)
1958 break;
1959 *out++ = '&';
1960 memcpy(out, cp, len);
1961 out += len;
1962 *out++ = ';';
1963 }
1964 processed = in;
1965 }
1966 *outlen = out - outstart;
1967 *inlen = processed - instart;
1968 return(0);
1969}
1970
Owen Taylor3473f882001-02-23 17:55:21 +00001971/************************************************************************
1972 * *
1973 * Commodity functions to handle streams *
1974 * *
1975 ************************************************************************/
1976
1977/**
Owen Taylor3473f882001-02-23 17:55:21 +00001978 * htmlNewInputStream:
1979 * @ctxt: an HTML parser context
1980 *
1981 * Create a new input stream structure
1982 * Returns the new input stream or NULL
1983 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001985htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1986 htmlParserInputPtr input;
1987
1988 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1989 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001990 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001991 return(NULL);
1992 }
1993 memset(input, 0, sizeof(htmlParserInput));
1994 input->filename = NULL;
1995 input->directory = NULL;
1996 input->base = NULL;
1997 input->cur = NULL;
1998 input->buf = NULL;
1999 input->line = 1;
2000 input->col = 1;
2001 input->buf = NULL;
2002 input->free = NULL;
2003 input->version = NULL;
2004 input->consumed = 0;
2005 input->length = 0;
2006 return(input);
2007}
2008
2009
2010/************************************************************************
2011 * *
2012 * Commodity functions, cleanup needed ? *
2013 * *
2014 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but I don't want to risk any
 * binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002029
2030/**
2031 * areBlanks:
2032 * @ctxt: an HTML parser context
2033 * @str: a xmlChar *
2034 * @len: the size of @str
2035 *
2036 * Is this a sequence of blank chars that one can ignore ?
2037 *
2038 * Returns 1 if ignorable 0 otherwise.
2039 */
2040
2041static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002042 unsigned int i;
2043 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002044 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002045 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002046
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002047 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002048 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002049
2050 if (CUR == 0) return(1);
2051 if (CUR != '<') return(0);
2052 if (ctxt->name == NULL)
2053 return(1);
2054 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2055 return(1);
2056 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2057 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002058
2059 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2060 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2061 dtd = xmlGetIntSubset(ctxt->myDoc);
2062 if (dtd != NULL && dtd->ExternalID != NULL) {
2063 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2064 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2065 return(1);
2066 }
2067 }
2068
Owen Taylor3473f882001-02-23 17:55:21 +00002069 if (ctxt->node == NULL) return(0);
2070 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002071 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2072 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002073 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002074 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2075 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002076 /* keep ws in constructs like ...<b> </b>...
2077 for all tags "b" allowing PCDATA */
2078 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2079 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2080 return(0);
2081 }
2082 }
Owen Taylor3473f882001-02-23 17:55:21 +00002083 } else if (xmlNodeIsText(lastChild)) {
2084 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002085 } else {
2086 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2087 for all tags "p" allowing PCDATA */
2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2090 return(0);
2091 }
2092 }
Owen Taylor3473f882001-02-23 17:55:21 +00002093 }
2094 return(1);
2095}
2096
2097/**
Owen Taylor3473f882001-02-23 17:55:21 +00002098 * htmlNewDocNoDtD:
2099 * @URI: URI for the dtd, or NULL
2100 * @ExternalID: the external ID of the DTD, or NULL
2101 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002102 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2103 * are NULL
2104 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002105 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002106 */
2107htmlDocPtr
2108htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2109 xmlDocPtr cur;
2110
2111 /*
2112 * Allocate a new document and fill the fields.
2113 */
2114 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2115 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002116 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002117 return(NULL);
2118 }
2119 memset(cur, 0, sizeof(xmlDoc));
2120
2121 cur->type = XML_HTML_DOCUMENT_NODE;
2122 cur->version = NULL;
2123 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002124 cur->doc = cur;
2125 cur->name = NULL;
2126 cur->children = NULL;
2127 cur->extSubset = NULL;
2128 cur->oldNs = NULL;
2129 cur->encoding = NULL;
2130 cur->standalone = 1;
2131 cur->compression = 0;
2132 cur->ids = NULL;
2133 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002134 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002135 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002136 if ((ExternalID != NULL) ||
2137 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002138 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002139 return(cur);
2140}
2141
2142/**
2143 * htmlNewDoc:
2144 * @URI: URI for the dtd, or NULL
2145 * @ExternalID: the external ID of the DTD, or NULL
2146 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002147 * Creates a new HTML document
2148 *
Owen Taylor3473f882001-02-23 17:55:21 +00002149 * Returns a new document
2150 */
2151htmlDocPtr
2152htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2153 if ((URI == NULL) && (ExternalID == NULL))
2154 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002155 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2156 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002157
2158 return(htmlNewDocNoDtD(URI, ExternalID));
2159}
2160
2161
/************************************************************************
 *									*
 *			The parser itself				*
 *	Relates to http://www.w3.org/TR/html40				*
 *									*
 ************************************************************************/
2174
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002175static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002176
Owen Taylor3473f882001-02-23 17:55:21 +00002177/**
2178 * htmlParseHTMLName:
2179 * @ctxt: an HTML parser context
2180 *
2181 * parse an HTML tag or attribute name, note that we convert it to lowercase
2182 * since HTML names are not case-sensitive.
2183 *
2184 * Returns the Tag Name parsed or NULL
2185 */
2186
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002187static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002188htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002189 int i = 0;
2190 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2191
William M. Brackd1757ab2004-10-02 22:07:48 +00002192 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002193 (CUR != ':')) return(NULL);
2194
2195 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002196 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002197 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2198 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2199 else loc[i] = CUR;
2200 i++;
2201
2202 NEXT;
2203 }
2204
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002205 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002206}
2207
2208/**
2209 * htmlParseName:
2210 * @ctxt: an HTML parser context
2211 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002212 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002213 *
2214 * Returns the Name parsed or NULL
2215 */
2216
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002217static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002218htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002219 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002220 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002221 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002222
2223 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002224
2225 /*
2226 * Accelerator for simple ASCII names
2227 */
2228 in = ctxt->input->cur;
2229 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2230 ((*in >= 0x41) && (*in <= 0x5A)) ||
2231 (*in == '_') || (*in == ':')) {
2232 in++;
2233 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2234 ((*in >= 0x41) && (*in <= 0x5A)) ||
2235 ((*in >= 0x30) && (*in <= 0x39)) ||
2236 (*in == '_') || (*in == '-') ||
2237 (*in == ':') || (*in == '.'))
2238 in++;
2239 if ((*in > 0) && (*in < 0x80)) {
2240 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002241 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002242 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002243 ctxt->nbChars += count;
2244 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002245 return(ret);
2246 }
2247 }
2248 return(htmlParseNameComplex(ctxt));
2249}
2250
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002251static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002252htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002253 int len = 0, l;
2254 int c;
2255 int count = 0;
2256
2257 /*
2258 * Handler for more complex cases
2259 */
2260 GROW;
2261 c = CUR_CHAR(l);
2262 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2263 (!IS_LETTER(c) && (c != '_') &&
2264 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002265 return(NULL);
2266 }
2267
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002268 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2269 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2270 (c == '.') || (c == '-') ||
2271 (c == '_') || (c == ':') ||
2272 (IS_COMBINING(c)) ||
2273 (IS_EXTENDER(c)))) {
2274 if (count++ > 100) {
2275 count = 0;
2276 GROW;
2277 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002278 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002279 NEXTL(l);
2280 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002281 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002282 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002283}
2284
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002285
Owen Taylor3473f882001-02-23 17:55:21 +00002286/**
2287 * htmlParseHTMLAttribute:
2288 * @ctxt: an HTML parser context
2289 * @stop: a char stop value
2290 *
2291 * parse an HTML attribute value till the stop (quote), if
2292 * stop is 0 then it stops at the first space
2293 *
2294 * Returns the attribute parsed or NULL
2295 */
2296
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002297static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002298htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2299 xmlChar *buffer = NULL;
2300 int buffer_size = 0;
2301 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002302 const xmlChar *name = NULL;
2303 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002304 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002305
2306 /*
2307 * allocate a translation buffer.
2308 */
2309 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002310 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002311 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002312 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002313 return(NULL);
2314 }
2315 out = buffer;
2316
2317 /*
2318 * Ok loop until we reach one of the ending chars
2319 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002320 while ((CUR != 0) && (CUR != stop)) {
2321 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002322 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002323 if (CUR == '&') {
2324 if (NXT(1) == '#') {
2325 unsigned int c;
2326 int bits;
2327
2328 c = htmlParseCharRef(ctxt);
2329 if (c < 0x80)
2330 { *out++ = c; bits= -6; }
2331 else if (c < 0x800)
2332 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2333 else if (c < 0x10000)
2334 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2335 else
2336 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2337
2338 for ( ; bits >= 0; bits-= 6) {
2339 *out++ = ((c >> bits) & 0x3F) | 0x80;
2340 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002341
2342 if (out - buffer > buffer_size - 100) {
2343 int indx = out - buffer;
2344
2345 growBuffer(buffer);
2346 out = &buffer[indx];
2347 }
Owen Taylor3473f882001-02-23 17:55:21 +00002348 } else {
2349 ent = htmlParseEntityRef(ctxt, &name);
2350 if (name == NULL) {
2351 *out++ = '&';
2352 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002354
2355 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002356 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002357 }
2358 } else if (ent == NULL) {
2359 *out++ = '&';
2360 cur = name;
2361 while (*cur != 0) {
2362 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002363 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002364
2365 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002366 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002367 }
2368 *out++ = *cur++;
2369 }
Owen Taylor3473f882001-02-23 17:55:21 +00002370 } else {
2371 unsigned int c;
2372 int bits;
2373
2374 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002375 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002376
2377 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002378 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002379 }
Daniel Veillard48519092006-10-17 15:56:35 +00002380 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002381 if (c < 0x80)
2382 { *out++ = c; bits= -6; }
2383 else if (c < 0x800)
2384 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2385 else if (c < 0x10000)
2386 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2387 else
2388 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2389
2390 for ( ; bits >= 0; bits-= 6) {
2391 *out++ = ((c >> bits) & 0x3F) | 0x80;
2392 }
Owen Taylor3473f882001-02-23 17:55:21 +00002393 }
2394 }
2395 } else {
2396 unsigned int c;
2397 int bits, l;
2398
2399 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002400 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002401
2402 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002403 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002404 }
2405 c = CUR_CHAR(l);
2406 if (c < 0x80)
2407 { *out++ = c; bits= -6; }
2408 else if (c < 0x800)
2409 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2410 else if (c < 0x10000)
2411 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2412 else
2413 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2414
2415 for ( ; bits >= 0; bits-= 6) {
2416 *out++ = ((c >> bits) & 0x3F) | 0x80;
2417 }
2418 NEXT;
2419 }
2420 }
2421 *out++ = 0;
2422 return(buffer);
2423}
2424
2425/**
Owen Taylor3473f882001-02-23 17:55:21 +00002426 * htmlParseEntityRef:
2427 * @ctxt: an HTML parser context
2428 * @str: location to store the entity name
2429 *
2430 * parse an HTML ENTITY references
2431 *
2432 * [68] EntityRef ::= '&' Name ';'
2433 *
2434 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2435 * if non-NULL *str will have to be freed by the caller.
2436 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002437const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002438htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2439 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002440 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002441
2442 if (str != NULL) *str = NULL;
2443 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002444
2445 if (CUR == '&') {
2446 NEXT;
2447 name = htmlParseName(ctxt);
2448 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002449 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2450 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002451 } else {
2452 GROW;
2453 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002454 if (str != NULL)
2455 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002456
2457 /*
2458 * Lookup the entity in the table.
2459 */
2460 ent = htmlEntityLookup(name);
2461 if (ent != NULL) /* OK that's ugly !!! */
2462 NEXT;
2463 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002464 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2465 "htmlParseEntityRef: expecting ';'\n",
2466 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002467 if (str != NULL)
2468 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002469 }
2470 }
2471 }
2472 return(ent);
2473}
2474
2475/**
2476 * htmlParseAttValue:
2477 * @ctxt: an HTML parser context
2478 *
2479 * parse a value for an attribute
2480 * Note: the parser won't do substitution of entities here, this
2481 * will be handled later in xmlStringGetNodeList, unless it was
2482 * asked for ctxt->replaceEntities != 0
2483 *
2484 * Returns the AttValue parsed or NULL.
2485 */
2486
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002487static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002488htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2489 xmlChar *ret = NULL;
2490
2491 if (CUR == '"') {
2492 NEXT;
2493 ret = htmlParseHTMLAttribute(ctxt, '"');
2494 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002495 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2496 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002497 } else
2498 NEXT;
2499 } else if (CUR == '\'') {
2500 NEXT;
2501 ret = htmlParseHTMLAttribute(ctxt, '\'');
2502 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002503 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2504 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002505 } else
2506 NEXT;
2507 } else {
2508 /*
2509 * That's an HTMLism, the attribute value may not be quoted
2510 */
2511 ret = htmlParseHTMLAttribute(ctxt, 0);
2512 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002513 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2514 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002515 }
2516 }
2517 return(ret);
2518}
2519
2520/**
2521 * htmlParseSystemLiteral:
2522 * @ctxt: an HTML parser context
2523 *
2524 * parse an HTML Literal
2525 *
2526 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2527 *
2528 * Returns the SystemLiteral parsed or NULL
2529 */
2530
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002531static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002532htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2533 const xmlChar *q;
2534 xmlChar *ret = NULL;
2535
2536 if (CUR == '"') {
2537 NEXT;
2538 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002539 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002540 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002541 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002542 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2543 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002544 } else {
2545 ret = xmlStrndup(q, CUR_PTR - q);
2546 NEXT;
2547 }
2548 } else if (CUR == '\'') {
2549 NEXT;
2550 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002551 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002552 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002553 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002554 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2555 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002556 } else {
2557 ret = xmlStrndup(q, CUR_PTR - q);
2558 NEXT;
2559 }
2560 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002561 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2562 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002563 }
2564
2565 return(ret);
2566}
2567
2568/**
2569 * htmlParsePubidLiteral:
2570 * @ctxt: an HTML parser context
2571 *
2572 * parse an HTML public literal
2573 *
2574 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2575 *
2576 * Returns the PubidLiteral parsed or NULL.
2577 */
2578
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002579static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002580htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2581 const xmlChar *q;
2582 xmlChar *ret = NULL;
2583 /*
2584 * Name ::= (Letter | '_') (NameChar)*
2585 */
2586 if (CUR == '"') {
2587 NEXT;
2588 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002589 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002590 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002591 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2592 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002593 } else {
2594 ret = xmlStrndup(q, CUR_PTR - q);
2595 NEXT;
2596 }
2597 } else if (CUR == '\'') {
2598 NEXT;
2599 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002600 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002601 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002602 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002603 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2604 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002605 } else {
2606 ret = xmlStrndup(q, CUR_PTR - q);
2607 NEXT;
2608 }
2609 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002610 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2611 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002612 }
2613
2614 return(ret);
2615}
2616
2617/**
2618 * htmlParseScript:
2619 * @ctxt: an HTML parser context
2620 *
2621 * parse the content of an HTML SCRIPT or STYLE element
2622 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2623 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2624 * http://www.w3.org/TR/html4/types.html#type-script
2625 * http://www.w3.org/TR/html4/types.html#h-6.15
2626 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2627 *
2628 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2629 * element and the value of intrinsic event attributes. User agents must
2630 * not evaluate script data as HTML markup but instead must pass it on as
2631 * data to a script engine.
2632 * NOTES:
2633 * - The content is passed like CDATA
2634 * - the attributes for style and scripting "onXXX" are also described
2635 * as CDATA but SGML allows entities references in attributes so their
2636 * processing is identical as other attributes
2637 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002638static void
Owen Taylor3473f882001-02-23 17:55:21 +00002639htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002640 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002641 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002642 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002643
2644 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002645 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002646 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002647 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2648 (NXT(3) == '-')) {
2649 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2650 if (ctxt->sax->cdataBlock!= NULL) {
2651 /*
2652 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2653 */
2654 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002655 } else if (ctxt->sax->characters != NULL) {
2656 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002657 }
2658 }
2659 nbchar = 0;
2660 htmlParseComment(ctxt);
Daniel Veillard358fef42005-07-13 16:37:38 +00002661 cur = CUR_CHAR(l);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002662 continue;
2663 } else if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002664 /*
2665 * One should break here, the specification is clear:
2666 * Authors should therefore escape "</" within the content.
2667 * Escape mechanisms are specific to each scripting or
2668 * style sheet language.
2669 *
2670 * In recovery mode, only break if end tag match the
2671 * current tag, effectively ignoring all tags inside the
2672 * script/style block and treating the entire block as
2673 * CDATA.
2674 */
2675 if (ctxt->recovery) {
2676 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2677 xmlStrlen(ctxt->name)) == 0)
2678 {
2679 break; /* while */
2680 } else {
2681 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002682 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002683 ctxt->name, NULL);
2684 }
2685 } else {
2686 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2687 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2688 {
2689 break; /* while */
2690 }
2691 }
Owen Taylor3473f882001-02-23 17:55:21 +00002692 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002693 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002694 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2695 if (ctxt->sax->cdataBlock!= NULL) {
2696 /*
2697 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2698 */
2699 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002700 } else if (ctxt->sax->characters != NULL) {
2701 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002702 }
2703 nbchar = 0;
2704 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002705 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002706 NEXTL(l);
2707 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002708 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002709
Daniel Veillard68716a72006-10-16 09:32:17 +00002710 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002711 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2712 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002713 NEXT;
2714 }
2715
2716 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2717 if (ctxt->sax->cdataBlock!= NULL) {
2718 /*
2719 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2720 */
2721 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002722 } else if (ctxt->sax->characters != NULL) {
2723 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002724 }
2725 }
2726}
2727
2728
2729/**
2730 * htmlParseCharData:
2731 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002732 *
2733 * parse a CharData section.
2734 * if we are within a CDATA section ']]>' marks an end of section.
2735 *
2736 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2737 */
2738
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002739static void
2740htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002741 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2742 int nbchar = 0;
2743 int cur, l;
2744
2745 SHRINK;
2746 cur = CUR_CHAR(l);
2747 while (((cur != '<') || (ctxt->token == '<')) &&
2748 ((cur != '&') || (ctxt->token == '&')) &&
2749 (IS_CHAR(cur))) {
2750 COPY_BUF(l,buf,nbchar,cur);
2751 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2752 /*
2753 * Ok the segment is to be consumed as chars.
2754 */
2755 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2756 if (areBlanks(ctxt, buf, nbchar)) {
2757 if (ctxt->sax->ignorableWhitespace != NULL)
2758 ctxt->sax->ignorableWhitespace(ctxt->userData,
2759 buf, nbchar);
2760 } else {
2761 htmlCheckParagraph(ctxt);
2762 if (ctxt->sax->characters != NULL)
2763 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2764 }
2765 }
2766 nbchar = 0;
2767 }
2768 NEXTL(l);
2769 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002770 if (cur == 0) {
2771 SHRINK;
2772 GROW;
2773 cur = CUR_CHAR(l);
2774 }
Owen Taylor3473f882001-02-23 17:55:21 +00002775 }
2776 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002777 buf[nbchar] = 0;
2778
Owen Taylor3473f882001-02-23 17:55:21 +00002779 /*
2780 * Ok the segment is to be consumed as chars.
2781 */
2782 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2783 if (areBlanks(ctxt, buf, nbchar)) {
2784 if (ctxt->sax->ignorableWhitespace != NULL)
2785 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2786 } else {
2787 htmlCheckParagraph(ctxt);
2788 if (ctxt->sax->characters != NULL)
2789 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2790 }
2791 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002792 } else {
2793 /*
2794 * Loop detection
2795 */
2796 if (cur == 0)
2797 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002798 }
2799}
2800
2801/**
2802 * htmlParseExternalID:
2803 * @ctxt: an HTML parser context
2804 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002805 *
2806 * Parse an External ID or a Public ID
2807 *
Owen Taylor3473f882001-02-23 17:55:21 +00002808 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2809 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2810 *
2811 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2812 *
2813 * Returns the function returns SystemLiteral and in the second
2814 * case publicID receives PubidLiteral, is strict is off
2815 * it is possible to return NULL and have publicID set.
2816 */
2817
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002818static xmlChar *
2819htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002820 xmlChar *URI = NULL;
2821
2822 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2823 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2824 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2825 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002826 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002827 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2828 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002829 }
2830 SKIP_BLANKS;
2831 URI = htmlParseSystemLiteral(ctxt);
2832 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002833 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2834 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002835 }
2836 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2837 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2838 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2839 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002840 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002841 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2842 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002843 }
2844 SKIP_BLANKS;
2845 *publicID = htmlParsePubidLiteral(ctxt);
2846 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002847 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2848 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2849 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002850 }
2851 SKIP_BLANKS;
2852 if ((CUR == '"') || (CUR == '\'')) {
2853 URI = htmlParseSystemLiteral(ctxt);
2854 }
2855 }
2856 return(URI);
2857}
2858
2859/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002860 * xmlParsePI:
2861 * @ctxt: an XML parser context
2862 *
2863 * parse an XML Processing Instruction.
2864 *
2865 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2866 */
2867static void
2868htmlParsePI(htmlParserCtxtPtr ctxt) {
2869 xmlChar *buf = NULL;
2870 int len = 0;
2871 int size = HTML_PARSER_BUFFER_SIZE;
2872 int cur, l;
2873 const xmlChar *target;
2874 xmlParserInputState state;
2875 int count = 0;
2876
2877 if ((RAW == '<') && (NXT(1) == '?')) {
2878 state = ctxt->instate;
2879 ctxt->instate = XML_PARSER_PI;
2880 /*
2881 * this is a Processing Instruction.
2882 */
2883 SKIP(2);
2884 SHRINK;
2885
2886 /*
2887 * Parse the target name and check for special support like
2888 * namespace.
2889 */
2890 target = htmlParseName(ctxt);
2891 if (target != NULL) {
2892 if (RAW == '>') {
2893 SKIP(1);
2894
2895 /*
2896 * SAX: PI detected.
2897 */
2898 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2899 (ctxt->sax->processingInstruction != NULL))
2900 ctxt->sax->processingInstruction(ctxt->userData,
2901 target, NULL);
2902 ctxt->instate = state;
2903 return;
2904 }
2905 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2906 if (buf == NULL) {
2907 htmlErrMemory(ctxt, NULL);
2908 ctxt->instate = state;
2909 return;
2910 }
2911 cur = CUR;
2912 if (!IS_BLANK(cur)) {
2913 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2914 "ParsePI: PI %s space expected\n", target, NULL);
2915 }
2916 SKIP_BLANKS;
2917 cur = CUR_CHAR(l);
2918 while (IS_CHAR(cur) && (cur != '>')) {
2919 if (len + 5 >= size) {
2920 xmlChar *tmp;
2921
2922 size *= 2;
2923 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2924 if (tmp == NULL) {
2925 htmlErrMemory(ctxt, NULL);
2926 xmlFree(buf);
2927 ctxt->instate = state;
2928 return;
2929 }
2930 buf = tmp;
2931 }
2932 count++;
2933 if (count > 50) {
2934 GROW;
2935 count = 0;
2936 }
2937 COPY_BUF(l,buf,len,cur);
2938 NEXTL(l);
2939 cur = CUR_CHAR(l);
2940 if (cur == 0) {
2941 SHRINK;
2942 GROW;
2943 cur = CUR_CHAR(l);
2944 }
2945 }
2946 buf[len] = 0;
2947 if (cur != '>') {
2948 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2949 "ParsePI: PI %s never end ...\n", target, NULL);
2950 } else {
2951 SKIP(1);
2952
2953 /*
2954 * SAX: PI detected.
2955 */
2956 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2957 (ctxt->sax->processingInstruction != NULL))
2958 ctxt->sax->processingInstruction(ctxt->userData,
2959 target, buf);
2960 }
2961 xmlFree(buf);
2962 } else {
2963 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2964 "PI is not started correctly", NULL, NULL);
2965 }
2966 ctxt->instate = state;
2967 }
2968}
2969
2970/**
Owen Taylor3473f882001-02-23 17:55:21 +00002971 * htmlParseComment:
2972 * @ctxt: an HTML parser context
2973 *
2974 * Parse an XML (SGML) comment <!-- .... -->
2975 *
2976 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2977 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002978static void
Owen Taylor3473f882001-02-23 17:55:21 +00002979htmlParseComment(htmlParserCtxtPtr ctxt) {
2980 xmlChar *buf = NULL;
2981 int len;
2982 int size = HTML_PARSER_BUFFER_SIZE;
2983 int q, ql;
2984 int r, rl;
2985 int cur, l;
2986 xmlParserInputState state;
2987
2988 /*
2989 * Check that there is a comment right here.
2990 */
2991 if ((RAW != '<') || (NXT(1) != '!') ||
2992 (NXT(2) != '-') || (NXT(3) != '-')) return;
2993
2994 state = ctxt->instate;
2995 ctxt->instate = XML_PARSER_COMMENT;
2996 SHRINK;
2997 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002998 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002999 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003000 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003001 ctxt->instate = state;
3002 return;
3003 }
3004 q = CUR_CHAR(ql);
3005 NEXTL(ql);
3006 r = CUR_CHAR(rl);
3007 NEXTL(rl);
3008 cur = CUR_CHAR(l);
3009 len = 0;
3010 while (IS_CHAR(cur) &&
3011 ((cur != '>') ||
3012 (r != '-') || (q != '-'))) {
3013 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003014 xmlChar *tmp;
3015
Owen Taylor3473f882001-02-23 17:55:21 +00003016 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003017 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3018 if (tmp == NULL) {
3019 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003020 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003021 ctxt->instate = state;
3022 return;
3023 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003024 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003025 }
3026 COPY_BUF(ql,buf,len,q);
3027 q = r;
3028 ql = rl;
3029 r = cur;
3030 rl = l;
3031 NEXTL(l);
3032 cur = CUR_CHAR(l);
3033 if (cur == 0) {
3034 SHRINK;
3035 GROW;
3036 cur = CUR_CHAR(l);
3037 }
3038 }
3039 buf[len] = 0;
3040 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003041 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3042 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003043 xmlFree(buf);
3044 } else {
3045 NEXT;
3046 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3047 (!ctxt->disableSAX))
3048 ctxt->sax->comment(ctxt->userData, buf);
3049 xmlFree(buf);
3050 }
3051 ctxt->instate = state;
3052}
3053
3054/**
3055 * htmlParseCharRef:
3056 * @ctxt: an HTML parser context
3057 *
3058 * parse Reference declarations
3059 *
3060 * [66] CharRef ::= '&#' [0-9]+ ';' |
3061 * '&#x' [0-9a-fA-F]+ ';'
3062 *
3063 * Returns the value parsed (as an int)
3064 */
3065int
3066htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3067 int val = 0;
3068
Daniel Veillarda03e3652004-11-02 18:45:30 +00003069 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3070 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3071 "htmlParseCharRef: context error\n",
3072 NULL, NULL);
3073 return(0);
3074 }
Owen Taylor3473f882001-02-23 17:55:21 +00003075 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003076 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003077 SKIP(3);
3078 while (CUR != ';') {
3079 if ((CUR >= '0') && (CUR <= '9'))
3080 val = val * 16 + (CUR - '0');
3081 else if ((CUR >= 'a') && (CUR <= 'f'))
3082 val = val * 16 + (CUR - 'a') + 10;
3083 else if ((CUR >= 'A') && (CUR <= 'F'))
3084 val = val * 16 + (CUR - 'A') + 10;
3085 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003086 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3087 "htmlParseCharRef: invalid hexadecimal value\n",
3088 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003089 return(0);
3090 }
3091 NEXT;
3092 }
3093 if (CUR == ';')
3094 NEXT;
3095 } else if ((CUR == '&') && (NXT(1) == '#')) {
3096 SKIP(2);
3097 while (CUR != ';') {
3098 if ((CUR >= '0') && (CUR <= '9'))
3099 val = val * 10 + (CUR - '0');
3100 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003101 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3102 "htmlParseCharRef: invalid decimal value\n",
3103 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003104 return(0);
3105 }
3106 NEXT;
3107 }
3108 if (CUR == ';')
3109 NEXT;
3110 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003111 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3112 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003113 }
3114 /*
3115 * Check the value IS_CHAR ...
3116 */
3117 if (IS_CHAR(val)) {
3118 return(val);
3119 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003120 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3121 "htmlParseCharRef: invalid xmlChar value %d\n",
3122 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003123 }
3124 return(0);
3125}
3126
3127
3128/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003129 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003130 * @ctxt: an HTML parser context
3131 *
3132 * parse a DOCTYPE declaration
3133 *
3134 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3135 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3136 */
3137
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003138static void
Owen Taylor3473f882001-02-23 17:55:21 +00003139htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003140 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003141 xmlChar *ExternalID = NULL;
3142 xmlChar *URI = NULL;
3143
3144 /*
3145 * We know that '<!DOCTYPE' has been detected.
3146 */
3147 SKIP(9);
3148
3149 SKIP_BLANKS;
3150
3151 /*
3152 * Parse the DOCTYPE name.
3153 */
3154 name = htmlParseName(ctxt);
3155 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003156 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3157 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3158 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003159 }
3160 /*
3161 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3162 */
3163
3164 SKIP_BLANKS;
3165
3166 /*
3167 * Check for SystemID and ExternalID
3168 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003169 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003170 SKIP_BLANKS;
3171
3172 /*
3173 * We should be at the end of the DOCTYPE declaration.
3174 */
3175 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003176 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3177 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003178 /* We shouldn't try to resynchronize ... */
3179 }
3180 NEXT;
3181
3182 /*
3183 * Create or update the document accordingly to the DOCTYPE
3184 */
3185 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3186 (!ctxt->disableSAX))
3187 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3188
3189 /*
3190 * Cleanup, since we don't use all those identifiers
3191 */
3192 if (URI != NULL) xmlFree(URI);
3193 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003194}
3195
3196/**
3197 * htmlParseAttribute:
3198 * @ctxt: an HTML parser context
3199 * @value: a xmlChar ** used to store the value of the attribute
3200 *
3201 * parse an attribute
3202 *
3203 * [41] Attribute ::= Name Eq AttValue
3204 *
3205 * [25] Eq ::= S? '=' S?
3206 *
3207 * With namespace:
3208 *
3209 * [NS 11] Attribute ::= QName Eq AttValue
3210 *
3211 * Also the case QName == xmlns:??? is handled independently as a namespace
3212 * definition.
3213 *
3214 * Returns the attribute name, and the value in *value.
3215 */
3216
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003217static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003218htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003219 const xmlChar *name;
3220 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003221
3222 *value = NULL;
3223 name = htmlParseHTMLName(ctxt);
3224 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003225 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3226 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003227 return(NULL);
3228 }
3229
3230 /*
3231 * read the value
3232 */
3233 SKIP_BLANKS;
3234 if (CUR == '=') {
3235 NEXT;
3236 SKIP_BLANKS;
3237 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003238 } else if (htmlIsBooleanAttr(name)) {
3239 /*
3240 * assume a minimized attribute
3241 */
3242 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003243 }
3244
3245 *value = val;
3246 return(name);
3247}
3248
3249/**
3250 * htmlCheckEncoding:
3251 * @ctxt: an HTML parser context
3252 * @attvalue: the attribute value
3253 *
3254 * Checks an http-equiv attribute from a Meta tag to detect
3255 * the encoding
3256 * If a new encoding is detected the parser is switched to decode
3257 * it and pass UTF8
3258 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003259static void
Owen Taylor3473f882001-02-23 17:55:21 +00003260htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3261 const xmlChar *encoding;
3262
3263 if ((ctxt == NULL) || (attvalue == NULL))
3264 return;
3265
3266 /* do not change encoding */
3267 if (ctxt->input->encoding != NULL)
3268 return;
3269
3270 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3271 if (encoding != NULL) {
3272 encoding += 8;
3273 } else {
3274 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3275 if (encoding != NULL)
3276 encoding += 9;
3277 }
3278 if (encoding != NULL) {
3279 xmlCharEncoding enc;
3280 xmlCharEncodingHandlerPtr handler;
3281
3282 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3283
3284 if (ctxt->input->encoding != NULL)
3285 xmlFree((xmlChar *) ctxt->input->encoding);
3286 ctxt->input->encoding = xmlStrdup(encoding);
3287
3288 enc = xmlParseCharEncoding((const char *) encoding);
3289 /*
3290 * registered set of known encodings
3291 */
3292 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003293 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3294 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3295 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3296 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3297 (ctxt->input->buf != NULL) &&
3298 (ctxt->input->buf->encoder == NULL)) {
3299 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3300 "htmlCheckEncoding: wrong encoding meta\n",
3301 NULL, NULL);
3302 } else {
3303 xmlSwitchEncoding(ctxt, enc);
3304 }
Owen Taylor3473f882001-02-23 17:55:21 +00003305 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3306 } else {
3307 /*
3308 * fallback for unknown encodings
3309 */
3310 handler = xmlFindCharEncodingHandler((const char *) encoding);
3311 if (handler != NULL) {
3312 xmlSwitchToEncoding(ctxt, handler);
3313 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3314 } else {
3315 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3316 }
3317 }
3318
3319 if ((ctxt->input->buf != NULL) &&
3320 (ctxt->input->buf->encoder != NULL) &&
3321 (ctxt->input->buf->raw != NULL) &&
3322 (ctxt->input->buf->buffer != NULL)) {
3323 int nbchars;
3324 int processed;
3325
3326 /*
3327 * convert as much as possible to the parser reading buffer.
3328 */
3329 processed = ctxt->input->cur - ctxt->input->base;
3330 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3331 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3332 ctxt->input->buf->buffer,
3333 ctxt->input->buf->raw);
3334 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003335 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3336 "htmlCheckEncoding: encoder error\n",
3337 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003338 }
3339 ctxt->input->base =
3340 ctxt->input->cur = ctxt->input->buf->buffer->content;
3341 }
3342 }
3343}
3344
3345/**
3346 * htmlCheckMeta:
3347 * @ctxt: an HTML parser context
3348 * @atts: the attributes values
3349 *
3350 * Checks an attributes from a Meta tag
3351 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003352static void
Owen Taylor3473f882001-02-23 17:55:21 +00003353htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3354 int i;
3355 const xmlChar *att, *value;
3356 int http = 0;
3357 const xmlChar *content = NULL;
3358
3359 if ((ctxt == NULL) || (atts == NULL))
3360 return;
3361
3362 i = 0;
3363 att = atts[i++];
3364 while (att != NULL) {
3365 value = atts[i++];
3366 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3367 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3368 http = 1;
3369 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3370 content = value;
3371 att = atts[i++];
3372 }
3373 if ((http) && (content != NULL))
3374 htmlCheckEncoding(ctxt, content);
3375
3376}
3377
3378/**
3379 * htmlParseStartTag:
3380 * @ctxt: an HTML parser context
3381 *
3382 * parse a start of tag either for rule element or
3383 * EmptyElement. In both case we don't parse the tag closing chars.
3384 *
3385 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3386 *
3387 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3388 *
3389 * With namespace:
3390 *
3391 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3392 *
3393 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3394 *
Daniel Veillard597f1c12005-07-03 23:00:18 +00003395 * Returns 0 in case of success and -1 in case of error.
Owen Taylor3473f882001-02-23 17:55:21 +00003396 */
3397
Daniel Veillard597f1c12005-07-03 23:00:18 +00003398static int
Owen Taylor3473f882001-02-23 17:55:21 +00003399htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003400 const xmlChar *name;
3401 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003402 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003403 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003404 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003405 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003406 int meta = 0;
3407 int i;
3408
Daniel Veillarda03e3652004-11-02 18:45:30 +00003409 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3410 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3411 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003412 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003413 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003414 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003415 NEXT;
3416
Daniel Veillard30e76072006-03-09 14:13:55 +00003417 atts = ctxt->atts;
3418 maxatts = ctxt->maxatts;
3419
Owen Taylor3473f882001-02-23 17:55:21 +00003420 GROW;
3421 name = htmlParseHTMLName(ctxt);
3422 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003423 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3424 "htmlParseStartTag: invalid element name\n",
3425 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003426 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003427 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003428 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003429 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003430 }
3431 if (xmlStrEqual(name, BAD_CAST"meta"))
3432 meta = 1;
3433
3434 /*
3435 * Check for auto-closure of HTML elements.
3436 */
3437 htmlAutoClose(ctxt, name);
3438
3439 /*
3440 * Check for implied HTML elements.
3441 */
3442 htmlCheckImplied(ctxt, name);
3443
3444 /*
3445 * Avoid html at any level > 0, head at any level != 1
3446 * or any attempt to recurse body
3447 */
3448 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003449 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3450 "htmlParseStartTag: misplaced <html> tag\n",
3451 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003452 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003453 }
3454 if ((ctxt->nameNr != 1) &&
3455 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003456 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3457 "htmlParseStartTag: misplaced <head> tag\n",
3458 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003459 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003460 }
3461 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003462 int indx;
3463 for (indx = 0;indx < ctxt->nameNr;indx++) {
3464 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003465 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3466 "htmlParseStartTag: misplaced <body> tag\n",
3467 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003468 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3469 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003470 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003471 }
3472 }
3473 }
3474
3475 /*
3476 * Now parse the attributes, it ends up with the ending
3477 *
3478 * (S Attribute)* S?
3479 */
3480 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003481 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003482 (CUR != '>') &&
3483 ((CUR != '/') || (NXT(1) != '>'))) {
3484 long cons = ctxt->nbChars;
3485
3486 GROW;
3487 attname = htmlParseAttribute(ctxt, &attvalue);
3488 if (attname != NULL) {
3489
3490 /*
3491 * Well formedness requires at most one declaration of an attribute
3492 */
3493 for (i = 0; i < nbatts;i += 2) {
3494 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003495 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3496 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003497 if (attvalue != NULL)
3498 xmlFree(attvalue);
3499 goto failed;
3500 }
3501 }
3502
3503 /*
3504 * Add the pair to atts
3505 */
3506 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003507 maxatts = 22; /* allow for 10 attrs by default */
3508 atts = (const xmlChar **)
3509 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003510 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003511 htmlErrMemory(ctxt, NULL);
3512 if (attvalue != NULL)
3513 xmlFree(attvalue);
3514 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003515 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003516 ctxt->atts = atts;
3517 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003518 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003519 const xmlChar **n;
3520
Owen Taylor3473f882001-02-23 17:55:21 +00003521 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003522 n = (const xmlChar **) xmlRealloc((void *) atts,
3523 maxatts * sizeof(const xmlChar *));
3524 if (n == NULL) {
3525 htmlErrMemory(ctxt, NULL);
3526 if (attvalue != NULL)
3527 xmlFree(attvalue);
3528 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003529 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003530 atts = n;
3531 ctxt->atts = atts;
3532 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003533 }
3534 atts[nbatts++] = attname;
3535 atts[nbatts++] = attvalue;
3536 atts[nbatts] = NULL;
3537 atts[nbatts + 1] = NULL;
3538 }
3539 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003540 if (attvalue != NULL)
3541 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003542 /* Dump the bogus attribute string up to the next blank or
3543 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003544 while ((IS_CHAR_CH(CUR)) &&
3545 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003546 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003547 NEXT;
3548 }
3549
3550failed:
3551 SKIP_BLANKS;
3552 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003553 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3554 "htmlParseStartTag: problem parsing attributes\n",
3555 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003556 break;
3557 }
3558 }
3559
3560 /*
3561 * Handle specific association to the META tag
3562 */
3563 if (meta)
3564 htmlCheckMeta(ctxt, atts);
3565
3566 /*
3567 * SAX: Start of Element !
3568 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003569 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003570 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3571 if (nbatts != 0)
3572 ctxt->sax->startElement(ctxt->userData, name, atts);
3573 else
3574 ctxt->sax->startElement(ctxt->userData, name, NULL);
3575 }
Owen Taylor3473f882001-02-23 17:55:21 +00003576
3577 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003578 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003579 if (atts[i] != NULL)
3580 xmlFree((xmlChar *) atts[i]);
3581 }
Owen Taylor3473f882001-02-23 17:55:21 +00003582 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003583
3584 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003585}
3586
3587/**
3588 * htmlParseEndTag:
3589 * @ctxt: an HTML parser context
3590 *
3591 * parse an end of tag
3592 *
3593 * [42] ETag ::= '</' Name S? '>'
3594 *
3595 * With namespace
3596 *
3597 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003598 *
3599 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003600 */
3601
Daniel Veillardf420ac52001-07-04 16:04:09 +00003602static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003603htmlParseEndTag(htmlParserCtxtPtr ctxt)
3604{
3605 const xmlChar *name;
3606 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003607 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003608
3609 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003610 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3611 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003612 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003613 }
3614 SKIP(2);
3615
3616 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003617 if (name == NULL)
3618 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003619
3620 /*
3621 * We should definitely be at the ending "S? '>'" part
3622 */
3623 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003624 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003625 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3626 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003627 if (ctxt->recovery) {
3628 /*
3629 * We're not at the ending > !!
3630 * Error, unless in recover mode where we search forwards
3631 * until we find a >
3632 */
3633 while (CUR != '\0' && CUR != '>') NEXT;
3634 NEXT;
3635 }
Owen Taylor3473f882001-02-23 17:55:21 +00003636 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003637 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003638
3639 /*
3640 * If the name read is not one of the element in the parsing stack
3641 * then return, it's just an error.
3642 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003643 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3644 if (xmlStrEqual(name, ctxt->nameTab[i]))
3645 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003646 }
3647 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003648 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3649 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003650 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003651 }
3652
3653
3654 /*
3655 * Check for auto-closure of HTML elements.
3656 */
3657
3658 htmlAutoCloseOnClose(ctxt, name);
3659
3660 /*
3661 * Well formedness constraints, opening and closing must match.
3662 * With the exception that the autoclose may have popped stuff out
3663 * of the stack.
3664 */
3665 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003666 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003667 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3668 "Opening and ending tag mismatch: %s and %s\n",
3669 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003670 }
3671 }
3672
3673 /*
3674 * SAX: End of Tag
3675 */
3676 oldname = ctxt->name;
3677 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003678 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3679 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003680 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003681 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003682 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003683 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003684 }
3685
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003686 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003687}
3688
3689
3690/**
3691 * htmlParseReference:
3692 * @ctxt: an HTML parser context
3693 *
3694 * parse and handle entity references in content,
3695 * this will end-up in a call to character() since this is either a
3696 * CharRef, or a predefined entity.
3697 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003698static void
Owen Taylor3473f882001-02-23 17:55:21 +00003699htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003700 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003701 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003702 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003703 if (CUR != '&') return;
3704
3705 if (NXT(1) == '#') {
3706 unsigned int c;
3707 int bits, i = 0;
3708
3709 c = htmlParseCharRef(ctxt);
3710 if (c == 0)
3711 return;
3712
3713 if (c < 0x80) { out[i++]= c; bits= -6; }
3714 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3715 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3716 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3717
3718 for ( ; bits >= 0; bits-= 6) {
3719 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3720 }
3721 out[i] = 0;
3722
3723 htmlCheckParagraph(ctxt);
3724 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3725 ctxt->sax->characters(ctxt->userData, out, i);
3726 } else {
3727 ent = htmlParseEntityRef(ctxt, &name);
3728 if (name == NULL) {
3729 htmlCheckParagraph(ctxt);
3730 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3731 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3732 return;
3733 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003734 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003735 htmlCheckParagraph(ctxt);
3736 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3737 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3738 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3739 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3740 }
3741 } else {
3742 unsigned int c;
3743 int bits, i = 0;
3744
3745 c = ent->value;
3746 if (c < 0x80)
3747 { out[i++]= c; bits= -6; }
3748 else if (c < 0x800)
3749 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3750 else if (c < 0x10000)
3751 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3752 else
3753 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3754
3755 for ( ; bits >= 0; bits-= 6) {
3756 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3757 }
3758 out[i] = 0;
3759
3760 htmlCheckParagraph(ctxt);
3761 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3762 ctxt->sax->characters(ctxt->userData, out, i);
3763 }
Owen Taylor3473f882001-02-23 17:55:21 +00003764 }
3765}
3766
3767/**
3768 * htmlParseContent:
3769 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003770 *
3771 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003772 */
3773
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003774static void
Owen Taylor3473f882001-02-23 17:55:21 +00003775htmlParseContent(htmlParserCtxtPtr ctxt) {
3776 xmlChar *currentNode;
3777 int depth;
3778
3779 currentNode = xmlStrdup(ctxt->name);
3780 depth = ctxt->nameNr;
3781 while (1) {
3782 long cons = ctxt->nbChars;
3783
3784 GROW;
3785 /*
3786 * Our tag or one of it's parent or children is ending.
3787 */
3788 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003789 if (htmlParseEndTag(ctxt) &&
3790 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3791 if (currentNode != NULL)
3792 xmlFree(currentNode);
3793 return;
3794 }
3795 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003796 }
3797
3798 /*
3799 * Has this node been popped out during parsing of
3800 * the next element
3801 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003802 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3803 (!xmlStrEqual(currentNode, ctxt->name)))
3804 {
Owen Taylor3473f882001-02-23 17:55:21 +00003805 if (currentNode != NULL) xmlFree(currentNode);
3806 return;
3807 }
3808
Daniel Veillardf9533d12001-03-03 10:04:57 +00003809 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3810 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003811 /*
3812 * Handle SCRIPT/STYLE separately
3813 */
3814 htmlParseScript(ctxt);
3815 } else {
3816 /*
3817 * Sometimes DOCTYPE arrives in the middle of the document
3818 */
3819 if ((CUR == '<') && (NXT(1) == '!') &&
3820 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3821 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3822 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3823 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003824 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3825 "Misplaced DOCTYPE declaration\n",
3826 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003827 htmlParseDocTypeDecl(ctxt);
3828 }
3829
3830 /*
3831 * First case : a comment
3832 */
3833 if ((CUR == '<') && (NXT(1) == '!') &&
3834 (NXT(2) == '-') && (NXT(3) == '-')) {
3835 htmlParseComment(ctxt);
3836 }
3837
3838 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003839 * Second case : a Processing Instruction.
3840 */
3841 else if ((CUR == '<') && (NXT(1) == '?')) {
3842 htmlParsePI(ctxt);
3843 }
3844
3845 /*
3846 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003847 */
3848 else if (CUR == '<') {
3849 htmlParseElement(ctxt);
3850 }
3851
3852 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003853 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003854 * parsing returns it's Name, create the node
3855 */
3856 else if (CUR == '&') {
3857 htmlParseReference(ctxt);
3858 }
3859
3860 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003861 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003862 */
3863 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003864 htmlAutoCloseOnEnd(ctxt);
3865 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003866 }
3867
3868 /*
3869 * Last case, text. Note that References are handled directly.
3870 */
3871 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003872 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003873 }
3874
3875 if (cons == ctxt->nbChars) {
3876 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003877 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3878 "detected an error in element content\n",
3879 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003880 }
3881 break;
3882 }
3883 }
3884 GROW;
3885 }
3886 if (currentNode != NULL) xmlFree(currentNode);
3887}
3888
3889/**
Daniel Veillard499cc922006-01-18 17:22:35 +00003890 * htmlParseContent:
3891 * @ctxt: an HTML parser context
3892 *
3893 * Parse a content: comment, sub-element, reference or text.
3894 */
3895
3896void
3897__htmlParseContent(void *ctxt) {
3898 if (ctxt != NULL)
3899 htmlParseContent((htmlParserCtxtPtr) ctxt);
3900}
3901
3902/**
Owen Taylor3473f882001-02-23 17:55:21 +00003903 * htmlParseElement:
3904 * @ctxt: an HTML parser context
3905 *
3906 * parse an HTML element, this is highly recursive
3907 *
3908 * [39] element ::= EmptyElemTag | STag content ETag
3909 *
3910 * [41] Attribute ::= Name Eq AttValue
3911 */
3912
3913void
3914htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003915 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003916 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003917 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003918 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003919 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003920 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003921 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003922
Daniel Veillarda03e3652004-11-02 18:45:30 +00003923 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3924 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00003925 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003926 return;
3927 }
Owen Taylor3473f882001-02-23 17:55:21 +00003928 /* Capture start position */
3929 if (ctxt->record_info) {
3930 node_info.begin_pos = ctxt->input->consumed +
3931 (CUR_PTR - ctxt->input->base);
3932 node_info.begin_line = ctxt->input->line;
3933 }
3934
Daniel Veillard597f1c12005-07-03 23:00:18 +00003935 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003936 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003937 if (failed || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003938 if (CUR == '>')
3939 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003940 return;
3941 }
Owen Taylor3473f882001-02-23 17:55:21 +00003942
3943 /*
3944 * Lookup the info for that element.
3945 */
3946 info = htmlTagLookup(name);
3947 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003948 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3949 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003950 }
3951
3952 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003953 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003954 */
3955 if ((CUR == '/') && (NXT(1) == '>')) {
3956 SKIP(2);
3957 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3958 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003959 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003960 return;
3961 }
3962
3963 if (CUR == '>') {
3964 NEXT;
3965 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003966 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3967 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003968
3969 /*
3970 * end of parsing of this node.
3971 */
3972 if (xmlStrEqual(name, ctxt->name)) {
3973 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003974 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003975 }
3976
3977 /*
3978 * Capture end position and add node
3979 */
Daniel Veillard30e76072006-03-09 14:13:55 +00003980 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00003981 node_info.end_pos = ctxt->input->consumed +
3982 (CUR_PTR - ctxt->input->base);
3983 node_info.end_line = ctxt->input->line;
3984 node_info.node = ctxt->node;
3985 xmlParserAddNodeInfo(ctxt, &node_info);
3986 }
3987 return;
3988 }
3989
3990 /*
3991 * Check for an Empty Element from DTD definition
3992 */
3993 if ((info != NULL) && (info->empty)) {
3994 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3995 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003996 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003997 return;
3998 }
3999
4000 /*
4001 * Parse the content of the element:
4002 */
4003 currentNode = xmlStrdup(ctxt->name);
4004 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004005 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004006 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004007 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004008 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004009 if (ctxt->nameNr < depth) break;
4010 }
4011
Owen Taylor3473f882001-02-23 17:55:21 +00004012 /*
4013 * Capture end position and add node
4014 */
4015 if ( currentNode != NULL && ctxt->record_info ) {
4016 node_info.end_pos = ctxt->input->consumed +
4017 (CUR_PTR - ctxt->input->base);
4018 node_info.end_line = ctxt->input->line;
4019 node_info.node = ctxt->node;
4020 xmlParserAddNodeInfo(ctxt, &node_info);
4021 }
William M. Brack76e95df2003-10-18 16:20:14 +00004022 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004023 htmlAutoCloseOnEnd(ctxt);
4024 }
4025
Owen Taylor3473f882001-02-23 17:55:21 +00004026 if (currentNode != NULL)
4027 xmlFree(currentNode);
4028}
4029
4030/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004031 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004032 * @ctxt: an HTML parser context
4033 *
4034 * parse an HTML document (and build a tree if using the standard SAX
4035 * interface).
4036 *
4037 * Returns 0, -1 in case of error. the parser context is augmented
4038 * as a result of the parsing.
4039 */
4040
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004041int
Owen Taylor3473f882001-02-23 17:55:21 +00004042htmlParseDocument(htmlParserCtxtPtr ctxt) {
4043 xmlDtdPtr dtd;
4044
Daniel Veillardd0463562001-10-13 09:15:48 +00004045 xmlInitParser();
4046
Owen Taylor3473f882001-02-23 17:55:21 +00004047 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004048
Daniel Veillarda03e3652004-11-02 18:45:30 +00004049 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4050 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4051 "htmlParseDocument: context error\n", NULL, NULL);
4052 return(XML_ERR_INTERNAL_ERROR);
4053 }
4054 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004055 GROW;
4056 /*
4057 * SAX: beginning of the document processing.
4058 */
4059 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4060 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4061
4062 /*
4063 * Wipe out everything which is before the first '<'
4064 */
4065 SKIP_BLANKS;
4066 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004067 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4068 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004069 }
4070
4071 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4072 ctxt->sax->startDocument(ctxt->userData);
4073
4074
4075 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004076 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004077 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004078 while (((CUR == '<') && (NXT(1) == '!') &&
4079 (NXT(2) == '-') && (NXT(3) == '-')) ||
4080 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004081 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004082 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004083 SKIP_BLANKS;
4084 }
4085
4086
4087 /*
4088 * Then possibly doc type declaration(s) and more Misc
4089 * (doctypedecl Misc*)?
4090 */
4091 if ((CUR == '<') && (NXT(1) == '!') &&
4092 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4093 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4094 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4095 (UPP(8) == 'E')) {
4096 htmlParseDocTypeDecl(ctxt);
4097 }
4098 SKIP_BLANKS;
4099
4100 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004101 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004102 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004103 while (((CUR == '<') && (NXT(1) == '!') &&
4104 (NXT(2) == '-') && (NXT(3) == '-')) ||
4105 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004106 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004107 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004108 SKIP_BLANKS;
4109 }
4110
4111 /*
4112 * Time to start parsing the tree itself
4113 */
4114 htmlParseContent(ctxt);
4115
4116 /*
4117 * autoclose
4118 */
4119 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004120 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004121
4122
4123 /*
4124 * SAX: end of the document processing.
4125 */
4126 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4127 ctxt->sax->endDocument(ctxt->userData);
4128
4129 if (ctxt->myDoc != NULL) {
4130 dtd = xmlGetIntSubset(ctxt->myDoc);
4131 if (dtd == NULL)
4132 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004133 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004134 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4135 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4136 }
4137 if (! ctxt->wellFormed) return(-1);
4138 return(0);
4139}
4140
4141
4142/************************************************************************
4143 * *
4144 * Parser contexts handling *
4145 * *
4146 ************************************************************************/
4147
4148/**
William M. Brackedb65a72004-02-06 07:36:04 +00004149 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004150 * @ctxt: an HTML parser context
4151 *
4152 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004153 *
4154 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004155 */
4156
Daniel Veillardf403d292003-10-05 13:51:35 +00004157static int
Owen Taylor3473f882001-02-23 17:55:21 +00004158htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4159{
4160 htmlSAXHandler *sax;
4161
Daniel Veillardf403d292003-10-05 13:51:35 +00004162 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004163 memset(ctxt, 0, sizeof(htmlParserCtxt));
4164
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004165 ctxt->dict = xmlDictCreate();
4166 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004167 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4168 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004169 }
Owen Taylor3473f882001-02-23 17:55:21 +00004170 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4171 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004172 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4173 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004174 }
4175 else
4176 memset(sax, 0, sizeof(htmlSAXHandler));
4177
4178 /* Allocate the Input stack */
4179 ctxt->inputTab = (htmlParserInputPtr *)
4180 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4181 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004182 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004183 ctxt->inputNr = 0;
4184 ctxt->inputMax = 0;
4185 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004186 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004187 }
4188 ctxt->inputNr = 0;
4189 ctxt->inputMax = 5;
4190 ctxt->input = NULL;
4191 ctxt->version = NULL;
4192 ctxt->encoding = NULL;
4193 ctxt->standalone = -1;
4194 ctxt->instate = XML_PARSER_START;
4195
4196 /* Allocate the Node stack */
4197 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4198 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004199 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004200 ctxt->nodeNr = 0;
4201 ctxt->nodeMax = 0;
4202 ctxt->node = NULL;
4203 ctxt->inputNr = 0;
4204 ctxt->inputMax = 0;
4205 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004206 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004207 }
4208 ctxt->nodeNr = 0;
4209 ctxt->nodeMax = 10;
4210 ctxt->node = NULL;
4211
4212 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004213 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004214 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004215 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004216 ctxt->nameNr = 0;
4217 ctxt->nameMax = 10;
4218 ctxt->name = NULL;
4219 ctxt->nodeNr = 0;
4220 ctxt->nodeMax = 0;
4221 ctxt->node = NULL;
4222 ctxt->inputNr = 0;
4223 ctxt->inputMax = 0;
4224 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004225 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004226 }
4227 ctxt->nameNr = 0;
4228 ctxt->nameMax = 10;
4229 ctxt->name = NULL;
4230
Daniel Veillard092643b2003-09-25 14:29:29 +00004231 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004232 else {
4233 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004234 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004235 }
4236 ctxt->userData = ctxt;
4237 ctxt->myDoc = NULL;
4238 ctxt->wellFormed = 1;
4239 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004240 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004241 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004242 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004243 ctxt->vctxt.userData = ctxt;
4244 ctxt->vctxt.error = xmlParserValidityError;
4245 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004246 ctxt->record_info = 0;
4247 ctxt->validate = 0;
4248 ctxt->nbChars = 0;
4249 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004250 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004251 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004252 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004253}
4254
4255/**
4256 * htmlFreeParserCtxt:
4257 * @ctxt: an HTML parser context
4258 *
4259 * Free all the memory used by a parser context. However the parsed
4260 * document in ctxt->myDoc is not freed.
4261 */
4262
4263void
4264htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4265{
4266 xmlFreeParserCtxt(ctxt);
4267}
4268
4269/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004270 * htmlNewParserCtxt:
4271 *
4272 * Allocate and initialize a new parser context.
4273 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004274 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004275 */
4276
Daniel Veillard34c647c2006-09-21 06:53:59 +00004277htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004278htmlNewParserCtxt(void)
4279{
4280 xmlParserCtxtPtr ctxt;
4281
4282 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4283 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004284 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004285 return(NULL);
4286 }
4287 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004288 if (htmlInitParserCtxt(ctxt) < 0) {
4289 htmlFreeParserCtxt(ctxt);
4290 return(NULL);
4291 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004292 return(ctxt);
4293}
4294
4295/**
4296 * htmlCreateMemoryParserCtxt:
4297 * @buffer: a pointer to a char array
4298 * @size: the size of the array
4299 *
4300 * Create a parser context for an HTML in-memory document.
4301 *
4302 * Returns the new parser context or NULL
4303 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004304htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004305htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4306 xmlParserCtxtPtr ctxt;
4307 xmlParserInputPtr input;
4308 xmlParserInputBufferPtr buf;
4309
4310 if (buffer == NULL)
4311 return(NULL);
4312 if (size <= 0)
4313 return(NULL);
4314
4315 ctxt = htmlNewParserCtxt();
4316 if (ctxt == NULL)
4317 return(NULL);
4318
4319 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4320 if (buf == NULL) return(NULL);
4321
4322 input = xmlNewInputStream(ctxt);
4323 if (input == NULL) {
4324 xmlFreeParserCtxt(ctxt);
4325 return(NULL);
4326 }
4327
4328 input->filename = NULL;
4329 input->buf = buf;
4330 input->base = input->buf->buffer->content;
4331 input->cur = input->buf->buffer->content;
4332 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4333
4334 inputPush(ctxt, input);
4335 return(ctxt);
4336}
4337
4338/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004339 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004340 * @cur: a pointer to an array of xmlChar
4341 * @encoding: a free form C string describing the HTML document encoding, or NULL
4342 *
4343 * Create a parser context for an HTML document.
4344 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004345 * TODO: check the need to add encoding handling there
4346 *
Owen Taylor3473f882001-02-23 17:55:21 +00004347 * Returns the new parser context or NULL
4348 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004349static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004350htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004351 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004352 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004353
Daniel Veillard1d995272002-07-22 16:43:32 +00004354 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004355 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004356 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004357 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4358
4359 if (encoding != NULL) {
4360 xmlCharEncoding enc;
4361 xmlCharEncodingHandlerPtr handler;
4362
4363 if (ctxt->input->encoding != NULL)
4364 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004365 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004366
4367 enc = xmlParseCharEncoding(encoding);
4368 /*
4369 * registered set of known encodings
4370 */
4371 if (enc != XML_CHAR_ENCODING_ERROR) {
4372 xmlSwitchEncoding(ctxt, enc);
4373 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004374 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4375 "Unsupported encoding %s\n",
4376 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004377 }
4378 } else {
4379 /*
4380 * fallback for unknown encodings
4381 */
4382 handler = xmlFindCharEncodingHandler((const char *) encoding);
4383 if (handler != NULL) {
4384 xmlSwitchToEncoding(ctxt, handler);
4385 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004386 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4387 "Unsupported encoding %s\n",
4388 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004389 }
4390 }
4391 }
4392 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004393}
4394
Daniel Veillard73b013f2003-09-30 12:36:01 +00004395#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004396/************************************************************************
4397 * *
4398 * Progressive parsing interfaces *
4399 * *
4400 ************************************************************************/
4401
4402/**
4403 * htmlParseLookupSequence:
4404 * @ctxt: an HTML parser context
4405 * @first: the first char to lookup
4406 * @next: the next char to lookup or zero
4407 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004408 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004409 *
4410 * Try to find if a sequence (first, next, third) or just (first next) or
4411 * (first) is available in the input stream.
4412 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4413 * to avoid rescanning sequences of bytes, it DOES change the state of the
4414 * parser, do not use liberally.
4415 * This is basically similar to xmlParseLookupSequence()
4416 *
4417 * Returns the index to the current parsing point if the full sequence
4418 * is available, -1 otherwise.
4419 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004420static int
Owen Taylor3473f882001-02-23 17:55:21 +00004421htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004422 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004423 int base, len;
4424 htmlParserInputPtr in;
4425 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004426 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004427
4428 in = ctxt->input;
4429 if (in == NULL) return(-1);
4430 base = in->cur - in->base;
4431 if (base < 0) return(-1);
4432 if (ctxt->checkIndex > base)
4433 base = ctxt->checkIndex;
4434 if (in->buf == NULL) {
4435 buf = in->base;
4436 len = in->length;
4437 } else {
4438 buf = in->buf->buffer->content;
4439 len = in->buf->buffer->use;
4440 }
4441 /* take into account the sequence length */
4442 if (third) len -= 2;
4443 else if (next) len --;
4444 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004445 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004446 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4447 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4448 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004449 /* do not increment past <! - some people use <!--> */
4450 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004451 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004452 }
4453 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004454 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004455 return(-1);
4456 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4457 (buf[base + 2] == '>')) {
4458 incomment = 0;
4459 base += 2;
4460 }
4461 continue;
4462 }
Owen Taylor3473f882001-02-23 17:55:21 +00004463 if (buf[base] == first) {
4464 if (third != 0) {
4465 if ((buf[base + 1] != next) ||
4466 (buf[base + 2] != third)) continue;
4467 } else if (next != 0) {
4468 if (buf[base + 1] != next) continue;
4469 }
4470 ctxt->checkIndex = 0;
4471#ifdef DEBUG_PUSH
4472 if (next == 0)
4473 xmlGenericError(xmlGenericErrorContext,
4474 "HPP: lookup '%c' found at %d\n",
4475 first, base);
4476 else if (third == 0)
4477 xmlGenericError(xmlGenericErrorContext,
4478 "HPP: lookup '%c%c' found at %d\n",
4479 first, next, base);
4480 else
4481 xmlGenericError(xmlGenericErrorContext,
4482 "HPP: lookup '%c%c%c' found at %d\n",
4483 first, next, third, base);
4484#endif
4485 return(base - (in->cur - in->base));
4486 }
4487 }
4488 ctxt->checkIndex = base;
4489#ifdef DEBUG_PUSH
4490 if (next == 0)
4491 xmlGenericError(xmlGenericErrorContext,
4492 "HPP: lookup '%c' failed\n", first);
4493 else if (third == 0)
4494 xmlGenericError(xmlGenericErrorContext,
4495 "HPP: lookup '%c%c' failed\n", first, next);
4496 else
4497 xmlGenericError(xmlGenericErrorContext,
4498 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4499#endif
4500 return(-1);
4501}
4502
4503/**
4504 * htmlParseTryOrFinish:
4505 * @ctxt: an HTML parser context
4506 * @terminate: last chunk indicator
4507 *
4508 * Try to progress on parsing
4509 *
4510 * Returns zero if no parsing was possible
4511 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004512static int
Owen Taylor3473f882001-02-23 17:55:21 +00004513htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4514 int ret = 0;
4515 htmlParserInputPtr in;
4516 int avail = 0;
4517 xmlChar cur, next;
4518
4519#ifdef DEBUG_PUSH
4520 switch (ctxt->instate) {
4521 case XML_PARSER_EOF:
4522 xmlGenericError(xmlGenericErrorContext,
4523 "HPP: try EOF\n"); break;
4524 case XML_PARSER_START:
4525 xmlGenericError(xmlGenericErrorContext,
4526 "HPP: try START\n"); break;
4527 case XML_PARSER_MISC:
4528 xmlGenericError(xmlGenericErrorContext,
4529 "HPP: try MISC\n");break;
4530 case XML_PARSER_COMMENT:
4531 xmlGenericError(xmlGenericErrorContext,
4532 "HPP: try COMMENT\n");break;
4533 case XML_PARSER_PROLOG:
4534 xmlGenericError(xmlGenericErrorContext,
4535 "HPP: try PROLOG\n");break;
4536 case XML_PARSER_START_TAG:
4537 xmlGenericError(xmlGenericErrorContext,
4538 "HPP: try START_TAG\n");break;
4539 case XML_PARSER_CONTENT:
4540 xmlGenericError(xmlGenericErrorContext,
4541 "HPP: try CONTENT\n");break;
4542 case XML_PARSER_CDATA_SECTION:
4543 xmlGenericError(xmlGenericErrorContext,
4544 "HPP: try CDATA_SECTION\n");break;
4545 case XML_PARSER_END_TAG:
4546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: try END_TAG\n");break;
4548 case XML_PARSER_ENTITY_DECL:
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: try ENTITY_DECL\n");break;
4551 case XML_PARSER_ENTITY_VALUE:
4552 xmlGenericError(xmlGenericErrorContext,
4553 "HPP: try ENTITY_VALUE\n");break;
4554 case XML_PARSER_ATTRIBUTE_VALUE:
4555 xmlGenericError(xmlGenericErrorContext,
4556 "HPP: try ATTRIBUTE_VALUE\n");break;
4557 case XML_PARSER_DTD:
4558 xmlGenericError(xmlGenericErrorContext,
4559 "HPP: try DTD\n");break;
4560 case XML_PARSER_EPILOG:
4561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: try EPILOG\n");break;
4563 case XML_PARSER_PI:
4564 xmlGenericError(xmlGenericErrorContext,
4565 "HPP: try PI\n");break;
4566 case XML_PARSER_SYSTEM_LITERAL:
4567 xmlGenericError(xmlGenericErrorContext,
4568 "HPP: try SYSTEM_LITERAL\n");break;
4569 }
4570#endif
4571
4572 while (1) {
4573
4574 in = ctxt->input;
4575 if (in == NULL) break;
4576 if (in->buf == NULL)
4577 avail = in->length - (in->cur - in->base);
4578 else
4579 avail = in->buf->buffer->use - (in->cur - in->base);
4580 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004581 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004582 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4583 /*
4584 * SAX: end of the document processing.
4585 */
4586 ctxt->instate = XML_PARSER_EOF;
4587 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4588 ctxt->sax->endDocument(ctxt->userData);
4589 }
4590 }
4591 if (avail < 1)
4592 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004593 cur = in->cur[0];
4594 if (cur == 0) {
4595 SKIP(1);
4596 continue;
4597 }
4598
Owen Taylor3473f882001-02-23 17:55:21 +00004599 switch (ctxt->instate) {
4600 case XML_PARSER_EOF:
4601 /*
4602 * Document parsing is done !
4603 */
4604 goto done;
4605 case XML_PARSER_START:
4606 /*
4607 * Very first chars read from the document flow.
4608 */
4609 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004610 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004611 SKIP_BLANKS;
4612 if (in->buf == NULL)
4613 avail = in->length - (in->cur - in->base);
4614 else
4615 avail = in->buf->buffer->use - (in->cur - in->base);
4616 }
4617 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4618 ctxt->sax->setDocumentLocator(ctxt->userData,
4619 &xmlDefaultSAXLocator);
4620 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4621 (!ctxt->disableSAX))
4622 ctxt->sax->startDocument(ctxt->userData);
4623
4624 cur = in->cur[0];
4625 next = in->cur[1];
4626 if ((cur == '<') && (next == '!') &&
4627 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4628 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4629 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4630 (UPP(8) == 'E')) {
4631 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004632 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004633 goto done;
4634#ifdef DEBUG_PUSH
4635 xmlGenericError(xmlGenericErrorContext,
4636 "HPP: Parsing internal subset\n");
4637#endif
4638 htmlParseDocTypeDecl(ctxt);
4639 ctxt->instate = XML_PARSER_PROLOG;
4640#ifdef DEBUG_PUSH
4641 xmlGenericError(xmlGenericErrorContext,
4642 "HPP: entering PROLOG\n");
4643#endif
4644 } else {
4645 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004646#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004647 xmlGenericError(xmlGenericErrorContext,
4648 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004649#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004650 }
Owen Taylor3473f882001-02-23 17:55:21 +00004651 break;
4652 case XML_PARSER_MISC:
4653 SKIP_BLANKS;
4654 if (in->buf == NULL)
4655 avail = in->length - (in->cur - in->base);
4656 else
4657 avail = in->buf->buffer->use - (in->cur - in->base);
4658 if (avail < 2)
4659 goto done;
4660 cur = in->cur[0];
4661 next = in->cur[1];
4662 if ((cur == '<') && (next == '!') &&
4663 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4664 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004665 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004666 goto done;
4667#ifdef DEBUG_PUSH
4668 xmlGenericError(xmlGenericErrorContext,
4669 "HPP: Parsing Comment\n");
4670#endif
4671 htmlParseComment(ctxt);
4672 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004673 } else if ((cur == '<') && (next == '?')) {
4674 if ((!terminate) &&
4675 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4676 goto done;
4677#ifdef DEBUG_PUSH
4678 xmlGenericError(xmlGenericErrorContext,
4679 "HPP: Parsing PI\n");
4680#endif
4681 htmlParsePI(ctxt);
4682 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004683 } else if ((cur == '<') && (next == '!') &&
4684 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4685 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4686 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4687 (UPP(8) == 'E')) {
4688 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004689 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004690 goto done;
4691#ifdef DEBUG_PUSH
4692 xmlGenericError(xmlGenericErrorContext,
4693 "HPP: Parsing internal subset\n");
4694#endif
4695 htmlParseDocTypeDecl(ctxt);
4696 ctxt->instate = XML_PARSER_PROLOG;
4697#ifdef DEBUG_PUSH
4698 xmlGenericError(xmlGenericErrorContext,
4699 "HPP: entering PROLOG\n");
4700#endif
4701 } else if ((cur == '<') && (next == '!') &&
4702 (avail < 9)) {
4703 goto done;
4704 } else {
4705 ctxt->instate = XML_PARSER_START_TAG;
4706#ifdef DEBUG_PUSH
4707 xmlGenericError(xmlGenericErrorContext,
4708 "HPP: entering START_TAG\n");
4709#endif
4710 }
4711 break;
4712 case XML_PARSER_PROLOG:
4713 SKIP_BLANKS;
4714 if (in->buf == NULL)
4715 avail = in->length - (in->cur - in->base);
4716 else
4717 avail = in->buf->buffer->use - (in->cur - in->base);
4718 if (avail < 2)
4719 goto done;
4720 cur = in->cur[0];
4721 next = in->cur[1];
4722 if ((cur == '<') && (next == '!') &&
4723 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4724 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004725 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004726 goto done;
4727#ifdef DEBUG_PUSH
4728 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: Parsing Comment\n");
4730#endif
4731 htmlParseComment(ctxt);
4732 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004733 } else if ((cur == '<') && (next == '?')) {
4734 if ((!terminate) &&
4735 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4736 goto done;
4737#ifdef DEBUG_PUSH
4738 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: Parsing PI\n");
4740#endif
4741 htmlParsePI(ctxt);
4742 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004743 } else if ((cur == '<') && (next == '!') &&
4744 (avail < 4)) {
4745 goto done;
4746 } else {
4747 ctxt->instate = XML_PARSER_START_TAG;
4748#ifdef DEBUG_PUSH
4749 xmlGenericError(xmlGenericErrorContext,
4750 "HPP: entering START_TAG\n");
4751#endif
4752 }
4753 break;
4754 case XML_PARSER_EPILOG:
4755 if (in->buf == NULL)
4756 avail = in->length - (in->cur - in->base);
4757 else
4758 avail = in->buf->buffer->use - (in->cur - in->base);
4759 if (avail < 1)
4760 goto done;
4761 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004762 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004763 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004764 goto done;
4765 }
4766 if (avail < 2)
4767 goto done;
4768 next = in->cur[1];
4769 if ((cur == '<') && (next == '!') &&
4770 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4771 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004772 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004773 goto done;
4774#ifdef DEBUG_PUSH
4775 xmlGenericError(xmlGenericErrorContext,
4776 "HPP: Parsing Comment\n");
4777#endif
4778 htmlParseComment(ctxt);
4779 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004780 } else if ((cur == '<') && (next == '?')) {
4781 if ((!terminate) &&
4782 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4783 goto done;
4784#ifdef DEBUG_PUSH
4785 xmlGenericError(xmlGenericErrorContext,
4786 "HPP: Parsing PI\n");
4787#endif
4788 htmlParsePI(ctxt);
4789 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004790 } else if ((cur == '<') && (next == '!') &&
4791 (avail < 4)) {
4792 goto done;
4793 } else {
4794 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004795 ctxt->wellFormed = 0;
4796 ctxt->instate = XML_PARSER_EOF;
4797#ifdef DEBUG_PUSH
4798 xmlGenericError(xmlGenericErrorContext,
4799 "HPP: entering EOF\n");
4800#endif
4801 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4802 ctxt->sax->endDocument(ctxt->userData);
4803 goto done;
4804 }
4805 break;
4806 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004807 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004808 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00004809 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004810
4811 if (avail < 2)
4812 goto done;
4813 cur = in->cur[0];
4814 if (cur != '<') {
4815 ctxt->instate = XML_PARSER_CONTENT;
4816#ifdef DEBUG_PUSH
4817 xmlGenericError(xmlGenericErrorContext,
4818 "HPP: entering CONTENT\n");
4819#endif
4820 break;
4821 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004822 if (in->cur[1] == '/') {
4823 ctxt->instate = XML_PARSER_END_TAG;
4824 ctxt->checkIndex = 0;
4825#ifdef DEBUG_PUSH
4826 xmlGenericError(xmlGenericErrorContext,
4827 "HPP: entering END_TAG\n");
4828#endif
4829 break;
4830 }
Owen Taylor3473f882001-02-23 17:55:21 +00004831 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004832 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004833 goto done;
4834
Daniel Veillard597f1c12005-07-03 23:00:18 +00004835 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004836 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004837 if (failed ||
Owen Taylor3473f882001-02-23 17:55:21 +00004838 (name == NULL)) {
4839 if (CUR == '>')
4840 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004841 break;
4842 }
Owen Taylor3473f882001-02-23 17:55:21 +00004843
4844 /*
4845 * Lookup the info for that element.
4846 */
4847 info = htmlTagLookup(name);
4848 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004849 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4850 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004851 }
4852
4853 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004854 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004855 */
4856 if ((CUR == '/') && (NXT(1) == '>')) {
4857 SKIP(2);
4858 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4859 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004860 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004861 ctxt->instate = XML_PARSER_CONTENT;
4862#ifdef DEBUG_PUSH
4863 xmlGenericError(xmlGenericErrorContext,
4864 "HPP: entering CONTENT\n");
4865#endif
4866 break;
4867 }
4868
4869 if (CUR == '>') {
4870 NEXT;
4871 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004872 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4873 "Couldn't find end of Start Tag %s\n",
4874 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004875
4876 /*
4877 * end of parsing of this node.
4878 */
4879 if (xmlStrEqual(name, ctxt->name)) {
4880 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004881 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004882 }
4883
4884 ctxt->instate = XML_PARSER_CONTENT;
4885#ifdef DEBUG_PUSH
4886 xmlGenericError(xmlGenericErrorContext,
4887 "HPP: entering CONTENT\n");
4888#endif
4889 break;
4890 }
4891
4892 /*
4893 * Check for an Empty Element from DTD definition
4894 */
4895 if ((info != NULL) && (info->empty)) {
4896 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4897 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004898 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004899 }
4900 ctxt->instate = XML_PARSER_CONTENT;
4901#ifdef DEBUG_PUSH
4902 xmlGenericError(xmlGenericErrorContext,
4903 "HPP: entering CONTENT\n");
4904#endif
4905 break;
4906 }
4907 case XML_PARSER_CONTENT: {
4908 long cons;
4909 /*
4910 * Handle preparsed entities and charRef
4911 */
4912 if (ctxt->token != 0) {
4913 xmlChar chr[2] = { 0 , 0 } ;
4914
4915 chr[0] = (xmlChar) ctxt->token;
4916 htmlCheckParagraph(ctxt);
4917 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4918 ctxt->sax->characters(ctxt->userData, chr, 1);
4919 ctxt->token = 0;
4920 ctxt->checkIndex = 0;
4921 }
4922 if ((avail == 1) && (terminate)) {
4923 cur = in->cur[0];
4924 if ((cur != '<') && (cur != '&')) {
4925 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004926 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004927 if (ctxt->sax->ignorableWhitespace != NULL)
4928 ctxt->sax->ignorableWhitespace(
4929 ctxt->userData, &cur, 1);
4930 } else {
4931 htmlCheckParagraph(ctxt);
4932 if (ctxt->sax->characters != NULL)
4933 ctxt->sax->characters(
4934 ctxt->userData, &cur, 1);
4935 }
4936 }
4937 ctxt->token = 0;
4938 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004939 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004940 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004941 }
Owen Taylor3473f882001-02-23 17:55:21 +00004942 }
4943 if (avail < 2)
4944 goto done;
4945 cur = in->cur[0];
4946 next = in->cur[1];
4947 cons = ctxt->nbChars;
4948 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4949 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4950 /*
4951 * Handle SCRIPT/STYLE separately
4952 */
Daniel Veillard68716a72006-10-16 09:32:17 +00004953 if (!terminate) {
4954 int idx;
4955 xmlChar val;
4956
4957 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
4958 if (idx < 0)
4959 goto done;
4960 val = in->cur[idx + 2];
4961 if (val == 0) /* bad cut of input */
4962 goto done;
4963 }
Owen Taylor3473f882001-02-23 17:55:21 +00004964 htmlParseScript(ctxt);
4965 if ((cur == '<') && (next == '/')) {
4966 ctxt->instate = XML_PARSER_END_TAG;
4967 ctxt->checkIndex = 0;
4968#ifdef DEBUG_PUSH
4969 xmlGenericError(xmlGenericErrorContext,
4970 "HPP: entering END_TAG\n");
4971#endif
4972 break;
4973 }
4974 } else {
4975 /*
4976 * Sometimes DOCTYPE arrives in the middle of the document
4977 */
4978 if ((cur == '<') && (next == '!') &&
4979 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4980 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4981 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4982 (UPP(8) == 'E')) {
4983 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004984 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004985 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004986 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4987 "Misplaced DOCTYPE declaration\n",
4988 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004989 htmlParseDocTypeDecl(ctxt);
4990 } else if ((cur == '<') && (next == '!') &&
4991 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4992 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004993 (htmlParseLookupSequence(
4994 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004995 goto done;
4996#ifdef DEBUG_PUSH
4997 xmlGenericError(xmlGenericErrorContext,
4998 "HPP: Parsing Comment\n");
4999#endif
5000 htmlParseComment(ctxt);
5001 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005002 } else if ((cur == '<') && (next == '?')) {
5003 if ((!terminate) &&
5004 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5005 goto done;
5006#ifdef DEBUG_PUSH
5007 xmlGenericError(xmlGenericErrorContext,
5008 "HPP: Parsing PI\n");
5009#endif
5010 htmlParsePI(ctxt);
5011 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005012 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5013 goto done;
5014 } else if ((cur == '<') && (next == '/')) {
5015 ctxt->instate = XML_PARSER_END_TAG;
5016 ctxt->checkIndex = 0;
5017#ifdef DEBUG_PUSH
5018 xmlGenericError(xmlGenericErrorContext,
5019 "HPP: entering END_TAG\n");
5020#endif
5021 break;
5022 } else if (cur == '<') {
5023 ctxt->instate = XML_PARSER_START_TAG;
5024 ctxt->checkIndex = 0;
5025#ifdef DEBUG_PUSH
5026 xmlGenericError(xmlGenericErrorContext,
5027 "HPP: entering START_TAG\n");
5028#endif
5029 break;
5030 } else if (cur == '&') {
5031 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005032 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005033 goto done;
5034#ifdef DEBUG_PUSH
5035 xmlGenericError(xmlGenericErrorContext,
5036 "HPP: Parsing Reference\n");
5037#endif
5038 /* TODO: check generation of subtrees if noent !!! */
5039 htmlParseReference(ctxt);
5040 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005041 /*
5042 * check that the text sequence is complete
5043 * before handing out the data to the parser
5044 * to avoid problems with erroneous end of
5045 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005046 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005047 if ((!terminate) &&
5048 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5049 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005050 ctxt->checkIndex = 0;
5051#ifdef DEBUG_PUSH
5052 xmlGenericError(xmlGenericErrorContext,
5053 "HPP: Parsing char data\n");
5054#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005055 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005056 }
5057 }
5058 if (cons == ctxt->nbChars) {
5059 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005060 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5061 "detected an error in element content\n",
5062 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005063 }
5064 NEXT;
5065 break;
5066 }
5067
5068 break;
5069 }
5070 case XML_PARSER_END_TAG:
5071 if (avail < 2)
5072 goto done;
5073 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005074 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005075 goto done;
5076 htmlParseEndTag(ctxt);
5077 if (ctxt->nameNr == 0) {
5078 ctxt->instate = XML_PARSER_EPILOG;
5079 } else {
5080 ctxt->instate = XML_PARSER_CONTENT;
5081 }
5082 ctxt->checkIndex = 0;
5083#ifdef DEBUG_PUSH
5084 xmlGenericError(xmlGenericErrorContext,
5085 "HPP: entering CONTENT\n");
5086#endif
5087 break;
5088 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005089 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5090 "HPP: internal error, state == CDATA\n",
5091 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005092 ctxt->instate = XML_PARSER_CONTENT;
5093 ctxt->checkIndex = 0;
5094#ifdef DEBUG_PUSH
5095 xmlGenericError(xmlGenericErrorContext,
5096 "HPP: entering CONTENT\n");
5097#endif
5098 break;
5099 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005100 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5101 "HPP: internal error, state == DTD\n",
5102 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005103 ctxt->instate = XML_PARSER_CONTENT;
5104 ctxt->checkIndex = 0;
5105#ifdef DEBUG_PUSH
5106 xmlGenericError(xmlGenericErrorContext,
5107 "HPP: entering CONTENT\n");
5108#endif
5109 break;
5110 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005111 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5112 "HPP: internal error, state == COMMENT\n",
5113 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005114 ctxt->instate = XML_PARSER_CONTENT;
5115 ctxt->checkIndex = 0;
5116#ifdef DEBUG_PUSH
5117 xmlGenericError(xmlGenericErrorContext,
5118 "HPP: entering CONTENT\n");
5119#endif
5120 break;
5121 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005122 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5123 "HPP: internal error, state == PI\n",
5124 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005125 ctxt->instate = XML_PARSER_CONTENT;
5126 ctxt->checkIndex = 0;
5127#ifdef DEBUG_PUSH
5128 xmlGenericError(xmlGenericErrorContext,
5129 "HPP: entering CONTENT\n");
5130#endif
5131 break;
5132 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005133 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5134 "HPP: internal error, state == ENTITY_DECL\n",
5135 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005136 ctxt->instate = XML_PARSER_CONTENT;
5137 ctxt->checkIndex = 0;
5138#ifdef DEBUG_PUSH
5139 xmlGenericError(xmlGenericErrorContext,
5140 "HPP: entering CONTENT\n");
5141#endif
5142 break;
5143 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005144 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5145 "HPP: internal error, state == ENTITY_VALUE\n",
5146 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005147 ctxt->instate = XML_PARSER_CONTENT;
5148 ctxt->checkIndex = 0;
5149#ifdef DEBUG_PUSH
5150 xmlGenericError(xmlGenericErrorContext,
5151 "HPP: entering DTD\n");
5152#endif
5153 break;
5154 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005155 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5156 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5157 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005158 ctxt->instate = XML_PARSER_START_TAG;
5159 ctxt->checkIndex = 0;
5160#ifdef DEBUG_PUSH
5161 xmlGenericError(xmlGenericErrorContext,
5162 "HPP: entering START_TAG\n");
5163#endif
5164 break;
5165 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005166 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5167 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5168 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005169 ctxt->instate = XML_PARSER_CONTENT;
5170 ctxt->checkIndex = 0;
5171#ifdef DEBUG_PUSH
5172 xmlGenericError(xmlGenericErrorContext,
5173 "HPP: entering CONTENT\n");
5174#endif
5175 break;
5176 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005177 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5178 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5179 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005180 ctxt->instate = XML_PARSER_CONTENT;
5181 ctxt->checkIndex = 0;
5182#ifdef DEBUG_PUSH
5183 xmlGenericError(xmlGenericErrorContext,
5184 "HPP: entering CONTENT\n");
5185#endif
5186 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005187 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005188 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5189 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5190 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005191 ctxt->instate = XML_PARSER_CONTENT;
5192 ctxt->checkIndex = 0;
5193#ifdef DEBUG_PUSH
5194 xmlGenericError(xmlGenericErrorContext,
5195 "HPP: entering CONTENT\n");
5196#endif
5197 break;
5198
Owen Taylor3473f882001-02-23 17:55:21 +00005199 }
5200 }
5201done:
5202 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005203 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005204 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5205 /*
5206 * SAX: end of the document processing.
5207 */
5208 ctxt->instate = XML_PARSER_EOF;
5209 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5210 ctxt->sax->endDocument(ctxt->userData);
5211 }
5212 }
5213 if ((ctxt->myDoc != NULL) &&
5214 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5215 (ctxt->instate == XML_PARSER_EPILOG))) {
5216 xmlDtdPtr dtd;
5217 dtd = xmlGetIntSubset(ctxt->myDoc);
5218 if (dtd == NULL)
5219 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005220 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005221 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5222 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5223 }
5224#ifdef DEBUG_PUSH
5225 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5226#endif
5227 return(ret);
5228}
5229
5230/**
Owen Taylor3473f882001-02-23 17:55:21 +00005231 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005232 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005233 * @chunk: an char array
5234 * @size: the size in byte of the chunk
5235 * @terminate: last chunk indicator
5236 *
5237 * Parse a Chunk of memory
5238 *
5239 * Returns zero if no error, the xmlParserErrors otherwise.
5240 */
5241int
5242htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5243 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005244 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5245 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5246 "htmlParseChunk: context error\n", NULL, NULL);
5247 return(XML_ERR_INTERNAL_ERROR);
5248 }
Owen Taylor3473f882001-02-23 17:55:21 +00005249 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5250 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5251 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5252 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005253 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005254
Daniel Veillardd2755a82005-08-07 23:42:39 +00005255 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5256 if (res < 0) {
5257 ctxt->errNo = XML_PARSER_EOF;
5258 ctxt->disableSAX = 1;
5259 return (XML_PARSER_EOF);
5260 }
Owen Taylor3473f882001-02-23 17:55:21 +00005261 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5262 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005263 ctxt->input->end =
5264 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005265#ifdef DEBUG_PUSH
5266 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5267#endif
5268
Daniel Veillard14f752c2003-08-09 11:44:50 +00005269#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005270 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5271 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005272#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005273 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005274 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5275 xmlParserInputBufferPtr in = ctxt->input->buf;
5276 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5277 (in->raw != NULL)) {
5278 int nbchars;
5279
5280 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5281 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005282 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5283 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005284 return(XML_ERR_INVALID_ENCODING);
5285 }
5286 }
5287 }
Owen Taylor3473f882001-02-23 17:55:21 +00005288 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005289 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005290 if (terminate) {
5291 if ((ctxt->instate != XML_PARSER_EOF) &&
5292 (ctxt->instate != XML_PARSER_EPILOG) &&
5293 (ctxt->instate != XML_PARSER_MISC)) {
5294 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005295 ctxt->wellFormed = 0;
5296 }
5297 if (ctxt->instate != XML_PARSER_EOF) {
5298 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5299 ctxt->sax->endDocument(ctxt->userData);
5300 }
5301 ctxt->instate = XML_PARSER_EOF;
5302 }
5303 return((xmlParserErrors) ctxt->errNo);
5304}
5305
5306/************************************************************************
5307 * *
5308 * User entry points *
5309 * *
5310 ************************************************************************/
5311
5312/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005313 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005314 * @sax: a SAX handler
5315 * @user_data: The user data returned on SAX callbacks
5316 * @chunk: a pointer to an array of chars
5317 * @size: number of chars in the array
5318 * @filename: an optional file name or URI
5319 * @enc: an optional encoding
5320 *
5321 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005322 * The value of @filename is used for fetching external entities
5323 * and error/warning reports.
5324 *
5325 * Returns the new parser context or NULL
5326 */
5327htmlParserCtxtPtr
5328htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5329 const char *chunk, int size, const char *filename,
5330 xmlCharEncoding enc) {
5331 htmlParserCtxtPtr ctxt;
5332 htmlParserInputPtr inputStream;
5333 xmlParserInputBufferPtr buf;
5334
Daniel Veillardd0463562001-10-13 09:15:48 +00005335 xmlInitParser();
5336
Owen Taylor3473f882001-02-23 17:55:21 +00005337 buf = xmlAllocParserInputBuffer(enc);
5338 if (buf == NULL) return(NULL);
5339
Daniel Veillardf403d292003-10-05 13:51:35 +00005340 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005341 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005342 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005343 return(NULL);
5344 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005345 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5346 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005347 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005348 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005349 xmlFree(ctxt->sax);
5350 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5351 if (ctxt->sax == NULL) {
5352 xmlFree(buf);
5353 xmlFree(ctxt);
5354 return(NULL);
5355 }
5356 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5357 if (user_data != NULL)
5358 ctxt->userData = user_data;
5359 }
5360 if (filename == NULL) {
5361 ctxt->directory = NULL;
5362 } else {
5363 ctxt->directory = xmlParserGetDirectory(filename);
5364 }
5365
5366 inputStream = htmlNewInputStream(ctxt);
5367 if (inputStream == NULL) {
5368 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005369 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005370 return(NULL);
5371 }
5372
5373 if (filename == NULL)
5374 inputStream->filename = NULL;
5375 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005376 inputStream->filename = (char *)
5377 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005378 inputStream->buf = buf;
5379 inputStream->base = inputStream->buf->buffer->content;
5380 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005381 inputStream->end =
5382 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005383
5384 inputPush(ctxt, inputStream);
5385
5386 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5387 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005388 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5389 int cur = ctxt->input->cur - ctxt->input->base;
5390
Owen Taylor3473f882001-02-23 17:55:21 +00005391 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005392
5393 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5394 ctxt->input->cur = ctxt->input->base + cur;
5395 ctxt->input->end =
5396 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005397#ifdef DEBUG_PUSH
5398 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5399#endif
5400 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005401 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005402
5403 return(ctxt);
5404}
William M. Brack21e4ef22005-01-02 09:53:13 +00005405#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005406
5407/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005408 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005409 * @cur: a pointer to an array of xmlChar
5410 * @encoding: a free form C string describing the HTML document encoding, or NULL
5411 * @sax: the SAX handler block
5412 * @userData: if using SAX, this pointer will be provided on callbacks.
5413 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005414 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5415 * to handle parse events. If sax is NULL, fallback to the default DOM
5416 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005417 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005418 * Returns the resulting document tree unless SAX is NULL or the document is
5419 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005420 */
5421
5422htmlDocPtr
5423htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5424 htmlDocPtr ret;
5425 htmlParserCtxtPtr ctxt;
5426
Daniel Veillardd0463562001-10-13 09:15:48 +00005427 xmlInitParser();
5428
Owen Taylor3473f882001-02-23 17:55:21 +00005429 if (cur == NULL) return(NULL);
5430
5431
5432 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5433 if (ctxt == NULL) return(NULL);
5434 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005435 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005436 ctxt->sax = sax;
5437 ctxt->userData = userData;
5438 }
5439
5440 htmlParseDocument(ctxt);
5441 ret = ctxt->myDoc;
5442 if (sax != NULL) {
5443 ctxt->sax = NULL;
5444 ctxt->userData = NULL;
5445 }
5446 htmlFreeParserCtxt(ctxt);
5447
5448 return(ret);
5449}
5450
5451/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005452 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005453 * @cur: a pointer to an array of xmlChar
5454 * @encoding: a free form C string describing the HTML document encoding, or NULL
5455 *
5456 * parse an HTML in-memory document and build a tree.
5457 *
5458 * Returns the resulting document tree
5459 */
5460
5461htmlDocPtr
5462htmlParseDoc(xmlChar *cur, const char *encoding) {
5463 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5464}
5465
5466
5467/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005468 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005469 * @filename: the filename
5470 * @encoding: a free form C string describing the HTML document encoding, or NULL
5471 *
5472 * Create a parser context for a file content.
5473 * Automatic support for ZLIB/Compress compressed document is provided
5474 * by default if found at compile-time.
5475 *
5476 * Returns the new parser context or NULL
5477 */
5478htmlParserCtxtPtr
5479htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5480{
5481 htmlParserCtxtPtr ctxt;
5482 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005483 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005484 /* htmlCharEncoding enc; */
5485 xmlChar *content, *content_line = (xmlChar *) "charset=";
5486
Daniel Veillarda03e3652004-11-02 18:45:30 +00005487 if (filename == NULL)
5488 return(NULL);
5489
Daniel Veillardf403d292003-10-05 13:51:35 +00005490 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005491 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005492 return(NULL);
5493 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005494 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5495 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005496#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005497 if (xmlDefaultSAXHandler.error != NULL) {
5498 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5499 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005500#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005501 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005502 return(NULL);
5503 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005504
5505 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5506 xmlFree(canonicFilename);
5507 if (inputStream == NULL) {
5508 xmlFreeParserCtxt(ctxt);
5509 return(NULL);
5510 }
Owen Taylor3473f882001-02-23 17:55:21 +00005511
5512 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005513
Owen Taylor3473f882001-02-23 17:55:21 +00005514 /* set encoding */
5515 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005516 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005517 if (content) {
5518 strcpy ((char *)content, (char *)content_line);
5519 strcat ((char *)content, (char *)encoding);
5520 htmlCheckEncoding (ctxt, content);
5521 xmlFree (content);
5522 }
5523 }
5524
5525 return(ctxt);
5526}
5527
5528/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005529 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005530 * @filename: the filename
5531 * @encoding: a free form C string describing the HTML document encoding, or NULL
5532 * @sax: the SAX handler block
5533 * @userData: if using SAX, this pointer will be provided on callbacks.
5534 *
5535 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5536 * compressed document is provided by default if found at compile-time.
5537 * It use the given SAX function block to handle the parsing callback.
5538 * If sax is NULL, fallback to the default DOM tree building routines.
5539 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005540 * Returns the resulting document tree unless SAX is NULL or the document is
5541 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005542 */
5543
5544htmlDocPtr
5545htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5546 void *userData) {
5547 htmlDocPtr ret;
5548 htmlParserCtxtPtr ctxt;
5549 htmlSAXHandlerPtr oldsax = NULL;
5550
Daniel Veillardd0463562001-10-13 09:15:48 +00005551 xmlInitParser();
5552
Owen Taylor3473f882001-02-23 17:55:21 +00005553 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5554 if (ctxt == NULL) return(NULL);
5555 if (sax != NULL) {
5556 oldsax = ctxt->sax;
5557 ctxt->sax = sax;
5558 ctxt->userData = userData;
5559 }
5560
5561 htmlParseDocument(ctxt);
5562
5563 ret = ctxt->myDoc;
5564 if (sax != NULL) {
5565 ctxt->sax = oldsax;
5566 ctxt->userData = NULL;
5567 }
5568 htmlFreeParserCtxt(ctxt);
5569
5570 return(ret);
5571}
5572
5573/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005574 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005575 * @filename: the filename
5576 * @encoding: a free form C string describing the HTML document encoding, or NULL
5577 *
5578 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5579 * compressed document is provided by default if found at compile-time.
5580 *
5581 * Returns the resulting document tree
5582 */
5583
5584htmlDocPtr
5585htmlParseFile(const char *filename, const char *encoding) {
5586 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5587}
5588
5589/**
5590 * htmlHandleOmittedElem:
5591 * @val: int 0 or 1
5592 *
5593 * Set and return the previous value for handling HTML omitted tags.
5594 *
5595 * Returns the last value for 0 for no handling, 1 for auto insertion.
5596 */
5597
5598int
5599htmlHandleOmittedElem(int val) {
5600 int old = htmlOmittedDefaultValue;
5601
5602 htmlOmittedDefaultValue = val;
5603 return(old);
5604}
5605
Daniel Veillard930dfb62003-02-05 10:17:38 +00005606/**
5607 * htmlElementAllowedHere:
5608 * @parent: HTML parent element
5609 * @elt: HTML element
5610 *
5611 * Checks whether an HTML element may be a direct child of a parent element.
5612 * Note - doesn't check for deprecated elements
5613 *
5614 * Returns 1 if allowed; 0 otherwise.
5615 */
5616int
5617htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5618 const char** p ;
5619
5620 if ( ! elt || ! parent || ! parent->subelts )
5621 return 0 ;
5622
5623 for ( p = parent->subelts; *p; ++p )
5624 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5625 return 1 ;
5626
5627 return 0 ;
5628}
5629/**
5630 * htmlElementStatusHere:
5631 * @parent: HTML parent element
5632 * @elt: HTML element
5633 *
5634 * Checks whether an HTML element may be a direct child of a parent element.
5635 * and if so whether it is valid or deprecated.
5636 *
5637 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5638 */
5639htmlStatus
5640htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5641 if ( ! parent || ! elt )
5642 return HTML_INVALID ;
5643 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5644 return HTML_INVALID ;
5645
5646 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5647}
5648/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005649 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005650 * @elt: HTML element
5651 * @attr: HTML attribute
5652 * @legacy: whether to allow deprecated attributes
5653 *
5654 * Checks whether an attribute is valid for an element
5655 * Has full knowledge of Required and Deprecated attributes
5656 *
5657 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5658 */
5659htmlStatus
5660htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5661 const char** p ;
5662
5663 if ( !elt || ! attr )
5664 return HTML_INVALID ;
5665
5666 if ( elt->attrs_req )
5667 for ( p = elt->attrs_req; *p; ++p)
5668 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5669 return HTML_REQUIRED ;
5670
5671 if ( elt->attrs_opt )
5672 for ( p = elt->attrs_opt; *p; ++p)
5673 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5674 return HTML_VALID ;
5675
5676 if ( legacy && elt->attrs_depr )
5677 for ( p = elt->attrs_depr; *p; ++p)
5678 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5679 return HTML_DEPRECATED ;
5680
5681 return HTML_INVALID ;
5682}
5683/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005684 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005685 * @node: an htmlNodePtr in a tree
5686 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005687 * for Element nodes)
5688 *
5689 * Checks whether the tree node is valid. Experimental (the author
5690 * only uses the HTML enhancements in a SAX parser)
5691 *
5692 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5693 * legacy allowed) or htmlElementStatusHere (otherwise).
5694 * for Attribute nodes, a return from htmlAttrAllowed
5695 * for other nodes, HTML_NA (no checks performed)
5696 */
5697htmlStatus
5698htmlNodeStatus(const htmlNodePtr node, int legacy) {
5699 if ( ! node )
5700 return HTML_INVALID ;
5701
5702 switch ( node->type ) {
5703 case XML_ELEMENT_NODE:
5704 return legacy
5705 ? ( htmlElementAllowedHere (
5706 htmlTagLookup(node->parent->name) , node->name
5707 ) ? HTML_VALID : HTML_INVALID )
5708 : htmlElementStatusHere(
5709 htmlTagLookup(node->parent->name) ,
5710 htmlTagLookup(node->name) )
5711 ;
5712 case XML_ATTRIBUTE_NODE:
5713 return htmlAttrAllowed(
5714 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5715 default: return HTML_NA ;
5716 }
5717}
Daniel Veillard9475a352003-09-26 12:47:50 +00005718/************************************************************************
5719 * *
5720 * New set (2.6.0) of simpler and more flexible APIs *
5721 * *
5722 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is expanded.  @str is evaluated more than once, so it must not have
 * side effects.  The expansion is a single statement: terminate the
 * invocation with a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5734
5735/**
5736 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005737 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005738 *
5739 * Reset a parser context
5740 */
5741void
5742htmlCtxtReset(htmlParserCtxtPtr ctxt)
5743{
5744 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005745 xmlDictPtr dict;
5746
5747 if (ctxt == NULL)
5748 return;
5749
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005750 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005751 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005752
5753 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5754 xmlFreeInputStream(input);
5755 }
5756 ctxt->inputNr = 0;
5757 ctxt->input = NULL;
5758
5759 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005760 if (ctxt->spaceTab != NULL) {
5761 ctxt->spaceTab[0] = -1;
5762 ctxt->space = &ctxt->spaceTab[0];
5763 } else {
5764 ctxt->space = NULL;
5765 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005766
5767
5768 ctxt->nodeNr = 0;
5769 ctxt->node = NULL;
5770
5771 ctxt->nameNr = 0;
5772 ctxt->name = NULL;
5773
5774 DICT_FREE(ctxt->version);
5775 ctxt->version = NULL;
5776 DICT_FREE(ctxt->encoding);
5777 ctxt->encoding = NULL;
5778 DICT_FREE(ctxt->directory);
5779 ctxt->directory = NULL;
5780 DICT_FREE(ctxt->extSubURI);
5781 ctxt->extSubURI = NULL;
5782 DICT_FREE(ctxt->extSubSystem);
5783 ctxt->extSubSystem = NULL;
5784 if (ctxt->myDoc != NULL)
5785 xmlFreeDoc(ctxt->myDoc);
5786 ctxt->myDoc = NULL;
5787
5788 ctxt->standalone = -1;
5789 ctxt->hasExternalSubset = 0;
5790 ctxt->hasPErefs = 0;
5791 ctxt->html = 1;
5792 ctxt->external = 0;
5793 ctxt->instate = XML_PARSER_START;
5794 ctxt->token = 0;
5795
5796 ctxt->wellFormed = 1;
5797 ctxt->nsWellFormed = 1;
5798 ctxt->valid = 1;
5799 ctxt->vctxt.userData = ctxt;
5800 ctxt->vctxt.error = xmlParserValidityError;
5801 ctxt->vctxt.warning = xmlParserValidityWarning;
5802 ctxt->record_info = 0;
5803 ctxt->nbChars = 0;
5804 ctxt->checkIndex = 0;
5805 ctxt->inSubset = 0;
5806 ctxt->errNo = XML_ERR_OK;
5807 ctxt->depth = 0;
5808 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5809 ctxt->catalogs = NULL;
5810 xmlInitNodeInfoSeq(&ctxt->node_seq);
5811
5812 if (ctxt->attsDefault != NULL) {
5813 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5814 ctxt->attsDefault = NULL;
5815 }
5816 if (ctxt->attsSpecial != NULL) {
5817 xmlHashFree(ctxt->attsSpecial, NULL);
5818 ctxt->attsSpecial = NULL;
5819 }
5820}
5821
5822/**
5823 * htmlCtxtUseOptions:
5824 * @ctxt: an HTML parser context
5825 * @options: a combination of htmlParserOption(s)
5826 *
5827 * Applies the options to the parser context
5828 *
5829 * Returns 0 in case of success, the set of unknown or unimplemented options
5830 * in case of error.
5831 */
5832int
5833htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5834{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005835 if (ctxt == NULL)
5836 return(-1);
5837
Daniel Veillard9475a352003-09-26 12:47:50 +00005838 if (options & HTML_PARSE_NOWARNING) {
5839 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005840 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005841 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005842 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005843 }
5844 if (options & HTML_PARSE_NOERROR) {
5845 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005846 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005847 ctxt->sax->fatalError = NULL;
5848 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005849 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005850 }
5851 if (options & HTML_PARSE_PEDANTIC) {
5852 ctxt->pedantic = 1;
5853 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005854 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005855 } else
5856 ctxt->pedantic = 0;
5857 if (options & XML_PARSE_NOBLANKS) {
5858 ctxt->keepBlanks = 0;
5859 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5860 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005861 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005862 } else
5863 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005864 if (options & HTML_PARSE_RECOVER) {
5865 ctxt->recovery = 1;
5866 } else
5867 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00005868 if (options & HTML_PARSE_COMPACT) {
5869 ctxt->options |= HTML_PARSE_COMPACT;
5870 options -= HTML_PARSE_COMPACT;
5871 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005872 ctxt->dictNames = 0;
5873 return (options);
5874}
5875
5876/**
5877 * htmlDoRead:
5878 * @ctxt: an HTML parser context
5879 * @URL: the base URL to use for the document
5880 * @encoding: the document encoding, or NULL
5881 * @options: a combination of htmlParserOption(s)
5882 * @reuse: keep the context for reuse
5883 *
5884 * Common front-end for the htmlRead functions
5885 *
5886 * Returns the resulting document tree or NULL
5887 */
5888static htmlDocPtr
5889htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5890 int options, int reuse)
5891{
5892 htmlDocPtr ret;
5893
5894 htmlCtxtUseOptions(ctxt, options);
5895 ctxt->html = 1;
5896 if (encoding != NULL) {
5897 xmlCharEncodingHandlerPtr hdlr;
5898
5899 hdlr = xmlFindCharEncodingHandler(encoding);
5900 if (hdlr != NULL)
5901 xmlSwitchToEncoding(ctxt, hdlr);
5902 }
5903 if ((URL != NULL) && (ctxt->input != NULL) &&
5904 (ctxt->input->filename == NULL))
5905 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5906 htmlParseDocument(ctxt);
5907 ret = ctxt->myDoc;
5908 ctxt->myDoc = NULL;
5909 if (!reuse) {
5910 if ((ctxt->dictNames) &&
5911 (ret != NULL) &&
5912 (ret->dict == ctxt->dict))
5913 ctxt->dict = NULL;
5914 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005915 }
5916 return (ret);
5917}
5918
5919/**
5920 * htmlReadDoc:
5921 * @cur: a pointer to a zero terminated string
5922 * @URL: the base URL to use for the document
5923 * @encoding: the document encoding, or NULL
5924 * @options: a combination of htmlParserOption(s)
5925 *
5926 * parse an XML in-memory document and build a tree.
5927 *
5928 * Returns the resulting document tree
5929 */
5930htmlDocPtr
5931htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5932{
5933 htmlParserCtxtPtr ctxt;
5934
5935 if (cur == NULL)
5936 return (NULL);
5937
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005938 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00005939 ctxt = xmlCreateDocParserCtxt(cur);
5940 if (ctxt == NULL)
5941 return (NULL);
5942 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5943}
5944
5945/**
5946 * htmlReadFile:
5947 * @filename: a file or URL
5948 * @encoding: the document encoding, or NULL
5949 * @options: a combination of htmlParserOption(s)
5950 *
5951 * parse an XML file from the filesystem or the network.
5952 *
5953 * Returns the resulting document tree
5954 */
5955htmlDocPtr
5956htmlReadFile(const char *filename, const char *encoding, int options)
5957{
5958 htmlParserCtxtPtr ctxt;
5959
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005960 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00005961 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5962 if (ctxt == NULL)
5963 return (NULL);
5964 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5965}
5966
5967/**
5968 * htmlReadMemory:
5969 * @buffer: a pointer to a char array
5970 * @size: the size of the array
5971 * @URL: the base URL to use for the document
5972 * @encoding: the document encoding, or NULL
5973 * @options: a combination of htmlParserOption(s)
5974 *
5975 * parse an XML in-memory document and build a tree.
5976 *
5977 * Returns the resulting document tree
5978 */
5979htmlDocPtr
5980htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5981{
5982 htmlParserCtxtPtr ctxt;
5983
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005984 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00005985 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5986 if (ctxt == NULL)
5987 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005988 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00005989 if (ctxt->sax != NULL)
5990 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00005991 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5992}
5993
5994/**
5995 * htmlReadFd:
5996 * @fd: an open file descriptor
5997 * @URL: the base URL to use for the document
5998 * @encoding: the document encoding, or NULL
5999 * @options: a combination of htmlParserOption(s)
6000 *
6001 * parse an XML from a file descriptor and build a tree.
6002 *
6003 * Returns the resulting document tree
6004 */
6005htmlDocPtr
6006htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6007{
6008 htmlParserCtxtPtr ctxt;
6009 xmlParserInputBufferPtr input;
6010 xmlParserInputPtr stream;
6011
6012 if (fd < 0)
6013 return (NULL);
6014
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006015 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006016 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6017 if (input == NULL)
6018 return (NULL);
6019 ctxt = xmlNewParserCtxt();
6020 if (ctxt == NULL) {
6021 xmlFreeParserInputBuffer(input);
6022 return (NULL);
6023 }
6024 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6025 if (stream == NULL) {
6026 xmlFreeParserInputBuffer(input);
6027 xmlFreeParserCtxt(ctxt);
6028 return (NULL);
6029 }
6030 inputPush(ctxt, stream);
6031 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6032}
6033
6034/**
6035 * htmlReadIO:
6036 * @ioread: an I/O read function
6037 * @ioclose: an I/O close function
6038 * @ioctx: an I/O handler
6039 * @URL: the base URL to use for the document
6040 * @encoding: the document encoding, or NULL
6041 * @options: a combination of htmlParserOption(s)
6042 *
6043 * parse an HTML document from I/O functions and source and build a tree.
6044 *
6045 * Returns the resulting document tree
6046 */
6047htmlDocPtr
6048htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6049 void *ioctx, const char *URL, const char *encoding, int options)
6050{
6051 htmlParserCtxtPtr ctxt;
6052 xmlParserInputBufferPtr input;
6053 xmlParserInputPtr stream;
6054
6055 if (ioread == NULL)
6056 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006057 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006058
6059 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6060 XML_CHAR_ENCODING_NONE);
6061 if (input == NULL)
6062 return (NULL);
6063 ctxt = xmlNewParserCtxt();
6064 if (ctxt == NULL) {
6065 xmlFreeParserInputBuffer(input);
6066 return (NULL);
6067 }
6068 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6069 if (stream == NULL) {
6070 xmlFreeParserInputBuffer(input);
6071 xmlFreeParserCtxt(ctxt);
6072 return (NULL);
6073 }
6074 inputPush(ctxt, stream);
6075 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6076}
6077
6078/**
6079 * htmlCtxtReadDoc:
6080 * @ctxt: an HTML parser context
6081 * @cur: a pointer to a zero terminated string
6082 * @URL: the base URL to use for the document
6083 * @encoding: the document encoding, or NULL
6084 * @options: a combination of htmlParserOption(s)
6085 *
6086 * parse an XML in-memory document and build a tree.
6087 * This reuses the existing @ctxt parser context
6088 *
6089 * Returns the resulting document tree
6090 */
6091htmlDocPtr
6092htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6093 const char *URL, const char *encoding, int options)
6094{
6095 xmlParserInputPtr stream;
6096
6097 if (cur == NULL)
6098 return (NULL);
6099 if (ctxt == NULL)
6100 return (NULL);
6101
6102 htmlCtxtReset(ctxt);
6103
6104 stream = xmlNewStringInputStream(ctxt, cur);
6105 if (stream == NULL) {
6106 return (NULL);
6107 }
6108 inputPush(ctxt, stream);
6109 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6110}
6111
6112/**
6113 * htmlCtxtReadFile:
6114 * @ctxt: an HTML parser context
6115 * @filename: a file or URL
6116 * @encoding: the document encoding, or NULL
6117 * @options: a combination of htmlParserOption(s)
6118 *
6119 * parse an XML file from the filesystem or the network.
6120 * This reuses the existing @ctxt parser context
6121 *
6122 * Returns the resulting document tree
6123 */
6124htmlDocPtr
6125htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6126 const char *encoding, int options)
6127{
6128 xmlParserInputPtr stream;
6129
6130 if (filename == NULL)
6131 return (NULL);
6132 if (ctxt == NULL)
6133 return (NULL);
6134
6135 htmlCtxtReset(ctxt);
6136
Daniel Veillard29614c72004-11-26 10:47:26 +00006137 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006138 if (stream == NULL) {
6139 return (NULL);
6140 }
6141 inputPush(ctxt, stream);
6142 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6143}
6144
6145/**
6146 * htmlCtxtReadMemory:
6147 * @ctxt: an HTML parser context
6148 * @buffer: a pointer to a char array
6149 * @size: the size of the array
6150 * @URL: the base URL to use for the document
6151 * @encoding: the document encoding, or NULL
6152 * @options: a combination of htmlParserOption(s)
6153 *
6154 * parse an XML in-memory document and build a tree.
6155 * This reuses the existing @ctxt parser context
6156 *
6157 * Returns the resulting document tree
6158 */
6159htmlDocPtr
6160htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6161 const char *URL, const char *encoding, int options)
6162{
6163 xmlParserInputBufferPtr input;
6164 xmlParserInputPtr stream;
6165
6166 if (ctxt == NULL)
6167 return (NULL);
6168 if (buffer == NULL)
6169 return (NULL);
6170
6171 htmlCtxtReset(ctxt);
6172
6173 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6174 if (input == NULL) {
6175 return(NULL);
6176 }
6177
6178 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6179 if (stream == NULL) {
6180 xmlFreeParserInputBuffer(input);
6181 return(NULL);
6182 }
6183
6184 inputPush(ctxt, stream);
6185 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6186}
6187
6188/**
6189 * htmlCtxtReadFd:
6190 * @ctxt: an HTML parser context
6191 * @fd: an open file descriptor
6192 * @URL: the base URL to use for the document
6193 * @encoding: the document encoding, or NULL
6194 * @options: a combination of htmlParserOption(s)
6195 *
6196 * parse an XML from a file descriptor and build a tree.
6197 * This reuses the existing @ctxt parser context
6198 *
6199 * Returns the resulting document tree
6200 */
6201htmlDocPtr
6202htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6203 const char *URL, const char *encoding, int options)
6204{
6205 xmlParserInputBufferPtr input;
6206 xmlParserInputPtr stream;
6207
6208 if (fd < 0)
6209 return (NULL);
6210 if (ctxt == NULL)
6211 return (NULL);
6212
6213 htmlCtxtReset(ctxt);
6214
6215
6216 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6217 if (input == NULL)
6218 return (NULL);
6219 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6220 if (stream == NULL) {
6221 xmlFreeParserInputBuffer(input);
6222 return (NULL);
6223 }
6224 inputPush(ctxt, stream);
6225 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6226}
6227
6228/**
6229 * htmlCtxtReadIO:
6230 * @ctxt: an HTML parser context
6231 * @ioread: an I/O read function
6232 * @ioclose: an I/O close function
6233 * @ioctx: an I/O handler
6234 * @URL: the base URL to use for the document
6235 * @encoding: the document encoding, or NULL
6236 * @options: a combination of htmlParserOption(s)
6237 *
6238 * parse an HTML document from I/O functions and source and build a tree.
6239 * This reuses the existing @ctxt parser context
6240 *
6241 * Returns the resulting document tree
6242 */
6243htmlDocPtr
6244htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6245 xmlInputCloseCallback ioclose, void *ioctx,
6246 const char *URL,
6247 const char *encoding, int options)
6248{
6249 xmlParserInputBufferPtr input;
6250 xmlParserInputPtr stream;
6251
6252 if (ioread == NULL)
6253 return (NULL);
6254 if (ctxt == NULL)
6255 return (NULL);
6256
6257 htmlCtxtReset(ctxt);
6258
6259 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6260 XML_CHAR_ENCODING_NONE);
6261 if (input == NULL)
6262 return (NULL);
6263 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6264 if (stream == NULL) {
6265 xmlFreeParserInputBuffer(input);
6266 return (NULL);
6267 }
6268 inputPush(ctxt, stream);
6269 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6270}
6271
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006272#define bottom_HTMLparser
6273#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006274#endif /* LIBXML_HTML_ENABLED */