blob: 304f7c47767324e16adcd0f9bb2a7e56be8db34e [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to
 * use them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if
 *           compiled in ISO-Latin or UTF-8, and the current 16 bit value if
 *           compiled in UNICODE mode. This should be used internally by the
 *           parser only to compare to ASCII values, otherwise it would
 *           break when running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as
 *           CUR, it should be used only to compare on ASCII based
 *           substrings.
 *   SKIP(n) skips n xmlChars, and must also be used only to skip ASCII
 *           defined strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT  returns the current char value, with the full decoding of
 *            UTF-8 if we are using this mode. It returns an int.
 *   NEXT     skips to the next character, this does the proper decoding
 *            in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) skips the current unicode character of l xmlChars long.
 *   COPY(to) copies one char to *to, increments CUR_PTR and to accordingly
 */
236
/*
 * All the macros below expect a variable named `ctxt` (the parser context)
 * to be in scope at the point of use.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance by val bytes, keeping the char and column counters in step. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

/* Byte found val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Current read pointer (usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

/*
 * NOTE: SHRINK and GROW expand to a bare `if` statement. They are kept in
 * that form on purpose; use them only as stand-alone statements and never
 * as the unbraced body of an if/else.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a pushed-back token is pending. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over the current character, which is l bytes long: update line and
 * column from the byte being left, clear any pending token and count one
 * character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* l must be an int lvalue: it receives the byte length of the char. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v, encoded on l bytes, to buffer b at index i and
 * advance i. Wrapped in do/while(0) with parenthesized arguments so it
 * behaves as a single statement whatever expressions are passed in.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Owen Taylor3473f882001-02-23 17:55:21 +0000405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000410 }
411
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
413 *len = 1;
414 return((int) *ctxt->input->cur);
415}
416
417/**
Owen Taylor3473f882001-02-23 17:55:21 +0000418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000426static int
Owen Taylor3473f882001-02-23 17:55:21 +0000427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428 int res = 0;
429
William M. Brack76e95df2003-10-18 16:20:14 +0000430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433 xmlPopInput(ctxt);
434 } else {
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
438 ctxt->input->cur++;
439 ctxt->nbChars++;
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442 }
443 res++;
444 }
445 return(res);
446}
447
448
449
450/************************************************************************
451 * *
452 * The list of HTML elements and their properties *
453 * *
454 ************************************************************************/
455
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000468
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma separated list of element names and
 * has a matching NB_<group> macro giving the number of names in it.
 * Groups MUST be joined with commas: adjacent string literals are
 * concatenated by the compiler, so a missing comma silently merges the
 * last name of one group with the first name of the next one.
 */

/* Empty groups: usable only as a trailing marker, never inside a list. */
#define PCDATA
#define NB_PCDATA	0
#define MODIFIER
#define NB_MODIFIER	0

#define HEADING		"h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING	6
#define LIST		"ul", "ol", "dir", "menu"
#define NB_LIST		4
#define FONTSTYLE	"tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE	8
#define PHRASE		"em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE	10
#define SPECIAL		"a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL	15
#define FORMCTRL	"input", "select", "textarea", "label", "button"
#define NB_FORMCTRL	5

#define INLINE		FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE	(NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK		HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK	(NB_HEADING + NB_LIST + 14)
#define FLOW		BLOCK,INLINE
#define NB_FLOW		(NB_BLOCK + NB_INLINE)

#define EMPTY		NULL


/* NULL-terminated lists of element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
502
503
/* ... and for HTML Attributes */

/*
 * Each group macro expands to a comma separated list of attribute names
 * and has a matching NB_<group> macro giving the number of names in it.
 */
#define COREATTRS	"id", "class", "style", "title"
#define NB_COREATTRS	4
#define I18N		"lang", "dir"
#define NB_I18N		2
#define EVENTS		"onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS	9
#define ATTRS		COREATTRS,I18N,EVENTS
#define NB_ATTRS	(NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN	"align", "char", "charoff"
#define NB_CELLHALIGN	3
#define CELLVALIGN	"valign"
#define NB_CELLVALIGN	1

/* NULL-terminated lists of attribute names */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000523
524
525/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000526static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000529static const char* const target_attr[] = { "target", NULL } ;
530static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* const alt_attr[] = { "alt", NULL } ;
532static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* const href_attrs[] = { "href", NULL } ;
534static const char* const clear_attrs[] = { "clear", NULL } ;
535static const char* const inline_p[] = { INLINE, "p", NULL } ;
536
537static const char* const flow_param[] = { FLOW, "param", NULL } ;
538static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000539 "archive", "alt", "name", "height", "width", "align",
540 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000541static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000542 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000543static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000544 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000545static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
546static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
547static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
548static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000549 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000550static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000551 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
552
553
Daniel Veillard065abe82006-07-03 08:55:04 +0000554static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
555static const char* const col_elt[] = { "col", NULL } ;
556static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
557static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
558static const char* const dl_contents[] = { "dt", "dd", NULL } ;
559static const char* const compact_attr[] = { "compact", NULL } ;
560static const char* const label_attr[] = { "label", NULL } ;
561static const char* const fieldset_contents[] = { FLOW, "legend" } ;
562static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
563static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
564static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
565static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
566static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
567static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
568static const char* const head_attrs[] = { I18N, "profile", NULL } ;
569static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
570static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
571static const char* const version_attr[] = { "version", NULL } ;
572static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
573static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
574static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
575static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
576static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
577static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
578static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
579static const char* const align_attr[] = { "align", NULL } ;
580static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
581static const char* const map_contents[] = { BLOCK, "area", NULL } ;
582static const char* const name_attr[] = { "name", NULL } ;
583static const char* const action_attr[] = { "action", NULL } ;
584static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
585static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
586static const char* const content_attr[] = { "content", NULL } ;
587static const char* const type_attr[] = { "type", NULL } ;
588static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
589static const char* const object_contents[] = { FLOW, "param", NULL } ;
590static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
591static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
592static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
593static const char* const option_elt[] = { "option", NULL } ;
594static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
595static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
596static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
597static const char* const width_attr[] = { "width", NULL } ;
598static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
599static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
600static const char* const language_attr[] = { "language", NULL } ;
601static const char* const select_content[] = { "optgroup", "option", NULL } ;
602static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
603static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
604static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
605static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
606static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
607static const char* const tr_elt[] = { "tr", NULL } ;
608static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
609static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
610static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
611static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
612static const char* const tr_contents[] = { "th", "td", NULL } ;
613static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
614static const char* const li_elt[] = { "li", NULL } ;
615static const char* const ul_depr[] = { "type", "compact", NULL} ;
616static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000617
618#define DECL (const char**)
619
Daniel Veillard22090732001-07-16 00:06:07 +0000620static const htmlElemDesc
621html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000622{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
623 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
624},
625{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
626 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
627},
628{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
630},
631{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
632 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
633},
634{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
635 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
636},
637{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
638 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
639},
640{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
641 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
642},
643{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
644 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
645},
646{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
647 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
648},
649{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
650 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
651},
652{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
653 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
654},
655{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
656 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
657},
658{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
659 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
660},
661{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
662 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
663},
664{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
665 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
666},
667{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
669},
670{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
671 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
672},
673{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
674 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
675},
676{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
677 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
678},
679{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
680 EMPTY , NULL , DECL col_attrs , NULL, NULL
681},
682{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
683 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
684},
685{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
686 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
687},
688{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
689 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
690},
691{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
692 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
693},
694{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
695 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
696},
697{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
698 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
699},
700{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000701 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000702},
703{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
704 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
705},
706{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
707 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
708},
709{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
710 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
711},
712{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
713 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
714},
715{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
716 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
717},
718{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
719 EMPTY, NULL, NULL, DECL frame_attrs, NULL
720},
721{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
722 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
723},
724{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
725 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
726},
727{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
728 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
729},
730{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
731 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
732},
733{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
734 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
735},
736{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
737 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
738},
739{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
740 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
741},
742{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
743 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
744},
745{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
746 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
747},
748{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
749 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
750},
751{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
752 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
753},
754{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
755 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
756},
757{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000758 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000759},
760{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
761 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
762},
763{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
764 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
765},
766{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
767 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
768},
769{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
770 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
771},
772{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
773 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
774},
775{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
776 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
777},
778{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
779 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
780},
781{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
782 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
783},
784{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000785 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000786},
787{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
788 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
789},
790{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
791 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
792},
793{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
794 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
795},
796{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
797 DECL html_flow, "div", DECL html_attrs, NULL, NULL
798},
799{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
800 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
801},
802{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
803 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
804},
805{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000806 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000807},
808{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
809 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
810},
811{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813},
814{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000815 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000816},
817{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
818 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
819},
820{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
821 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
822},
823{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
824 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
825},
826{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
828},
829{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
830 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
831},
832{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
833 DECL select_content, NULL, DECL select_attrs, NULL, NULL
834},
835{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
836 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837},
838{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
839 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840},
841{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
842 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
843},
844{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
845 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
846},
847{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
848 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
849},
850{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852},
853{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
854 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
855},
856{ "table", 0, 0, 0, 0, 0, 0, 0, "",
857 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
858},
859{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
860 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
861},
862{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
863 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
864},
865{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
866 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
867},
868{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
869 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
870},
871{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
872 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
873},
874{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
875 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
876},
877{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
878 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
879},
880{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
881 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
882},
883{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
884 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
885},
886{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
887 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
888},
889{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
890 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
891},
892{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
894}
Owen Taylor3473f882001-02-23 17:55:21 +0000895};
896
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups: the first name of a
 * group is the tag being opened, the names after it are the open elements
 * that this start tag closes. An additional NULL ends the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL
};
957
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and inside which a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head", NULL
};
970
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes). The array has no NULL
 * terminator: users iterate over it with sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};
996
/*
 * Priority table used by the HTML parser when it meets extra end tags in
 * broken HTML pages. Every element gets a priority; an end tag is only
 * allowed to close open elements whose priority is lower than or equal
 * to its own.
 */
typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* priority associated with that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001024
/*
 * Index over htmlStartClose built by htmlInitAutoClose(): each used slot
 * points at the first name of one group, unused slots are NULL. The flag
 * is set once the index has been filled in.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1027
1028/************************************************************************
1029 * *
1030 * functions to handle HTML specific data *
1031 * *
1032 ************************************************************************/
1033
1034/**
1035 * htmlInitAutoClose:
1036 *
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1040 */
1041void
1042htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001043 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001044
1045 if (htmlStartCloseIndexinitialized) return;
1046
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048 indx = 0;
1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001050 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001051 while (htmlStartClose[i] != NULL) i++;
1052 i++;
1053 }
1054 htmlStartCloseIndexinitialized = 1;
1055}
1056
1057/**
1058 * htmlTagLookup:
1059 * @tag: The tag name in lowercase
1060 *
1061 * Lookup the HTML tag in the ElementTable
1062 *
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1064 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001065const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001066htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001067 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001068
1069 for (i = 0; i < (sizeof(html40ElementTable) /
1070 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001072 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001073 }
1074 return(NULL);
1075}
1076
1077/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1080 *
1081 * Return value: The "endtag" priority.
1082 **/
1083static int
1084htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001087 while ((htmlEndPriority[i].name != NULL) &&
1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001090
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001092}
1093
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001094
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001095/**
Owen Taylor3473f882001-02-23 17:55:21 +00001096 * htmlCheckAutoClose:
1097 * @newtag: The new tag name
1098 * @oldtag: The old tag name
1099 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001100 * Checks whether the new tag is one of the registered valid tags for
1101 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103 *
1104 * Returns 0 if no, 1 if yes.
1105 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001106static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001109 int i, indx;
1110 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001111
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 if (htmlStartCloseIndexinitialized == 0)
1113 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001114
1115 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 if (closed == NULL)
1119 return (0);
1120 if (xmlStrEqual(BAD_CAST * closed, newtag))
1121 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001122 }
1123
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001124 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001125 i++;
1126 while (htmlStartClose[i] != NULL) {
1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 return (1);
1129 }
1130 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001131 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001132 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001133}
1134
1135/**
1136 * htmlAutoCloseOnClose:
1137 * @ctxt: an HTML parser context
1138 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001139 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001140 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001141 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001142 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001143static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145{
1146 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001147 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001149 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001150
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001151 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001152
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001153 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154 break;
1155 /*
1156 * A missplaced endtag can only close elements with lower
1157 * or equal priority, so if we find an element with higher
1158 * priority before we find an element with
1159 * matching name, we just ignore this endtag
1160 */
1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001163 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164 if (i < 0)
1165 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001166
1167 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001168 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001169 if ((info != NULL) && (info->endTag == 3)) {
1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001172 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001173 }
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001176 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
1178}
1179
1180/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001181 * htmlAutoCloseOnEnd:
1182 * @ctxt: an HTML parser context
1183 *
1184 * Close all remaining tags at the end of the stream
1185 */
1186static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001190
William M. Brack899e64a2003-09-26 18:03:42 +00001191 if (ctxt->nameNr == 0)
1192 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 }
1198}
1199
1200/**
Owen Taylor3473f882001-02-23 17:55:21 +00001201 * htmlAutoClose:
1202 * @ctxt: an HTML parser context
1203 * @newtag: The new tag name or NULL
1204 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001205 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriates closes if possible/needed.
1209 * If newtag is NULL this mean we are at the end of the resource
1210 * and we should check
1211 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001212static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001216 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001220 }
1221 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001222 htmlAutoCloseOnEnd(ctxt);
1223 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001231 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001232 }
Owen Taylor3473f882001-02-23 17:55:21 +00001233}
1234
1235/**
1236 * htmlAutoCloseTag:
1237 * @doc: the HTML document
1238 * @name: The tag name
1239 * @elem: the HTML element
1240 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001241 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of it's children would autoclose the
1244 * given tag.
1245 *
1246 * Returns 1 if autoclose, 0 otherwise
1247 */
1248int
1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 if (xmlStrEqual(name, elem->name)) return(0);
1254 if (htmlCheckAutoClose(elem->name, name)) return(1);
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, name, child)) return(1);
1258 child = child->next;
1259 }
1260 return(0);
1261}
1262
1263/**
1264 * htmlIsAutoClosed:
1265 * @doc: the HTML document
1266 * @elem: the HTML element
1267 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001268 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of it's child
1271 *
1272 * Returns 1 if autoclosed, 0 otherwise
1273 */
1274int
1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276 htmlNodePtr child;
1277
1278 if (elem == NULL) return(1);
1279 child = elem->children;
1280 while (child != NULL) {
1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282 child = child->next;
1283 }
1284 return(0);
1285}
1286
1287/**
1288 * htmlCheckImplied:
1289 * @ctxt: an HTML parser context
1290 * @newtag: The new tag name
1291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001293 * called when a new tag has been detected and generates the
1294 * appropriates implicit tags if missing
1295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296static void
Owen Taylor3473f882001-02-23 17:55:21 +00001297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298 if (!htmlOmittedDefaultValue)
1299 return;
1300 if (xmlStrEqual(newtag, BAD_CAST"html"))
1301 return;
1302 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001303 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306 }
1307 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308 return;
1309 if ((ctxt->nameNr <= 1) &&
1310 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316 /*
1317 * dropped OBJECT ... i you put it first BODY will be
1318 * assumed !
1319 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326 int i;
1327 for (i = 0;i < ctxt->nameNr;i++) {
1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329 return;
1330 }
1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332 return;
1333 }
1334 }
1335
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001336 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339 }
1340}
1341
1342/**
1343 * htmlCheckParagraph
1344 * @ctxt: an HTML parser context
1345 *
1346 * Check whether a p element need to be implied before inserting
1347 * characters in the current element.
1348 *
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350 * in case of error.
1351 */
1352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001353static int
Owen Taylor3473f882001-02-23 17:55:21 +00001354htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355 const xmlChar *tag;
1356 int i;
1357
1358 if (ctxt == NULL)
1359 return(-1);
1360 tag = ctxt->name;
1361 if (tag == NULL) {
1362 htmlAutoClose(ctxt, BAD_CAST"p");
1363 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367 return(1);
1368 }
1369 if (!htmlOmittedDefaultValue)
1370 return(0);
1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001373 htmlAutoClose(ctxt, BAD_CAST"p");
1374 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001375 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378 return(1);
1379 }
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlIsScriptAttribute:
1386 * @name: an attribute name
1387 *
1388 * Check if an attribute is of content type Script
1389 *
1390 * Returns 1 is the attribute is a script 0 otherwise
1391 */
1392int
1393htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001394 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001395
1396 if (name == NULL)
1397 return(0);
1398 /*
1399 * all script attributes start with 'on'
1400 */
1401 if ((name[0] != 'o') || (name[1] != 'n'))
1402 return(0);
1403 for (i = 0;
1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405 i++) {
1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407 return(1);
1408 }
1409 return(0);
1410}
1411
1412/************************************************************************
1413 * *
1414 * The list of HTML predefined entities *
1415 * *
1416 ************************************************************************/
1417
1418
Daniel Veillard22090732001-07-16 00:06:07 +00001419static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001420/*
1421 * the 4 absolute ones, plus apostrophe.
1422 */
1423{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1424{ 38, "amp", "ampersand, U+0026 ISOnum" },
1425{ 39, "apos", "single quote" },
1426{ 60, "lt", "less-than sign, U+003C ISOnum" },
1427{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1428
/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
1433{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1434{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1435{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1436{ 163, "pound","pound sign, U+00A3 ISOnum" },
1437{ 164, "curren","currency sign, U+00A4 ISOnum" },
1438{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1439{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1440{ 167, "sect", "section sign, U+00A7 ISOnum" },
1441{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1442{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1443{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1444{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1445{ 172, "not", "not sign, U+00AC ISOnum" },
1446{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1447{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1448{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1449{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1450{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1451{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1452{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1453{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1454{ 181, "micro","micro sign, U+00B5 ISOnum" },
1455{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1456{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1457{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1458{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1459{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1460{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1461{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1462{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1463{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1464{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1465{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1466{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1467{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1468{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1469{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1470{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1471{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1472{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1473{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1474{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1475{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1476{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1477{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1478{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1479{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1480{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1481{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1482{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1483{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1484{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1485{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1486{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1487{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1488{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1489{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1490{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1491{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1492{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1493{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1494{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1495{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1496{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1497{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1498{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1499{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1500{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1501{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1502{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1503{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1504{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1505{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1506{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1507{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1508{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1509{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1510{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1511{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1512{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1513{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1514{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1515{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1516{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1517{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1518{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1519{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1520{ 247, "divide","division sign, U+00F7 ISOnum" },
1521{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1522{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1523{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1524{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1525{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1526{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1527{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1528{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1529
1530{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1531{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1532{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1533{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1534{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1535
/*
 * Anything below should really be kept as entity references
 */
1539{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1540
1541{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1542{ 732, "tilde","small tilde, U+02DC ISOdia" },
1543
1544{ 913, "Alpha","greek capital letter alpha, U+0391" },
1545{ 914, "Beta", "greek capital letter beta, U+0392" },
1546{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1547{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1548{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1549{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1550{ 919, "Eta", "greek capital letter eta, U+0397" },
1551{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1552{ 921, "Iota", "greek capital letter iota, U+0399" },
1553{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001554{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001555{ 924, "Mu", "greek capital letter mu, U+039C" },
1556{ 925, "Nu", "greek capital letter nu, U+039D" },
1557{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1558{ 927, "Omicron","greek capital letter omicron, U+039F" },
1559{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1560{ 929, "Rho", "greek capital letter rho, U+03A1" },
1561{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1562{ 932, "Tau", "greek capital letter tau, U+03A4" },
1563{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1564{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1565{ 935, "Chi", "greek capital letter chi, U+03A7" },
1566{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1567{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1568
1569{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1570{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1571{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1572{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1573{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1574{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1575{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1576{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1577{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1578{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1579{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1580{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1581{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1582{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1583{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1584{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1585{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1586{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1587{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1588{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1589{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1590{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1591{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1592{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1593{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1594{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1595{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1596{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1597
1598{ 8194, "ensp", "en space, U+2002 ISOpub" },
1599{ 8195, "emsp", "em space, U+2003 ISOpub" },
1600{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1601{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1602{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1603{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1604{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1605{ 8211, "ndash","en dash, U+2013 ISOpub" },
1606{ 8212, "mdash","em dash, U+2014 ISOpub" },
1607{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1608{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1609{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1610{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1611{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1612{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1613{ 8224, "dagger","dagger, U+2020 ISOpub" },
1614{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1615
1616{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1617{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1618
1619{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1620
1621{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1622{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1623
1624{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1625{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1626
1627{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1628{ 8260, "frasl","fraction slash, U+2044 NEW" },
1629
1630{ 8364, "euro", "euro sign, U+20AC NEW" },
1631
1632{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1633{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1634{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1635{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1636{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1637{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1638{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1639{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1640{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1641{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1642{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1643{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1644{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1645{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1646{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1647{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1648
1649{ 8704, "forall","for all, U+2200 ISOtech" },
1650{ 8706, "part", "partial differential, U+2202 ISOtech" },
1651{ 8707, "exist","there exists, U+2203 ISOtech" },
1652{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1653{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1654{ 8712, "isin", "element of, U+2208 ISOtech" },
1655{ 8713, "notin","not an element of, U+2209 ISOtech" },
1656{ 8715, "ni", "contains as member, U+220B ISOtech" },
1657{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001658{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001659{ 8722, "minus","minus sign, U+2212 ISOtech" },
1660{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1661{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1662{ 8733, "prop", "proportional to, U+221D ISOtech" },
1663{ 8734, "infin","infinity, U+221E ISOtech" },
1664{ 8736, "ang", "angle, U+2220 ISOamso" },
1665{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1666{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1667{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1668{ 8746, "cup", "union = cup, U+222A ISOtech" },
1669{ 8747, "int", "integral, U+222B ISOtech" },
1670{ 8756, "there4","therefore, U+2234 ISOtech" },
1671{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1672{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1673{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1674{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1675{ 8801, "equiv","identical to, U+2261 ISOtech" },
1676{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1677{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1678{ 8834, "sub", "subset of, U+2282 ISOtech" },
1679{ 8835, "sup", "superset of, U+2283 ISOtech" },
1680{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1681{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1682{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1683{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1684{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1685{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1686{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1687{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1688{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1689{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1690{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1691{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1692{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1693{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1694
1695{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1696{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1697{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1698{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1699
1700};
1701
1702/************************************************************************
1703 * *
1704 * Commodity functions to handle entities *
1705 * *
1706 ************************************************************************/
1707
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates @buffer to that size. When the
 * reallocation fails, the error is reported with htmlErrMemory() on the
 * ctxt variable of the calling function, the old buffer is freed and the
 * calling function returns NULL.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *grownBuffer;                                               \
    buffer##_size *= 2;                                                 \
    grownBuffer = (xmlChar *) xmlRealloc(buffer,                        \
                                  buffer##_size * sizeof(xmlChar));     \
    if (grownBuffer != NULL) {                                          \
        buffer = grownBuffer;                                           \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1722
1723/**
1724 * htmlEntityLookup:
1725 * @name: the entity name
1726 *
1727 * Lookup the given entity in EntitiesTable
1728 *
1729 * TODO: the linear scan is really ugly, an hash table is really needed.
1730 *
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001733const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001734htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001735 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001736
1737 for (i = 0;i < (sizeof(html40EntitiesTable)/
1738 sizeof(html40EntitiesTable[0]));i++) {
1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001740 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 }
1743 return(NULL);
1744}
1745
1746/**
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1749 *
1750 * Lookup the given entity in EntitiesTable
1751 *
1752 * TODO: the linear scan is really ugly, an hash table is really needed.
1753 *
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001756const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001757htmlEntityValueLookup(unsigned int value) {
1758 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001759
1760 for (i = 0;i < (sizeof(html40EntitiesTable)/
1761 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001762 if (html40EntitiesTable[i].value >= value) {
1763 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001764 break;
William M. Brack78637da2003-07-31 14:47:38 +00001765 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001766 }
Owen Taylor3473f882001-02-23 17:55:21 +00001767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
Daniel Veillardce682bc2004-11-05 17:22:25 +00001797 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 if (in == NULL) {
1799 /*
1800 * initialization nothing to do
1801 */
1802 *outlen = 0;
1803 *inlen = 0;
1804 return(0);
1805 }
1806 inend = in + (*inlen);
1807 outend = out + (*outlen);
1808 while (in < inend) {
1809 d = *in++;
1810 if (d < 0x80) { c= d; trailing= 0; }
1811 else if (d < 0xC0) {
1812 /* trailing byte in leading position */
1813 *outlen = out - outstart;
1814 *inlen = processed - instart;
1815 return(-2);
1816 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1817 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1818 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1819 else {
1820 /* no chance for this in Ascii */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 }
1825
1826 if (inend - in < trailing) {
1827 break;
1828 }
1829
1830 for ( ; trailing; trailing--) {
1831 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1832 break;
1833 c <<= 6;
1834 c |= d & 0x3F;
1835 }
1836
1837 /* assertion: c is a single UTF-4 value */
1838 if (c < 0x80) {
1839 if (out + 1 >= outend)
1840 break;
1841 *out++ = c;
1842 } else {
1843 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001844 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001845 const char *cp;
1846 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001847
1848 /*
1849 * Try to lookup a predefined HTML entity for it
1850 */
1851
1852 ent = htmlEntityValueLookup(c);
1853 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001854 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1855 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001856 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001857 else
1858 cp = ent->name;
1859 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001860 if (out + 2 + len >= outend)
1861 break;
1862 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001863 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001864 out += len;
1865 *out++ = ';';
1866 }
1867 processed = in;
1868 }
1869 *outlen = out - outstart;
1870 *inlen = processed - instart;
1871 return(0);
1872}
1873
1874/**
1875 * htmlEncodeEntities:
1876 * @out: a pointer to an array of bytes to store the result
1877 * @outlen: the length of @out
1878 * @in: a pointer to an array of UTF-8 chars
1879 * @inlen: the length of @in
1880 * @quoteChar: the quote character to escape (' or ") or zero.
1881 *
1882 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1883 * plus HTML entities block of chars out.
1884 *
1885 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1886 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001887 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001888 * The value of @outlen after return is the number of octets consumed.
1889 */
1890int
1891htmlEncodeEntities(unsigned char* out, int *outlen,
1892 const unsigned char* in, int *inlen, int quoteChar) {
1893 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001894 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001895 const unsigned char* outstart = out;
1896 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001897 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001898 unsigned int c, d;
1899 int trailing;
1900
Daniel Veillardce682bc2004-11-05 17:22:25 +00001901 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1902 return(-1);
1903 outend = out + (*outlen);
1904 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001905 while (in < inend) {
1906 d = *in++;
1907 if (d < 0x80) { c= d; trailing= 0; }
1908 else if (d < 0xC0) {
1909 /* trailing byte in leading position */
1910 *outlen = out - outstart;
1911 *inlen = processed - instart;
1912 return(-2);
1913 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1914 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1915 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1916 else {
1917 /* no chance for this in Ascii */
1918 *outlen = out - outstart;
1919 *inlen = processed - instart;
1920 return(-2);
1921 }
1922
1923 if (inend - in < trailing)
1924 break;
1925
1926 while (trailing--) {
1927 if (((d= *in++) & 0xC0) != 0x80) {
1928 *outlen = out - outstart;
1929 *inlen = processed - instart;
1930 return(-2);
1931 }
1932 c <<= 6;
1933 c |= d & 0x3F;
1934 }
1935
1936 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001937 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1938 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001939 if (out >= outend)
1940 break;
1941 *out++ = c;
1942 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001943 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001944 const char *cp;
1945 char nbuf[16];
1946 int len;
1947
1948 /*
1949 * Try to lookup a predefined HTML entity for it
1950 */
1951 ent = htmlEntityValueLookup(c);
1952 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001953 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001954 cp = nbuf;
1955 }
1956 else
1957 cp = ent->name;
1958 len = strlen(cp);
1959 if (out + 2 + len > outend)
1960 break;
1961 *out++ = '&';
1962 memcpy(out, cp, len);
1963 out += len;
1964 *out++ = ';';
1965 }
1966 processed = in;
1967 }
1968 *outlen = out - outstart;
1969 *inlen = processed - instart;
1970 return(0);
1971}
1972
Owen Taylor3473f882001-02-23 17:55:21 +00001973/************************************************************************
1974 * *
1975 * Commodity functions to handle streams *
1976 * *
1977 ************************************************************************/
1978
1979/**
Owen Taylor3473f882001-02-23 17:55:21 +00001980 * htmlNewInputStream:
1981 * @ctxt: an HTML parser context
1982 *
1983 * Create a new input stream structure
1984 * Returns the new input stream or NULL
1985 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001986static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001987htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1988 htmlParserInputPtr input;
1989
1990 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1991 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001992 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001993 return(NULL);
1994 }
1995 memset(input, 0, sizeof(htmlParserInput));
1996 input->filename = NULL;
1997 input->directory = NULL;
1998 input->base = NULL;
1999 input->cur = NULL;
2000 input->buf = NULL;
2001 input->line = 1;
2002 input->col = 1;
2003 input->buf = NULL;
2004 input->free = NULL;
2005 input->version = NULL;
2006 input->consumed = 0;
2007 input->length = 0;
2008 return(input);
2009}
2010
2011
2012/************************************************************************
2013 * *
2014 * Commodity functions, cleanup needed ? *
2015 * *
2016 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002031
2032/**
2033 * areBlanks:
2034 * @ctxt: an HTML parser context
2035 * @str: a xmlChar *
2036 * @len: the size of @str
2037 *
2038 * Is this a sequence of blank chars that one can ignore ?
2039 *
2040 * Returns 1 if ignorable 0 otherwise.
2041 */
2042
2043static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002044 unsigned int i;
2045 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002046 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002047 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002048
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002049 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002050 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002051
2052 if (CUR == 0) return(1);
2053 if (CUR != '<') return(0);
2054 if (ctxt->name == NULL)
2055 return(1);
2056 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2057 return(1);
2058 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2059 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002060
2061 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2062 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2063 dtd = xmlGetIntSubset(ctxt->myDoc);
2064 if (dtd != NULL && dtd->ExternalID != NULL) {
2065 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2066 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2067 return(1);
2068 }
2069 }
2070
Owen Taylor3473f882001-02-23 17:55:21 +00002071 if (ctxt->node == NULL) return(0);
2072 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002073 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2074 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002075 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002076 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2077 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002078 /* keep ws in constructs like ...<b> </b>...
2079 for all tags "b" allowing PCDATA */
2080 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2081 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2082 return(0);
2083 }
2084 }
Owen Taylor3473f882001-02-23 17:55:21 +00002085 } else if (xmlNodeIsText(lastChild)) {
2086 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002087 } else {
2088 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2089 for all tags "p" allowing PCDATA */
2090 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2091 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2092 return(0);
2093 }
2094 }
Owen Taylor3473f882001-02-23 17:55:21 +00002095 }
2096 return(1);
2097}
2098
2099/**
Owen Taylor3473f882001-02-23 17:55:21 +00002100 * htmlNewDocNoDtD:
2101 * @URI: URI for the dtd, or NULL
2102 * @ExternalID: the external ID of the DTD, or NULL
2103 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002104 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2105 * are NULL
2106 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002107 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002108 */
2109htmlDocPtr
2110htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2111 xmlDocPtr cur;
2112
2113 /*
2114 * Allocate a new document and fill the fields.
2115 */
2116 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2117 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002118 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002119 return(NULL);
2120 }
2121 memset(cur, 0, sizeof(xmlDoc));
2122
2123 cur->type = XML_HTML_DOCUMENT_NODE;
2124 cur->version = NULL;
2125 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002126 cur->doc = cur;
2127 cur->name = NULL;
2128 cur->children = NULL;
2129 cur->extSubset = NULL;
2130 cur->oldNs = NULL;
2131 cur->encoding = NULL;
2132 cur->standalone = 1;
2133 cur->compression = 0;
2134 cur->ids = NULL;
2135 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002136 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002137 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002138 if ((ExternalID != NULL) ||
2139 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002140 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002141 return(cur);
2142}
2143
2144/**
2145 * htmlNewDoc:
2146 * @URI: URI for the dtd, or NULL
2147 * @ExternalID: the external ID of the DTD, or NULL
2148 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002149 * Creates a new HTML document
2150 *
Owen Taylor3473f882001-02-23 17:55:21 +00002151 * Returns a new document
2152 */
2153htmlDocPtr
2154htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2155 if ((URI == NULL) && (ExternalID == NULL))
2156 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002157 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2158 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002159
2160 return(htmlNewDocNoDtD(URI, ExternalID));
2161}
2162
2163
2164/************************************************************************
2165 * *
2166 * The parser itself *
2167 * Relates to http://www.w3.org/TR/html40 *
2168 * *
2169 ************************************************************************/
2170
2171/************************************************************************
2172 * *
2173 * The parser itself *
2174 * *
2175 ************************************************************************/
2176
/* Forward declaration: htmlParseName() calls this routine before its
 * definition appears further down in the file. */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002178
Owen Taylor3473f882001-02-23 17:55:21 +00002179/**
2180 * htmlParseHTMLName:
2181 * @ctxt: an HTML parser context
2182 *
2183 * parse an HTML tag or attribute name, note that we convert it to lowercase
2184 * since HTML names are not case-sensitive.
2185 *
2186 * Returns the Tag Name parsed or NULL
2187 */
2188
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002189static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002190htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002191 int i = 0;
2192 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2193
William M. Brackd1757ab2004-10-02 22:07:48 +00002194 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002195 (CUR != ':')) return(NULL);
2196
2197 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002198 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002199 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2200 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2201 else loc[i] = CUR;
2202 i++;
2203
2204 NEXT;
2205 }
2206
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002207 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002208}
2209
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002210
2211/**
2212 * htmlParseHTMLName_nonInvasive:
2213 * @ctxt: an HTML parser context
2214 *
2215 * parse an HTML tag or attribute name, note that we convert it to lowercase
2216 * since HTML names are not case-sensitive, this doesn't consume the data
2217 * from the stream, it's a look-ahead
2218 *
2219 * Returns the Tag Name parsed or NULL
2220 */
2221
2222static const xmlChar *
2223htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2224 int i = 0;
2225 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2226
2227 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2228 (NXT(1) != ':')) return(NULL);
2229
2230 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2231 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2232 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2233 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2234 else loc[i] = NXT(1+i);
2235 i++;
2236 }
2237
2238 return(xmlDictLookup(ctxt->dict, loc, i));
2239}
2240
2241
Owen Taylor3473f882001-02-23 17:55:21 +00002242/**
2243 * htmlParseName:
2244 * @ctxt: an HTML parser context
2245 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002246 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002247 *
2248 * Returns the Name parsed or NULL
2249 */
2250
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002251static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002252htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002253 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002254 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002255 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002256
2257 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002258
2259 /*
2260 * Accelerator for simple ASCII names
2261 */
2262 in = ctxt->input->cur;
2263 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2264 ((*in >= 0x41) && (*in <= 0x5A)) ||
2265 (*in == '_') || (*in == ':')) {
2266 in++;
2267 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2268 ((*in >= 0x41) && (*in <= 0x5A)) ||
2269 ((*in >= 0x30) && (*in <= 0x39)) ||
2270 (*in == '_') || (*in == '-') ||
2271 (*in == ':') || (*in == '.'))
2272 in++;
2273 if ((*in > 0) && (*in < 0x80)) {
2274 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002275 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002276 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002277 ctxt->nbChars += count;
2278 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002279 return(ret);
2280 }
2281 }
2282 return(htmlParseNameComplex(ctxt));
2283}
2284
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002285static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002286htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002287 int len = 0, l;
2288 int c;
2289 int count = 0;
2290
2291 /*
2292 * Handler for more complex cases
2293 */
2294 GROW;
2295 c = CUR_CHAR(l);
2296 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2297 (!IS_LETTER(c) && (c != '_') &&
2298 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002299 return(NULL);
2300 }
2301
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002302 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2303 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2304 (c == '.') || (c == '-') ||
2305 (c == '_') || (c == ':') ||
2306 (IS_COMBINING(c)) ||
2307 (IS_EXTENDER(c)))) {
2308 if (count++ > 100) {
2309 count = 0;
2310 GROW;
2311 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002312 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002313 NEXTL(l);
2314 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002315 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002316 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002317}
2318
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002319
Owen Taylor3473f882001-02-23 17:55:21 +00002320/**
2321 * htmlParseHTMLAttribute:
2322 * @ctxt: an HTML parser context
2323 * @stop: a char stop value
2324 *
2325 * parse an HTML attribute value till the stop (quote), if
2326 * stop is 0 then it stops at the first space
2327 *
2328 * Returns the attribute parsed or NULL
2329 */
2330
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002331static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002332htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2333 xmlChar *buffer = NULL;
2334 int buffer_size = 0;
2335 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002336 const xmlChar *name = NULL;
2337 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002338 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002339
2340 /*
2341 * allocate a translation buffer.
2342 */
2343 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002344 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002345 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002346 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002347 return(NULL);
2348 }
2349 out = buffer;
2350
2351 /*
2352 * Ok loop until we reach one of the ending chars
2353 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002354 while ((CUR != 0) && (CUR != stop)) {
2355 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002356 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002357 if (CUR == '&') {
2358 if (NXT(1) == '#') {
2359 unsigned int c;
2360 int bits;
2361
2362 c = htmlParseCharRef(ctxt);
2363 if (c < 0x80)
2364 { *out++ = c; bits= -6; }
2365 else if (c < 0x800)
2366 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2367 else if (c < 0x10000)
2368 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2369 else
2370 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2371
2372 for ( ; bits >= 0; bits-= 6) {
2373 *out++ = ((c >> bits) & 0x3F) | 0x80;
2374 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002375
2376 if (out - buffer > buffer_size - 100) {
2377 int indx = out - buffer;
2378
2379 growBuffer(buffer);
2380 out = &buffer[indx];
2381 }
Owen Taylor3473f882001-02-23 17:55:21 +00002382 } else {
2383 ent = htmlParseEntityRef(ctxt, &name);
2384 if (name == NULL) {
2385 *out++ = '&';
2386 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002387 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002388
2389 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002390 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002391 }
2392 } else if (ent == NULL) {
2393 *out++ = '&';
2394 cur = name;
2395 while (*cur != 0) {
2396 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002397 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002398
2399 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002400 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002401 }
2402 *out++ = *cur++;
2403 }
Owen Taylor3473f882001-02-23 17:55:21 +00002404 } else {
2405 unsigned int c;
2406 int bits;
2407
2408 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002409 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002410
2411 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002412 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002413 }
Daniel Veillard48519092006-10-17 15:56:35 +00002414 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002415 if (c < 0x80)
2416 { *out++ = c; bits= -6; }
2417 else if (c < 0x800)
2418 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2419 else if (c < 0x10000)
2420 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2421 else
2422 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2423
2424 for ( ; bits >= 0; bits-= 6) {
2425 *out++ = ((c >> bits) & 0x3F) | 0x80;
2426 }
Owen Taylor3473f882001-02-23 17:55:21 +00002427 }
2428 }
2429 } else {
2430 unsigned int c;
2431 int bits, l;
2432
2433 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002434 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002435
2436 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002437 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002438 }
2439 c = CUR_CHAR(l);
2440 if (c < 0x80)
2441 { *out++ = c; bits= -6; }
2442 else if (c < 0x800)
2443 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2444 else if (c < 0x10000)
2445 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2446 else
2447 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2448
2449 for ( ; bits >= 0; bits-= 6) {
2450 *out++ = ((c >> bits) & 0x3F) | 0x80;
2451 }
2452 NEXT;
2453 }
2454 }
2455 *out++ = 0;
2456 return(buffer);
2457}
2458
2459/**
Owen Taylor3473f882001-02-23 17:55:21 +00002460 * htmlParseEntityRef:
2461 * @ctxt: an HTML parser context
2462 * @str: location to store the entity name
2463 *
2464 * parse an HTML ENTITY references
2465 *
2466 * [68] EntityRef ::= '&' Name ';'
2467 *
2468 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2469 * if non-NULL *str will have to be freed by the caller.
2470 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002471const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002472htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2473 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002474 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002475
2476 if (str != NULL) *str = NULL;
2477 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002478
2479 if (CUR == '&') {
2480 NEXT;
2481 name = htmlParseName(ctxt);
2482 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002483 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2484 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002485 } else {
2486 GROW;
2487 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002488 if (str != NULL)
2489 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002490
2491 /*
2492 * Lookup the entity in the table.
2493 */
2494 ent = htmlEntityLookup(name);
2495 if (ent != NULL) /* OK that's ugly !!! */
2496 NEXT;
2497 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002498 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2499 "htmlParseEntityRef: expecting ';'\n",
2500 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002501 if (str != NULL)
2502 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002503 }
2504 }
2505 }
2506 return(ent);
2507}
2508
2509/**
2510 * htmlParseAttValue:
2511 * @ctxt: an HTML parser context
2512 *
2513 * parse a value for an attribute
2514 * Note: the parser won't do substitution of entities here, this
2515 * will be handled later in xmlStringGetNodeList, unless it was
2516 * asked for ctxt->replaceEntities != 0
2517 *
2518 * Returns the AttValue parsed or NULL.
2519 */
2520
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002521static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002522htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2523 xmlChar *ret = NULL;
2524
2525 if (CUR == '"') {
2526 NEXT;
2527 ret = htmlParseHTMLAttribute(ctxt, '"');
2528 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002529 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2530 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002531 } else
2532 NEXT;
2533 } else if (CUR == '\'') {
2534 NEXT;
2535 ret = htmlParseHTMLAttribute(ctxt, '\'');
2536 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002537 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2538 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002539 } else
2540 NEXT;
2541 } else {
2542 /*
2543 * That's an HTMLism, the attribute value may not be quoted
2544 */
2545 ret = htmlParseHTMLAttribute(ctxt, 0);
2546 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002547 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2548 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002549 }
2550 }
2551 return(ret);
2552}
2553
2554/**
2555 * htmlParseSystemLiteral:
2556 * @ctxt: an HTML parser context
2557 *
2558 * parse an HTML Literal
2559 *
2560 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2561 *
2562 * Returns the SystemLiteral parsed or NULL
2563 */
2564
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002565static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002566htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2567 const xmlChar *q;
2568 xmlChar *ret = NULL;
2569
2570 if (CUR == '"') {
2571 NEXT;
2572 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002573 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002574 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002575 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002576 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2577 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002578 } else {
2579 ret = xmlStrndup(q, CUR_PTR - q);
2580 NEXT;
2581 }
2582 } else if (CUR == '\'') {
2583 NEXT;
2584 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002585 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002586 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002587 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002588 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2589 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002590 } else {
2591 ret = xmlStrndup(q, CUR_PTR - q);
2592 NEXT;
2593 }
2594 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002595 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2596 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002597 }
2598
2599 return(ret);
2600}
2601
2602/**
2603 * htmlParsePubidLiteral:
2604 * @ctxt: an HTML parser context
2605 *
2606 * parse an HTML public literal
2607 *
2608 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2609 *
2610 * Returns the PubidLiteral parsed or NULL.
2611 */
2612
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002613static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002614htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2615 const xmlChar *q;
2616 xmlChar *ret = NULL;
2617 /*
2618 * Name ::= (Letter | '_') (NameChar)*
2619 */
2620 if (CUR == '"') {
2621 NEXT;
2622 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002623 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002624 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002625 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2626 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002627 } else {
2628 ret = xmlStrndup(q, CUR_PTR - q);
2629 NEXT;
2630 }
2631 } else if (CUR == '\'') {
2632 NEXT;
2633 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002634 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002635 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002636 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002637 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2638 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002639 } else {
2640 ret = xmlStrndup(q, CUR_PTR - q);
2641 NEXT;
2642 }
2643 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002644 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2645 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002646 }
2647
2648 return(ret);
2649}
2650
2651/**
2652 * htmlParseScript:
2653 * @ctxt: an HTML parser context
2654 *
2655 * parse the content of an HTML SCRIPT or STYLE element
2656 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2657 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2658 * http://www.w3.org/TR/html4/types.html#type-script
2659 * http://www.w3.org/TR/html4/types.html#h-6.15
2660 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2661 *
2662 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2663 * element and the value of intrinsic event attributes. User agents must
2664 * not evaluate script data as HTML markup but instead must pass it on as
2665 * data to a script engine.
2666 * NOTES:
2667 * - The content is passed like CDATA
2668 * - the attributes for style and scripting "onXXX" are also described
2669 * as CDATA but SGML allows entities references in attributes so their
2670 * processing is identical as other attributes
2671 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002672static void
Owen Taylor3473f882001-02-23 17:55:21 +00002673htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002674 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002675 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002676 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002677
2678 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002679 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002680 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002681 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2682 (NXT(3) == '-')) {
2683 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2684 if (ctxt->sax->cdataBlock!= NULL) {
2685 /*
2686 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2687 */
2688 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002689 } else if (ctxt->sax->characters != NULL) {
2690 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002691 }
2692 }
2693 nbchar = 0;
2694 htmlParseComment(ctxt);
Daniel Veillard358fef42005-07-13 16:37:38 +00002695 cur = CUR_CHAR(l);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002696 continue;
2697 } else if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002698 /*
2699 * One should break here, the specification is clear:
2700 * Authors should therefore escape "</" within the content.
2701 * Escape mechanisms are specific to each scripting or
2702 * style sheet language.
2703 *
2704 * In recovery mode, only break if end tag match the
2705 * current tag, effectively ignoring all tags inside the
2706 * script/style block and treating the entire block as
2707 * CDATA.
2708 */
2709 if (ctxt->recovery) {
2710 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2711 xmlStrlen(ctxt->name)) == 0)
2712 {
2713 break; /* while */
2714 } else {
2715 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002716 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002717 ctxt->name, NULL);
2718 }
2719 } else {
2720 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2721 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2722 {
2723 break; /* while */
2724 }
2725 }
Owen Taylor3473f882001-02-23 17:55:21 +00002726 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002727 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002728 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2729 if (ctxt->sax->cdataBlock!= NULL) {
2730 /*
2731 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2732 */
2733 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002734 } else if (ctxt->sax->characters != NULL) {
2735 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002736 }
2737 nbchar = 0;
2738 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002739 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002740 NEXTL(l);
2741 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002742 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002743
Daniel Veillard68716a72006-10-16 09:32:17 +00002744 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002745 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2746 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002747 NEXT;
2748 }
2749
2750 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2751 if (ctxt->sax->cdataBlock!= NULL) {
2752 /*
2753 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2754 */
2755 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002756 } else if (ctxt->sax->characters != NULL) {
2757 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002758 }
2759 }
2760}
2761
2762
2763/**
2764 * htmlParseCharData:
2765 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002766 *
2767 * parse a CharData section.
2768 * if we are within a CDATA section ']]>' marks an end of section.
2769 *
2770 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2771 */
2772
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002773static void
2774htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002775 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2776 int nbchar = 0;
2777 int cur, l;
2778
2779 SHRINK;
2780 cur = CUR_CHAR(l);
2781 while (((cur != '<') || (ctxt->token == '<')) &&
2782 ((cur != '&') || (ctxt->token == '&')) &&
2783 (IS_CHAR(cur))) {
2784 COPY_BUF(l,buf,nbchar,cur);
2785 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2786 /*
2787 * Ok the segment is to be consumed as chars.
2788 */
2789 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2790 if (areBlanks(ctxt, buf, nbchar)) {
2791 if (ctxt->sax->ignorableWhitespace != NULL)
2792 ctxt->sax->ignorableWhitespace(ctxt->userData,
2793 buf, nbchar);
2794 } else {
2795 htmlCheckParagraph(ctxt);
2796 if (ctxt->sax->characters != NULL)
2797 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2798 }
2799 }
2800 nbchar = 0;
2801 }
2802 NEXTL(l);
2803 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002804 if (cur == 0) {
2805 SHRINK;
2806 GROW;
2807 cur = CUR_CHAR(l);
2808 }
Owen Taylor3473f882001-02-23 17:55:21 +00002809 }
2810 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002811 buf[nbchar] = 0;
2812
Owen Taylor3473f882001-02-23 17:55:21 +00002813 /*
2814 * Ok the segment is to be consumed as chars.
2815 */
2816 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2817 if (areBlanks(ctxt, buf, nbchar)) {
2818 if (ctxt->sax->ignorableWhitespace != NULL)
2819 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2820 } else {
2821 htmlCheckParagraph(ctxt);
2822 if (ctxt->sax->characters != NULL)
2823 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2824 }
2825 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002826 } else {
2827 /*
2828 * Loop detection
2829 */
2830 if (cur == 0)
2831 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002832 }
2833}
2834
2835/**
2836 * htmlParseExternalID:
2837 * @ctxt: an HTML parser context
2838 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002839 *
2840 * Parse an External ID or a Public ID
2841 *
Owen Taylor3473f882001-02-23 17:55:21 +00002842 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2843 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2844 *
2845 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2846 *
2847 * Returns the function returns SystemLiteral and in the second
2848 * case publicID receives PubidLiteral, is strict is off
2849 * it is possible to return NULL and have publicID set.
2850 */
2851
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002852static xmlChar *
2853htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002854 xmlChar *URI = NULL;
2855
2856 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2857 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2858 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2859 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002860 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002861 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2862 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002863 }
2864 SKIP_BLANKS;
2865 URI = htmlParseSystemLiteral(ctxt);
2866 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002867 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2868 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002869 }
2870 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2871 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2872 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2873 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002874 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002875 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2876 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002877 }
2878 SKIP_BLANKS;
2879 *publicID = htmlParsePubidLiteral(ctxt);
2880 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002881 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2882 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2883 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002884 }
2885 SKIP_BLANKS;
2886 if ((CUR == '"') || (CUR == '\'')) {
2887 URI = htmlParseSystemLiteral(ctxt);
2888 }
2889 }
2890 return(URI);
2891}
2892
2893/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002894 * xmlParsePI:
2895 * @ctxt: an XML parser context
2896 *
2897 * parse an XML Processing Instruction.
2898 *
2899 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2900 */
2901static void
2902htmlParsePI(htmlParserCtxtPtr ctxt) {
2903 xmlChar *buf = NULL;
2904 int len = 0;
2905 int size = HTML_PARSER_BUFFER_SIZE;
2906 int cur, l;
2907 const xmlChar *target;
2908 xmlParserInputState state;
2909 int count = 0;
2910
2911 if ((RAW == '<') && (NXT(1) == '?')) {
2912 state = ctxt->instate;
2913 ctxt->instate = XML_PARSER_PI;
2914 /*
2915 * this is a Processing Instruction.
2916 */
2917 SKIP(2);
2918 SHRINK;
2919
2920 /*
2921 * Parse the target name and check for special support like
2922 * namespace.
2923 */
2924 target = htmlParseName(ctxt);
2925 if (target != NULL) {
2926 if (RAW == '>') {
2927 SKIP(1);
2928
2929 /*
2930 * SAX: PI detected.
2931 */
2932 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2933 (ctxt->sax->processingInstruction != NULL))
2934 ctxt->sax->processingInstruction(ctxt->userData,
2935 target, NULL);
2936 ctxt->instate = state;
2937 return;
2938 }
2939 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2940 if (buf == NULL) {
2941 htmlErrMemory(ctxt, NULL);
2942 ctxt->instate = state;
2943 return;
2944 }
2945 cur = CUR;
2946 if (!IS_BLANK(cur)) {
2947 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2948 "ParsePI: PI %s space expected\n", target, NULL);
2949 }
2950 SKIP_BLANKS;
2951 cur = CUR_CHAR(l);
2952 while (IS_CHAR(cur) && (cur != '>')) {
2953 if (len + 5 >= size) {
2954 xmlChar *tmp;
2955
2956 size *= 2;
2957 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2958 if (tmp == NULL) {
2959 htmlErrMemory(ctxt, NULL);
2960 xmlFree(buf);
2961 ctxt->instate = state;
2962 return;
2963 }
2964 buf = tmp;
2965 }
2966 count++;
2967 if (count > 50) {
2968 GROW;
2969 count = 0;
2970 }
2971 COPY_BUF(l,buf,len,cur);
2972 NEXTL(l);
2973 cur = CUR_CHAR(l);
2974 if (cur == 0) {
2975 SHRINK;
2976 GROW;
2977 cur = CUR_CHAR(l);
2978 }
2979 }
2980 buf[len] = 0;
2981 if (cur != '>') {
2982 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2983 "ParsePI: PI %s never end ...\n", target, NULL);
2984 } else {
2985 SKIP(1);
2986
2987 /*
2988 * SAX: PI detected.
2989 */
2990 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2991 (ctxt->sax->processingInstruction != NULL))
2992 ctxt->sax->processingInstruction(ctxt->userData,
2993 target, buf);
2994 }
2995 xmlFree(buf);
2996 } else {
2997 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2998 "PI is not started correctly", NULL, NULL);
2999 }
3000 ctxt->instate = state;
3001 }
3002}
3003
3004/**
Owen Taylor3473f882001-02-23 17:55:21 +00003005 * htmlParseComment:
3006 * @ctxt: an HTML parser context
3007 *
3008 * Parse an XML (SGML) comment <!-- .... -->
3009 *
3010 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3011 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003012static void
Owen Taylor3473f882001-02-23 17:55:21 +00003013htmlParseComment(htmlParserCtxtPtr ctxt) {
3014 xmlChar *buf = NULL;
3015 int len;
3016 int size = HTML_PARSER_BUFFER_SIZE;
3017 int q, ql;
3018 int r, rl;
3019 int cur, l;
3020 xmlParserInputState state;
3021
3022 /*
3023 * Check that there is a comment right here.
3024 */
3025 if ((RAW != '<') || (NXT(1) != '!') ||
3026 (NXT(2) != '-') || (NXT(3) != '-')) return;
3027
3028 state = ctxt->instate;
3029 ctxt->instate = XML_PARSER_COMMENT;
3030 SHRINK;
3031 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003032 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003033 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003034 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003035 ctxt->instate = state;
3036 return;
3037 }
3038 q = CUR_CHAR(ql);
3039 NEXTL(ql);
3040 r = CUR_CHAR(rl);
3041 NEXTL(rl);
3042 cur = CUR_CHAR(l);
3043 len = 0;
3044 while (IS_CHAR(cur) &&
3045 ((cur != '>') ||
3046 (r != '-') || (q != '-'))) {
3047 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003048 xmlChar *tmp;
3049
Owen Taylor3473f882001-02-23 17:55:21 +00003050 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003051 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3052 if (tmp == NULL) {
3053 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003054 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003055 ctxt->instate = state;
3056 return;
3057 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003058 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003059 }
3060 COPY_BUF(ql,buf,len,q);
3061 q = r;
3062 ql = rl;
3063 r = cur;
3064 rl = l;
3065 NEXTL(l);
3066 cur = CUR_CHAR(l);
3067 if (cur == 0) {
3068 SHRINK;
3069 GROW;
3070 cur = CUR_CHAR(l);
3071 }
3072 }
3073 buf[len] = 0;
3074 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003075 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3076 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003077 xmlFree(buf);
3078 } else {
3079 NEXT;
3080 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3081 (!ctxt->disableSAX))
3082 ctxt->sax->comment(ctxt->userData, buf);
3083 xmlFree(buf);
3084 }
3085 ctxt->instate = state;
3086}
3087
3088/**
3089 * htmlParseCharRef:
3090 * @ctxt: an HTML parser context
3091 *
3092 * parse Reference declarations
3093 *
3094 * [66] CharRef ::= '&#' [0-9]+ ';' |
3095 * '&#x' [0-9a-fA-F]+ ';'
3096 *
3097 * Returns the value parsed (as an int)
3098 */
3099int
3100htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3101 int val = 0;
3102
Daniel Veillarda03e3652004-11-02 18:45:30 +00003103 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3104 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3105 "htmlParseCharRef: context error\n",
3106 NULL, NULL);
3107 return(0);
3108 }
Owen Taylor3473f882001-02-23 17:55:21 +00003109 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003110 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003111 SKIP(3);
3112 while (CUR != ';') {
3113 if ((CUR >= '0') && (CUR <= '9'))
3114 val = val * 16 + (CUR - '0');
3115 else if ((CUR >= 'a') && (CUR <= 'f'))
3116 val = val * 16 + (CUR - 'a') + 10;
3117 else if ((CUR >= 'A') && (CUR <= 'F'))
3118 val = val * 16 + (CUR - 'A') + 10;
3119 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003120 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3121 "htmlParseCharRef: invalid hexadecimal value\n",
3122 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003123 return(0);
3124 }
3125 NEXT;
3126 }
3127 if (CUR == ';')
3128 NEXT;
3129 } else if ((CUR == '&') && (NXT(1) == '#')) {
3130 SKIP(2);
3131 while (CUR != ';') {
3132 if ((CUR >= '0') && (CUR <= '9'))
3133 val = val * 10 + (CUR - '0');
3134 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003135 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3136 "htmlParseCharRef: invalid decimal value\n",
3137 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003138 return(0);
3139 }
3140 NEXT;
3141 }
3142 if (CUR == ';')
3143 NEXT;
3144 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003145 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3146 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003147 }
3148 /*
3149 * Check the value IS_CHAR ...
3150 */
3151 if (IS_CHAR(val)) {
3152 return(val);
3153 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003154 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3155 "htmlParseCharRef: invalid xmlChar value %d\n",
3156 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003157 }
3158 return(0);
3159}
3160
3161
3162/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003163 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003164 * @ctxt: an HTML parser context
3165 *
3166 * parse a DOCTYPE declaration
3167 *
3168 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3169 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3170 */
3171
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003172static void
Owen Taylor3473f882001-02-23 17:55:21 +00003173htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003174 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003175 xmlChar *ExternalID = NULL;
3176 xmlChar *URI = NULL;
3177
3178 /*
3179 * We know that '<!DOCTYPE' has been detected.
3180 */
3181 SKIP(9);
3182
3183 SKIP_BLANKS;
3184
3185 /*
3186 * Parse the DOCTYPE name.
3187 */
3188 name = htmlParseName(ctxt);
3189 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003190 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3191 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3192 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003193 }
3194 /*
3195 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3196 */
3197
3198 SKIP_BLANKS;
3199
3200 /*
3201 * Check for SystemID and ExternalID
3202 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003203 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003204 SKIP_BLANKS;
3205
3206 /*
3207 * We should be at the end of the DOCTYPE declaration.
3208 */
3209 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003210 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3211 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003212 /* We shouldn't try to resynchronize ... */
3213 }
3214 NEXT;
3215
3216 /*
3217 * Create or update the document accordingly to the DOCTYPE
3218 */
3219 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3220 (!ctxt->disableSAX))
3221 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3222
3223 /*
3224 * Cleanup, since we don't use all those identifiers
3225 */
3226 if (URI != NULL) xmlFree(URI);
3227 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003228}
3229
3230/**
3231 * htmlParseAttribute:
3232 * @ctxt: an HTML parser context
3233 * @value: a xmlChar ** used to store the value of the attribute
3234 *
3235 * parse an attribute
3236 *
3237 * [41] Attribute ::= Name Eq AttValue
3238 *
3239 * [25] Eq ::= S? '=' S?
3240 *
3241 * With namespace:
3242 *
3243 * [NS 11] Attribute ::= QName Eq AttValue
3244 *
3245 * Also the case QName == xmlns:??? is handled independently as a namespace
3246 * definition.
3247 *
3248 * Returns the attribute name, and the value in *value.
3249 */
3250
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003251static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003252htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003253 const xmlChar *name;
3254 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003255
3256 *value = NULL;
3257 name = htmlParseHTMLName(ctxt);
3258 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003259 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3260 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003261 return(NULL);
3262 }
3263
3264 /*
3265 * read the value
3266 */
3267 SKIP_BLANKS;
3268 if (CUR == '=') {
3269 NEXT;
3270 SKIP_BLANKS;
3271 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003272 } else if (htmlIsBooleanAttr(name)) {
3273 /*
3274 * assume a minimized attribute
3275 */
3276 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003277 }
3278
3279 *value = val;
3280 return(name);
3281}
3282
3283/**
3284 * htmlCheckEncoding:
3285 * @ctxt: an HTML parser context
3286 * @attvalue: the attribute value
3287 *
3288 * Checks an http-equiv attribute from a Meta tag to detect
3289 * the encoding
3290 * If a new encoding is detected the parser is switched to decode
3291 * it and pass UTF8
3292 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003293static void
Owen Taylor3473f882001-02-23 17:55:21 +00003294htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3295 const xmlChar *encoding;
3296
3297 if ((ctxt == NULL) || (attvalue == NULL))
3298 return;
3299
3300 /* do not change encoding */
3301 if (ctxt->input->encoding != NULL)
3302 return;
3303
3304 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3305 if (encoding != NULL) {
3306 encoding += 8;
3307 } else {
3308 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3309 if (encoding != NULL)
3310 encoding += 9;
3311 }
3312 if (encoding != NULL) {
3313 xmlCharEncoding enc;
3314 xmlCharEncodingHandlerPtr handler;
3315
3316 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3317
3318 if (ctxt->input->encoding != NULL)
3319 xmlFree((xmlChar *) ctxt->input->encoding);
3320 ctxt->input->encoding = xmlStrdup(encoding);
3321
3322 enc = xmlParseCharEncoding((const char *) encoding);
3323 /*
3324 * registered set of known encodings
3325 */
3326 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003327 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3328 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3329 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3330 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3331 (ctxt->input->buf != NULL) &&
3332 (ctxt->input->buf->encoder == NULL)) {
3333 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3334 "htmlCheckEncoding: wrong encoding meta\n",
3335 NULL, NULL);
3336 } else {
3337 xmlSwitchEncoding(ctxt, enc);
3338 }
Owen Taylor3473f882001-02-23 17:55:21 +00003339 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3340 } else {
3341 /*
3342 * fallback for unknown encodings
3343 */
3344 handler = xmlFindCharEncodingHandler((const char *) encoding);
3345 if (handler != NULL) {
3346 xmlSwitchToEncoding(ctxt, handler);
3347 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3348 } else {
3349 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3350 }
3351 }
3352
3353 if ((ctxt->input->buf != NULL) &&
3354 (ctxt->input->buf->encoder != NULL) &&
3355 (ctxt->input->buf->raw != NULL) &&
3356 (ctxt->input->buf->buffer != NULL)) {
3357 int nbchars;
3358 int processed;
3359
3360 /*
3361 * convert as much as possible to the parser reading buffer.
3362 */
3363 processed = ctxt->input->cur - ctxt->input->base;
3364 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3365 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3366 ctxt->input->buf->buffer,
3367 ctxt->input->buf->raw);
3368 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003369 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3370 "htmlCheckEncoding: encoder error\n",
3371 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003372 }
3373 ctxt->input->base =
3374 ctxt->input->cur = ctxt->input->buf->buffer->content;
3375 }
3376 }
3377}
3378
3379/**
3380 * htmlCheckMeta:
3381 * @ctxt: an HTML parser context
3382 * @atts: the attributes values
3383 *
3384 * Checks an attributes from a Meta tag
3385 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003386static void
Owen Taylor3473f882001-02-23 17:55:21 +00003387htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3388 int i;
3389 const xmlChar *att, *value;
3390 int http = 0;
3391 const xmlChar *content = NULL;
3392
3393 if ((ctxt == NULL) || (atts == NULL))
3394 return;
3395
3396 i = 0;
3397 att = atts[i++];
3398 while (att != NULL) {
3399 value = atts[i++];
3400 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3401 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3402 http = 1;
3403 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3404 content = value;
3405 att = atts[i++];
3406 }
3407 if ((http) && (content != NULL))
3408 htmlCheckEncoding(ctxt, content);
3409
3410}
3411
3412/**
3413 * htmlParseStartTag:
3414 * @ctxt: an HTML parser context
3415 *
3416 * parse a start of tag either for rule element or
3417 * EmptyElement. In both case we don't parse the tag closing chars.
3418 *
3419 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3420 *
3421 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3422 *
3423 * With namespace:
3424 *
3425 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3426 *
3427 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3428 *
Daniel Veillard597f1c12005-07-03 23:00:18 +00003429 * Returns 0 in case of success and -1 in case of error.
Owen Taylor3473f882001-02-23 17:55:21 +00003430 */
3431
Daniel Veillard597f1c12005-07-03 23:00:18 +00003432static int
Owen Taylor3473f882001-02-23 17:55:21 +00003433htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003434 const xmlChar *name;
3435 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003436 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003437 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003438 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003439 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003440 int meta = 0;
3441 int i;
3442
Daniel Veillarda03e3652004-11-02 18:45:30 +00003443 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3444 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3445 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003446 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003447 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003448 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003449 NEXT;
3450
Daniel Veillard30e76072006-03-09 14:13:55 +00003451 atts = ctxt->atts;
3452 maxatts = ctxt->maxatts;
3453
Owen Taylor3473f882001-02-23 17:55:21 +00003454 GROW;
3455 name = htmlParseHTMLName(ctxt);
3456 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003457 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3458 "htmlParseStartTag: invalid element name\n",
3459 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003460 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003461 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003462 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003463 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003464 }
3465 if (xmlStrEqual(name, BAD_CAST"meta"))
3466 meta = 1;
3467
3468 /*
3469 * Check for auto-closure of HTML elements.
3470 */
3471 htmlAutoClose(ctxt, name);
3472
3473 /*
3474 * Check for implied HTML elements.
3475 */
3476 htmlCheckImplied(ctxt, name);
3477
3478 /*
3479 * Avoid html at any level > 0, head at any level != 1
3480 * or any attempt to recurse body
3481 */
3482 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003483 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3484 "htmlParseStartTag: misplaced <html> tag\n",
3485 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003486 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003487 }
3488 if ((ctxt->nameNr != 1) &&
3489 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003490 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3491 "htmlParseStartTag: misplaced <head> tag\n",
3492 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003493 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003494 }
3495 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003496 int indx;
3497 for (indx = 0;indx < ctxt->nameNr;indx++) {
3498 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003499 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3500 "htmlParseStartTag: misplaced <body> tag\n",
3501 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003502 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3503 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003504 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003505 }
3506 }
3507 }
3508
3509 /*
3510 * Now parse the attributes, it ends up with the ending
3511 *
3512 * (S Attribute)* S?
3513 */
3514 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003515 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003516 (CUR != '>') &&
3517 ((CUR != '/') || (NXT(1) != '>'))) {
3518 long cons = ctxt->nbChars;
3519
3520 GROW;
3521 attname = htmlParseAttribute(ctxt, &attvalue);
3522 if (attname != NULL) {
3523
3524 /*
3525 * Well formedness requires at most one declaration of an attribute
3526 */
3527 for (i = 0; i < nbatts;i += 2) {
3528 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003529 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3530 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003531 if (attvalue != NULL)
3532 xmlFree(attvalue);
3533 goto failed;
3534 }
3535 }
3536
3537 /*
3538 * Add the pair to atts
3539 */
3540 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003541 maxatts = 22; /* allow for 10 attrs by default */
3542 atts = (const xmlChar **)
3543 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003544 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003545 htmlErrMemory(ctxt, NULL);
3546 if (attvalue != NULL)
3547 xmlFree(attvalue);
3548 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003549 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003550 ctxt->atts = atts;
3551 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003552 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003553 const xmlChar **n;
3554
Owen Taylor3473f882001-02-23 17:55:21 +00003555 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003556 n = (const xmlChar **) xmlRealloc((void *) atts,
3557 maxatts * sizeof(const xmlChar *));
3558 if (n == NULL) {
3559 htmlErrMemory(ctxt, NULL);
3560 if (attvalue != NULL)
3561 xmlFree(attvalue);
3562 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003563 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003564 atts = n;
3565 ctxt->atts = atts;
3566 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003567 }
3568 atts[nbatts++] = attname;
3569 atts[nbatts++] = attvalue;
3570 atts[nbatts] = NULL;
3571 atts[nbatts + 1] = NULL;
3572 }
3573 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003574 if (attvalue != NULL)
3575 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003576 /* Dump the bogus attribute string up to the next blank or
3577 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003578 while ((IS_CHAR_CH(CUR)) &&
3579 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003580 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003581 NEXT;
3582 }
3583
3584failed:
3585 SKIP_BLANKS;
3586 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003587 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3588 "htmlParseStartTag: problem parsing attributes\n",
3589 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003590 break;
3591 }
3592 }
3593
3594 /*
3595 * Handle specific association to the META tag
3596 */
William M. Bracke978ae22007-03-21 06:16:02 +00003597 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003598 htmlCheckMeta(ctxt, atts);
3599
3600 /*
3601 * SAX: Start of Element !
3602 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003603 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003604 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3605 if (nbatts != 0)
3606 ctxt->sax->startElement(ctxt->userData, name, atts);
3607 else
3608 ctxt->sax->startElement(ctxt->userData, name, NULL);
3609 }
Owen Taylor3473f882001-02-23 17:55:21 +00003610
3611 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003612 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003613 if (atts[i] != NULL)
3614 xmlFree((xmlChar *) atts[i]);
3615 }
Owen Taylor3473f882001-02-23 17:55:21 +00003616 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003617
3618 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003619}
3620
3621/**
3622 * htmlParseEndTag:
3623 * @ctxt: an HTML parser context
3624 *
3625 * parse an end of tag
3626 *
3627 * [42] ETag ::= '</' Name S? '>'
3628 *
3629 * With namespace
3630 *
3631 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003632 *
3633 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003634 */
3635
Daniel Veillardf420ac52001-07-04 16:04:09 +00003636static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003637htmlParseEndTag(htmlParserCtxtPtr ctxt)
3638{
3639 const xmlChar *name;
3640 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003641 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003642
3643 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003644 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3645 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003646 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003647 }
3648 SKIP(2);
3649
3650 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003651 if (name == NULL)
3652 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003653
3654 /*
3655 * We should definitely be at the ending "S? '>'" part
3656 */
3657 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003658 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003659 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3660 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003661 if (ctxt->recovery) {
3662 /*
3663 * We're not at the ending > !!
3664 * Error, unless in recover mode where we search forwards
3665 * until we find a >
3666 */
3667 while (CUR != '\0' && CUR != '>') NEXT;
3668 NEXT;
3669 }
Owen Taylor3473f882001-02-23 17:55:21 +00003670 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003671 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003672
3673 /*
3674 * If the name read is not one of the element in the parsing stack
3675 * then return, it's just an error.
3676 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003677 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3678 if (xmlStrEqual(name, ctxt->nameTab[i]))
3679 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003680 }
3681 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003682 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3683 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003684 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003685 }
3686
3687
3688 /*
3689 * Check for auto-closure of HTML elements.
3690 */
3691
3692 htmlAutoCloseOnClose(ctxt, name);
3693
3694 /*
3695 * Well formedness constraints, opening and closing must match.
3696 * With the exception that the autoclose may have popped stuff out
3697 * of the stack.
3698 */
3699 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003700 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003701 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3702 "Opening and ending tag mismatch: %s and %s\n",
3703 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003704 }
3705 }
3706
3707 /*
3708 * SAX: End of Tag
3709 */
3710 oldname = ctxt->name;
3711 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003712 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3713 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003714 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003715 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003716 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003717 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003718 }
3719
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003720 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003721}
3722
3723
3724/**
3725 * htmlParseReference:
3726 * @ctxt: an HTML parser context
3727 *
3728 * parse and handle entity references in content,
3729 * this will end-up in a call to character() since this is either a
3730 * CharRef, or a predefined entity.
3731 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003732static void
Owen Taylor3473f882001-02-23 17:55:21 +00003733htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003734 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003735 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003736 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003737 if (CUR != '&') return;
3738
3739 if (NXT(1) == '#') {
3740 unsigned int c;
3741 int bits, i = 0;
3742
3743 c = htmlParseCharRef(ctxt);
3744 if (c == 0)
3745 return;
3746
3747 if (c < 0x80) { out[i++]= c; bits= -6; }
3748 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3749 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3750 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3751
3752 for ( ; bits >= 0; bits-= 6) {
3753 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3754 }
3755 out[i] = 0;
3756
3757 htmlCheckParagraph(ctxt);
3758 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3759 ctxt->sax->characters(ctxt->userData, out, i);
3760 } else {
3761 ent = htmlParseEntityRef(ctxt, &name);
3762 if (name == NULL) {
3763 htmlCheckParagraph(ctxt);
3764 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3765 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3766 return;
3767 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003768 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003769 htmlCheckParagraph(ctxt);
3770 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3771 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3772 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3773 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3774 }
3775 } else {
3776 unsigned int c;
3777 int bits, i = 0;
3778
3779 c = ent->value;
3780 if (c < 0x80)
3781 { out[i++]= c; bits= -6; }
3782 else if (c < 0x800)
3783 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3784 else if (c < 0x10000)
3785 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3786 else
3787 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3788
3789 for ( ; bits >= 0; bits-= 6) {
3790 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3791 }
3792 out[i] = 0;
3793
3794 htmlCheckParagraph(ctxt);
3795 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3796 ctxt->sax->characters(ctxt->userData, out, i);
3797 }
Owen Taylor3473f882001-02-23 17:55:21 +00003798 }
3799}
3800
3801/**
3802 * htmlParseContent:
3803 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003804 *
3805 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003806 */
3807
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003808static void
Owen Taylor3473f882001-02-23 17:55:21 +00003809htmlParseContent(htmlParserCtxtPtr ctxt) {
3810 xmlChar *currentNode;
3811 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003812 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003813
3814 currentNode = xmlStrdup(ctxt->name);
3815 depth = ctxt->nameNr;
3816 while (1) {
3817 long cons = ctxt->nbChars;
3818
3819 GROW;
3820 /*
3821 * Our tag or one of it's parent or children is ending.
3822 */
3823 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003824 if (htmlParseEndTag(ctxt) &&
3825 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3826 if (currentNode != NULL)
3827 xmlFree(currentNode);
3828 return;
3829 }
3830 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003831 }
3832
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003833 else if ((CUR == '<') &&
3834 ((IS_ASCII_LETTER(NXT(1))) ||
3835 (NXT(1) == '_') || (NXT(1) == ':'))) {
3836 name = htmlParseHTMLName_nonInvasive(ctxt);
3837 if (name == NULL) {
3838 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3839 "htmlParseStartTag: invalid element name\n",
3840 NULL, NULL);
3841 /* Dump the bogus tag like browsers do */
3842 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3843 NEXT;
3844
3845 if (currentNode != NULL)
3846 xmlFree(currentNode);
3847 return;
3848 }
3849
3850 if (ctxt->name != NULL) {
3851 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3852 htmlAutoClose(ctxt, name);
3853 continue;
3854 }
3855 }
3856 }
3857
Owen Taylor3473f882001-02-23 17:55:21 +00003858 /*
3859 * Has this node been popped out during parsing of
3860 * the next element
3861 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003862 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3863 (!xmlStrEqual(currentNode, ctxt->name)))
3864 {
Owen Taylor3473f882001-02-23 17:55:21 +00003865 if (currentNode != NULL) xmlFree(currentNode);
3866 return;
3867 }
3868
Daniel Veillardf9533d12001-03-03 10:04:57 +00003869 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3870 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003871 /*
3872 * Handle SCRIPT/STYLE separately
3873 */
3874 htmlParseScript(ctxt);
3875 } else {
3876 /*
3877 * Sometimes DOCTYPE arrives in the middle of the document
3878 */
3879 if ((CUR == '<') && (NXT(1) == '!') &&
3880 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3881 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3882 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3883 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003884 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3885 "Misplaced DOCTYPE declaration\n",
3886 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003887 htmlParseDocTypeDecl(ctxt);
3888 }
3889
3890 /*
3891 * First case : a comment
3892 */
3893 if ((CUR == '<') && (NXT(1) == '!') &&
3894 (NXT(2) == '-') && (NXT(3) == '-')) {
3895 htmlParseComment(ctxt);
3896 }
3897
3898 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003899 * Second case : a Processing Instruction.
3900 */
3901 else if ((CUR == '<') && (NXT(1) == '?')) {
3902 htmlParsePI(ctxt);
3903 }
3904
3905 /*
3906 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003907 */
3908 else if (CUR == '<') {
3909 htmlParseElement(ctxt);
3910 }
3911
3912 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003913 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003914 * parsing returns it's Name, create the node
3915 */
3916 else if (CUR == '&') {
3917 htmlParseReference(ctxt);
3918 }
3919
3920 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003921 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003922 */
3923 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003924 htmlAutoCloseOnEnd(ctxt);
3925 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003926 }
3927
3928 /*
3929 * Last case, text. Note that References are handled directly.
3930 */
3931 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003932 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003933 }
3934
3935 if (cons == ctxt->nbChars) {
3936 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003937 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3938 "detected an error in element content\n",
3939 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003940 }
3941 break;
3942 }
3943 }
3944 GROW;
3945 }
3946 if (currentNode != NULL) xmlFree(currentNode);
3947}
3948
3949/**
Daniel Veillard499cc922006-01-18 17:22:35 +00003950 * htmlParseContent:
3951 * @ctxt: an HTML parser context
3952 *
3953 * Parse a content: comment, sub-element, reference or text.
3954 */
3955
3956void
3957__htmlParseContent(void *ctxt) {
3958 if (ctxt != NULL)
3959 htmlParseContent((htmlParserCtxtPtr) ctxt);
3960}
3961
3962/**
Owen Taylor3473f882001-02-23 17:55:21 +00003963 * htmlParseElement:
3964 * @ctxt: an HTML parser context
3965 *
3966 * parse an HTML element, this is highly recursive
3967 *
3968 * [39] element ::= EmptyElemTag | STag content ETag
3969 *
3970 * [41] Attribute ::= Name Eq AttValue
3971 */
3972
3973void
3974htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003975 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003976 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003977 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003978 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003979 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003980 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003981 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003982
Daniel Veillarda03e3652004-11-02 18:45:30 +00003983 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3984 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00003985 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003986 return;
3987 }
Owen Taylor3473f882001-02-23 17:55:21 +00003988 /* Capture start position */
3989 if (ctxt->record_info) {
3990 node_info.begin_pos = ctxt->input->consumed +
3991 (CUR_PTR - ctxt->input->base);
3992 node_info.begin_line = ctxt->input->line;
3993 }
3994
Daniel Veillard597f1c12005-07-03 23:00:18 +00003995 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003996 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003997 if (failed || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003998 if (CUR == '>')
3999 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004000 return;
4001 }
Owen Taylor3473f882001-02-23 17:55:21 +00004002
4003 /*
4004 * Lookup the info for that element.
4005 */
4006 info = htmlTagLookup(name);
4007 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004008 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4009 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004010 }
4011
4012 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004013 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004014 */
4015 if ((CUR == '/') && (NXT(1) == '>')) {
4016 SKIP(2);
4017 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4018 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004019 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004020 return;
4021 }
4022
4023 if (CUR == '>') {
4024 NEXT;
4025 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004026 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4027 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004028
4029 /*
4030 * end of parsing of this node.
4031 */
4032 if (xmlStrEqual(name, ctxt->name)) {
4033 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004034 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004035 }
4036
4037 /*
4038 * Capture end position and add node
4039 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004040 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004041 node_info.end_pos = ctxt->input->consumed +
4042 (CUR_PTR - ctxt->input->base);
4043 node_info.end_line = ctxt->input->line;
4044 node_info.node = ctxt->node;
4045 xmlParserAddNodeInfo(ctxt, &node_info);
4046 }
4047 return;
4048 }
4049
4050 /*
4051 * Check for an Empty Element from DTD definition
4052 */
4053 if ((info != NULL) && (info->empty)) {
4054 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4055 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004056 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004057 return;
4058 }
4059
4060 /*
4061 * Parse the content of the element:
4062 */
4063 currentNode = xmlStrdup(ctxt->name);
4064 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004065 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004066 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004067 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004068 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004069 if (ctxt->nameNr < depth) break;
4070 }
4071
Owen Taylor3473f882001-02-23 17:55:21 +00004072 /*
4073 * Capture end position and add node
4074 */
4075 if ( currentNode != NULL && ctxt->record_info ) {
4076 node_info.end_pos = ctxt->input->consumed +
4077 (CUR_PTR - ctxt->input->base);
4078 node_info.end_line = ctxt->input->line;
4079 node_info.node = ctxt->node;
4080 xmlParserAddNodeInfo(ctxt, &node_info);
4081 }
William M. Brack76e95df2003-10-18 16:20:14 +00004082 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004083 htmlAutoCloseOnEnd(ctxt);
4084 }
4085
Owen Taylor3473f882001-02-23 17:55:21 +00004086 if (currentNode != NULL)
4087 xmlFree(currentNode);
4088}
4089
4090/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004091 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004092 * @ctxt: an HTML parser context
4093 *
4094 * parse an HTML document (and build a tree if using the standard SAX
4095 * interface).
4096 *
4097 * Returns 0, -1 in case of error. the parser context is augmented
4098 * as a result of the parsing.
4099 */
4100
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004101int
Owen Taylor3473f882001-02-23 17:55:21 +00004102htmlParseDocument(htmlParserCtxtPtr ctxt) {
4103 xmlDtdPtr dtd;
4104
Daniel Veillardd0463562001-10-13 09:15:48 +00004105 xmlInitParser();
4106
Owen Taylor3473f882001-02-23 17:55:21 +00004107 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004108
Daniel Veillarda03e3652004-11-02 18:45:30 +00004109 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4110 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4111 "htmlParseDocument: context error\n", NULL, NULL);
4112 return(XML_ERR_INTERNAL_ERROR);
4113 }
4114 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004115 GROW;
4116 /*
4117 * SAX: beginning of the document processing.
4118 */
4119 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4120 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4121
4122 /*
4123 * Wipe out everything which is before the first '<'
4124 */
4125 SKIP_BLANKS;
4126 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004127 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4128 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004129 }
4130
4131 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4132 ctxt->sax->startDocument(ctxt->userData);
4133
4134
4135 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004136 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004137 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004138 while (((CUR == '<') && (NXT(1) == '!') &&
4139 (NXT(2) == '-') && (NXT(3) == '-')) ||
4140 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004141 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004142 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004143 SKIP_BLANKS;
4144 }
4145
4146
4147 /*
4148 * Then possibly doc type declaration(s) and more Misc
4149 * (doctypedecl Misc*)?
4150 */
4151 if ((CUR == '<') && (NXT(1) == '!') &&
4152 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4153 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4154 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4155 (UPP(8) == 'E')) {
4156 htmlParseDocTypeDecl(ctxt);
4157 }
4158 SKIP_BLANKS;
4159
4160 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004161 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004162 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004163 while (((CUR == '<') && (NXT(1) == '!') &&
4164 (NXT(2) == '-') && (NXT(3) == '-')) ||
4165 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004166 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004167 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004168 SKIP_BLANKS;
4169 }
4170
4171 /*
4172 * Time to start parsing the tree itself
4173 */
4174 htmlParseContent(ctxt);
4175
4176 /*
4177 * autoclose
4178 */
4179 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004180 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004181
4182
4183 /*
4184 * SAX: end of the document processing.
4185 */
4186 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4187 ctxt->sax->endDocument(ctxt->userData);
4188
4189 if (ctxt->myDoc != NULL) {
4190 dtd = xmlGetIntSubset(ctxt->myDoc);
4191 if (dtd == NULL)
4192 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004193 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004194 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4195 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4196 }
4197 if (! ctxt->wellFormed) return(-1);
4198 return(0);
4199}
4200
4201
4202/************************************************************************
4203 * *
4204 * Parser contexts handling *
4205 * *
4206 ************************************************************************/
4207
4208/**
William M. Brackedb65a72004-02-06 07:36:04 +00004209 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004210 * @ctxt: an HTML parser context
4211 *
4212 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004213 *
4214 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004215 */
4216
Daniel Veillardf403d292003-10-05 13:51:35 +00004217static int
Owen Taylor3473f882001-02-23 17:55:21 +00004218htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4219{
4220 htmlSAXHandler *sax;
4221
Daniel Veillardf403d292003-10-05 13:51:35 +00004222 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004223 memset(ctxt, 0, sizeof(htmlParserCtxt));
4224
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004225 ctxt->dict = xmlDictCreate();
4226 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004227 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4228 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004229 }
Owen Taylor3473f882001-02-23 17:55:21 +00004230 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4231 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004232 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4233 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004234 }
4235 else
4236 memset(sax, 0, sizeof(htmlSAXHandler));
4237
4238 /* Allocate the Input stack */
4239 ctxt->inputTab = (htmlParserInputPtr *)
4240 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4241 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004242 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004243 ctxt->inputNr = 0;
4244 ctxt->inputMax = 0;
4245 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004246 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004247 }
4248 ctxt->inputNr = 0;
4249 ctxt->inputMax = 5;
4250 ctxt->input = NULL;
4251 ctxt->version = NULL;
4252 ctxt->encoding = NULL;
4253 ctxt->standalone = -1;
4254 ctxt->instate = XML_PARSER_START;
4255
4256 /* Allocate the Node stack */
4257 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4258 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004259 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004260 ctxt->nodeNr = 0;
4261 ctxt->nodeMax = 0;
4262 ctxt->node = NULL;
4263 ctxt->inputNr = 0;
4264 ctxt->inputMax = 0;
4265 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004266 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004267 }
4268 ctxt->nodeNr = 0;
4269 ctxt->nodeMax = 10;
4270 ctxt->node = NULL;
4271
4272 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004273 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004274 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004275 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004276 ctxt->nameNr = 0;
4277 ctxt->nameMax = 10;
4278 ctxt->name = NULL;
4279 ctxt->nodeNr = 0;
4280 ctxt->nodeMax = 0;
4281 ctxt->node = NULL;
4282 ctxt->inputNr = 0;
4283 ctxt->inputMax = 0;
4284 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004285 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004286 }
4287 ctxt->nameNr = 0;
4288 ctxt->nameMax = 10;
4289 ctxt->name = NULL;
4290
Daniel Veillard092643b2003-09-25 14:29:29 +00004291 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004292 else {
4293 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004294 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004295 }
4296 ctxt->userData = ctxt;
4297 ctxt->myDoc = NULL;
4298 ctxt->wellFormed = 1;
4299 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004300 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004301 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004302 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004303 ctxt->vctxt.userData = ctxt;
4304 ctxt->vctxt.error = xmlParserValidityError;
4305 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004306 ctxt->record_info = 0;
4307 ctxt->validate = 0;
4308 ctxt->nbChars = 0;
4309 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004310 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004311 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004312 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004313}
4314
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This simply forwards to xmlFreeParserCtxt().
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
4328
4329/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004330 * htmlNewParserCtxt:
4331 *
4332 * Allocate and initialize a new parser context.
4333 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004334 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004335 */
4336
Daniel Veillard34c647c2006-09-21 06:53:59 +00004337htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004338htmlNewParserCtxt(void)
4339{
4340 xmlParserCtxtPtr ctxt;
4341
4342 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4343 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004344 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004345 return(NULL);
4346 }
4347 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004348 if (htmlInitParserCtxt(ctxt) < 0) {
4349 htmlFreeParserCtxt(ctxt);
4350 return(NULL);
4351 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004352 return(ctxt);
4353}
4354
4355/**
4356 * htmlCreateMemoryParserCtxt:
4357 * @buffer: a pointer to a char array
4358 * @size: the size of the array
4359 *
4360 * Create a parser context for an HTML in-memory document.
4361 *
4362 * Returns the new parser context or NULL
4363 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004364htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004365htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4366 xmlParserCtxtPtr ctxt;
4367 xmlParserInputPtr input;
4368 xmlParserInputBufferPtr buf;
4369
4370 if (buffer == NULL)
4371 return(NULL);
4372 if (size <= 0)
4373 return(NULL);
4374
4375 ctxt = htmlNewParserCtxt();
4376 if (ctxt == NULL)
4377 return(NULL);
4378
4379 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4380 if (buf == NULL) return(NULL);
4381
4382 input = xmlNewInputStream(ctxt);
4383 if (input == NULL) {
4384 xmlFreeParserCtxt(ctxt);
4385 return(NULL);
4386 }
4387
4388 input->filename = NULL;
4389 input->buf = buf;
4390 input->base = input->buf->buffer->content;
4391 input->cur = input->buf->buffer->content;
4392 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4393
4394 inputPush(ctxt, input);
4395 return(ctxt);
4396}
4397
4398/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004399 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004400 * @cur: a pointer to an array of xmlChar
4401 * @encoding: a free form C string describing the HTML document encoding, or NULL
4402 *
4403 * Create a parser context for an HTML document.
4404 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004405 * TODO: check the need to add encoding handling there
4406 *
Owen Taylor3473f882001-02-23 17:55:21 +00004407 * Returns the new parser context or NULL
4408 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004409static htmlParserCtxtPtr
Daniel Veillard8a82ae12006-10-17 20:04:10 +00004410htmlCreateDocParserCtxt(const xmlChar *cur,
4411 const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004412 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004413 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004414
Daniel Veillard1d995272002-07-22 16:43:32 +00004415 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004416 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004417 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004418 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4419
4420 if (encoding != NULL) {
4421 xmlCharEncoding enc;
4422 xmlCharEncodingHandlerPtr handler;
4423
4424 if (ctxt->input->encoding != NULL)
4425 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004426 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004427
4428 enc = xmlParseCharEncoding(encoding);
4429 /*
4430 * registered set of known encodings
4431 */
4432 if (enc != XML_CHAR_ENCODING_ERROR) {
4433 xmlSwitchEncoding(ctxt, enc);
4434 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004435 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4436 "Unsupported encoding %s\n",
4437 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004438 }
4439 } else {
4440 /*
4441 * fallback for unknown encodings
4442 */
4443 handler = xmlFindCharEncodingHandler((const char *) encoding);
4444 if (handler != NULL) {
4445 xmlSwitchToEncoding(ctxt, handler);
4446 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004447 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4448 "Unsupported encoding %s\n",
4449 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004450 }
4451 }
4452 }
4453 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004454}
4455
Daniel Veillard73b013f2003-09-30 12:36:01 +00004456#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004457/************************************************************************
4458 * *
4459 * Progressive parsing interfaces *
4460 * *
4461 ************************************************************************/
4462
4463/**
4464 * htmlParseLookupSequence:
4465 * @ctxt: an HTML parser context
4466 * @first: the first char to lookup
4467 * @next: the next char to lookup or zero
4468 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004469 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004470 *
4471 * Try to find if a sequence (first, next, third) or just (first next) or
4472 * (first) is available in the input stream.
4473 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4474 * to avoid rescanning sequences of bytes, it DOES change the state of the
4475 * parser, do not use liberally.
4476 * This is basically similar to xmlParseLookupSequence()
4477 *
4478 * Returns the index to the current parsing point if the full sequence
4479 * is available, -1 otherwise.
4480 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004481static int
Owen Taylor3473f882001-02-23 17:55:21 +00004482htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004483 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004484 int base, len;
4485 htmlParserInputPtr in;
4486 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004487 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004488
4489 in = ctxt->input;
4490 if (in == NULL) return(-1);
4491 base = in->cur - in->base;
4492 if (base < 0) return(-1);
4493 if (ctxt->checkIndex > base)
4494 base = ctxt->checkIndex;
4495 if (in->buf == NULL) {
4496 buf = in->base;
4497 len = in->length;
4498 } else {
4499 buf = in->buf->buffer->content;
4500 len = in->buf->buffer->use;
4501 }
4502 /* take into account the sequence length */
4503 if (third) len -= 2;
4504 else if (next) len --;
4505 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004506 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004507 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4508 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4509 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004510 /* do not increment past <! - some people use <!--> */
4511 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004512 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004513 }
4514 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004515 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004516 return(-1);
4517 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4518 (buf[base + 2] == '>')) {
4519 incomment = 0;
4520 base += 2;
4521 }
4522 continue;
4523 }
Owen Taylor3473f882001-02-23 17:55:21 +00004524 if (buf[base] == first) {
4525 if (third != 0) {
4526 if ((buf[base + 1] != next) ||
4527 (buf[base + 2] != third)) continue;
4528 } else if (next != 0) {
4529 if (buf[base + 1] != next) continue;
4530 }
4531 ctxt->checkIndex = 0;
4532#ifdef DEBUG_PUSH
4533 if (next == 0)
4534 xmlGenericError(xmlGenericErrorContext,
4535 "HPP: lookup '%c' found at %d\n",
4536 first, base);
4537 else if (third == 0)
4538 xmlGenericError(xmlGenericErrorContext,
4539 "HPP: lookup '%c%c' found at %d\n",
4540 first, next, base);
4541 else
4542 xmlGenericError(xmlGenericErrorContext,
4543 "HPP: lookup '%c%c%c' found at %d\n",
4544 first, next, third, base);
4545#endif
4546 return(base - (in->cur - in->base));
4547 }
4548 }
4549 ctxt->checkIndex = base;
4550#ifdef DEBUG_PUSH
4551 if (next == 0)
4552 xmlGenericError(xmlGenericErrorContext,
4553 "HPP: lookup '%c' failed\n", first);
4554 else if (third == 0)
4555 xmlGenericError(xmlGenericErrorContext,
4556 "HPP: lookup '%c%c' failed\n", first, next);
4557 else
4558 xmlGenericError(xmlGenericErrorContext,
4559 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4560#endif
4561 return(-1);
4562}
4563
4564/**
4565 * htmlParseTryOrFinish:
4566 * @ctxt: an HTML parser context
4567 * @terminate: last chunk indicator
4568 *
4569 * Try to progress on parsing
4570 *
4571 * Returns zero if no parsing was possible
4572 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004573static int
Owen Taylor3473f882001-02-23 17:55:21 +00004574htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4575 int ret = 0;
4576 htmlParserInputPtr in;
4577 int avail = 0;
4578 xmlChar cur, next;
4579
4580#ifdef DEBUG_PUSH
4581 switch (ctxt->instate) {
4582 case XML_PARSER_EOF:
4583 xmlGenericError(xmlGenericErrorContext,
4584 "HPP: try EOF\n"); break;
4585 case XML_PARSER_START:
4586 xmlGenericError(xmlGenericErrorContext,
4587 "HPP: try START\n"); break;
4588 case XML_PARSER_MISC:
4589 xmlGenericError(xmlGenericErrorContext,
4590 "HPP: try MISC\n");break;
4591 case XML_PARSER_COMMENT:
4592 xmlGenericError(xmlGenericErrorContext,
4593 "HPP: try COMMENT\n");break;
4594 case XML_PARSER_PROLOG:
4595 xmlGenericError(xmlGenericErrorContext,
4596 "HPP: try PROLOG\n");break;
4597 case XML_PARSER_START_TAG:
4598 xmlGenericError(xmlGenericErrorContext,
4599 "HPP: try START_TAG\n");break;
4600 case XML_PARSER_CONTENT:
4601 xmlGenericError(xmlGenericErrorContext,
4602 "HPP: try CONTENT\n");break;
4603 case XML_PARSER_CDATA_SECTION:
4604 xmlGenericError(xmlGenericErrorContext,
4605 "HPP: try CDATA_SECTION\n");break;
4606 case XML_PARSER_END_TAG:
4607 xmlGenericError(xmlGenericErrorContext,
4608 "HPP: try END_TAG\n");break;
4609 case XML_PARSER_ENTITY_DECL:
4610 xmlGenericError(xmlGenericErrorContext,
4611 "HPP: try ENTITY_DECL\n");break;
4612 case XML_PARSER_ENTITY_VALUE:
4613 xmlGenericError(xmlGenericErrorContext,
4614 "HPP: try ENTITY_VALUE\n");break;
4615 case XML_PARSER_ATTRIBUTE_VALUE:
4616 xmlGenericError(xmlGenericErrorContext,
4617 "HPP: try ATTRIBUTE_VALUE\n");break;
4618 case XML_PARSER_DTD:
4619 xmlGenericError(xmlGenericErrorContext,
4620 "HPP: try DTD\n");break;
4621 case XML_PARSER_EPILOG:
4622 xmlGenericError(xmlGenericErrorContext,
4623 "HPP: try EPILOG\n");break;
4624 case XML_PARSER_PI:
4625 xmlGenericError(xmlGenericErrorContext,
4626 "HPP: try PI\n");break;
4627 case XML_PARSER_SYSTEM_LITERAL:
4628 xmlGenericError(xmlGenericErrorContext,
4629 "HPP: try SYSTEM_LITERAL\n");break;
4630 }
4631#endif
4632
4633 while (1) {
4634
4635 in = ctxt->input;
4636 if (in == NULL) break;
4637 if (in->buf == NULL)
4638 avail = in->length - (in->cur - in->base);
4639 else
4640 avail = in->buf->buffer->use - (in->cur - in->base);
4641 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004642 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004643 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4644 /*
4645 * SAX: end of the document processing.
4646 */
4647 ctxt->instate = XML_PARSER_EOF;
4648 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4649 ctxt->sax->endDocument(ctxt->userData);
4650 }
4651 }
4652 if (avail < 1)
4653 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004654 cur = in->cur[0];
4655 if (cur == 0) {
4656 SKIP(1);
4657 continue;
4658 }
4659
Owen Taylor3473f882001-02-23 17:55:21 +00004660 switch (ctxt->instate) {
4661 case XML_PARSER_EOF:
4662 /*
4663 * Document parsing is done !
4664 */
4665 goto done;
4666 case XML_PARSER_START:
4667 /*
4668 * Very first chars read from the document flow.
4669 */
4670 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004671 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004672 SKIP_BLANKS;
4673 if (in->buf == NULL)
4674 avail = in->length - (in->cur - in->base);
4675 else
4676 avail = in->buf->buffer->use - (in->cur - in->base);
4677 }
4678 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4679 ctxt->sax->setDocumentLocator(ctxt->userData,
4680 &xmlDefaultSAXLocator);
4681 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4682 (!ctxt->disableSAX))
4683 ctxt->sax->startDocument(ctxt->userData);
4684
4685 cur = in->cur[0];
4686 next = in->cur[1];
4687 if ((cur == '<') && (next == '!') &&
4688 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4689 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4690 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4691 (UPP(8) == 'E')) {
4692 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004693 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004694 goto done;
4695#ifdef DEBUG_PUSH
4696 xmlGenericError(xmlGenericErrorContext,
4697 "HPP: Parsing internal subset\n");
4698#endif
4699 htmlParseDocTypeDecl(ctxt);
4700 ctxt->instate = XML_PARSER_PROLOG;
4701#ifdef DEBUG_PUSH
4702 xmlGenericError(xmlGenericErrorContext,
4703 "HPP: entering PROLOG\n");
4704#endif
4705 } else {
4706 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004707#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004708 xmlGenericError(xmlGenericErrorContext,
4709 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004710#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004711 }
Owen Taylor3473f882001-02-23 17:55:21 +00004712 break;
4713 case XML_PARSER_MISC:
4714 SKIP_BLANKS;
4715 if (in->buf == NULL)
4716 avail = in->length - (in->cur - in->base);
4717 else
4718 avail = in->buf->buffer->use - (in->cur - in->base);
4719 if (avail < 2)
4720 goto done;
4721 cur = in->cur[0];
4722 next = in->cur[1];
4723 if ((cur == '<') && (next == '!') &&
4724 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4725 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004726 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004727 goto done;
4728#ifdef DEBUG_PUSH
4729 xmlGenericError(xmlGenericErrorContext,
4730 "HPP: Parsing Comment\n");
4731#endif
4732 htmlParseComment(ctxt);
4733 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004734 } else if ((cur == '<') && (next == '?')) {
4735 if ((!terminate) &&
4736 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4737 goto done;
4738#ifdef DEBUG_PUSH
4739 xmlGenericError(xmlGenericErrorContext,
4740 "HPP: Parsing PI\n");
4741#endif
4742 htmlParsePI(ctxt);
4743 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004744 } else if ((cur == '<') && (next == '!') &&
4745 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4746 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4747 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4748 (UPP(8) == 'E')) {
4749 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004750 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004751 goto done;
4752#ifdef DEBUG_PUSH
4753 xmlGenericError(xmlGenericErrorContext,
4754 "HPP: Parsing internal subset\n");
4755#endif
4756 htmlParseDocTypeDecl(ctxt);
4757 ctxt->instate = XML_PARSER_PROLOG;
4758#ifdef DEBUG_PUSH
4759 xmlGenericError(xmlGenericErrorContext,
4760 "HPP: entering PROLOG\n");
4761#endif
4762 } else if ((cur == '<') && (next == '!') &&
4763 (avail < 9)) {
4764 goto done;
4765 } else {
4766 ctxt->instate = XML_PARSER_START_TAG;
4767#ifdef DEBUG_PUSH
4768 xmlGenericError(xmlGenericErrorContext,
4769 "HPP: entering START_TAG\n");
4770#endif
4771 }
4772 break;
4773 case XML_PARSER_PROLOG:
4774 SKIP_BLANKS;
4775 if (in->buf == NULL)
4776 avail = in->length - (in->cur - in->base);
4777 else
4778 avail = in->buf->buffer->use - (in->cur - in->base);
4779 if (avail < 2)
4780 goto done;
4781 cur = in->cur[0];
4782 next = in->cur[1];
4783 if ((cur == '<') && (next == '!') &&
4784 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4785 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004786 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004787 goto done;
4788#ifdef DEBUG_PUSH
4789 xmlGenericError(xmlGenericErrorContext,
4790 "HPP: Parsing Comment\n");
4791#endif
4792 htmlParseComment(ctxt);
4793 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004794 } else if ((cur == '<') && (next == '?')) {
4795 if ((!terminate) &&
4796 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4797 goto done;
4798#ifdef DEBUG_PUSH
4799 xmlGenericError(xmlGenericErrorContext,
4800 "HPP: Parsing PI\n");
4801#endif
4802 htmlParsePI(ctxt);
4803 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004804 } else if ((cur == '<') && (next == '!') &&
4805 (avail < 4)) {
4806 goto done;
4807 } else {
4808 ctxt->instate = XML_PARSER_START_TAG;
4809#ifdef DEBUG_PUSH
4810 xmlGenericError(xmlGenericErrorContext,
4811 "HPP: entering START_TAG\n");
4812#endif
4813 }
4814 break;
4815 case XML_PARSER_EPILOG:
4816 if (in->buf == NULL)
4817 avail = in->length - (in->cur - in->base);
4818 else
4819 avail = in->buf->buffer->use - (in->cur - in->base);
4820 if (avail < 1)
4821 goto done;
4822 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004823 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004824 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004825 goto done;
4826 }
4827 if (avail < 2)
4828 goto done;
4829 next = in->cur[1];
4830 if ((cur == '<') && (next == '!') &&
4831 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4832 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004833 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004834 goto done;
4835#ifdef DEBUG_PUSH
4836 xmlGenericError(xmlGenericErrorContext,
4837 "HPP: Parsing Comment\n");
4838#endif
4839 htmlParseComment(ctxt);
4840 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004841 } else if ((cur == '<') && (next == '?')) {
4842 if ((!terminate) &&
4843 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4844 goto done;
4845#ifdef DEBUG_PUSH
4846 xmlGenericError(xmlGenericErrorContext,
4847 "HPP: Parsing PI\n");
4848#endif
4849 htmlParsePI(ctxt);
4850 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004851 } else if ((cur == '<') && (next == '!') &&
4852 (avail < 4)) {
4853 goto done;
4854 } else {
4855 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004856 ctxt->wellFormed = 0;
4857 ctxt->instate = XML_PARSER_EOF;
4858#ifdef DEBUG_PUSH
4859 xmlGenericError(xmlGenericErrorContext,
4860 "HPP: entering EOF\n");
4861#endif
4862 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4863 ctxt->sax->endDocument(ctxt->userData);
4864 goto done;
4865 }
4866 break;
4867 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004868 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004869 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00004870 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004871
4872 if (avail < 2)
4873 goto done;
4874 cur = in->cur[0];
4875 if (cur != '<') {
4876 ctxt->instate = XML_PARSER_CONTENT;
4877#ifdef DEBUG_PUSH
4878 xmlGenericError(xmlGenericErrorContext,
4879 "HPP: entering CONTENT\n");
4880#endif
4881 break;
4882 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004883 if (in->cur[1] == '/') {
4884 ctxt->instate = XML_PARSER_END_TAG;
4885 ctxt->checkIndex = 0;
4886#ifdef DEBUG_PUSH
4887 xmlGenericError(xmlGenericErrorContext,
4888 "HPP: entering END_TAG\n");
4889#endif
4890 break;
4891 }
Owen Taylor3473f882001-02-23 17:55:21 +00004892 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004893 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004894 goto done;
4895
Daniel Veillard597f1c12005-07-03 23:00:18 +00004896 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004897 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004898 if (failed ||
Owen Taylor3473f882001-02-23 17:55:21 +00004899 (name == NULL)) {
4900 if (CUR == '>')
4901 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004902 break;
4903 }
Owen Taylor3473f882001-02-23 17:55:21 +00004904
4905 /*
4906 * Lookup the info for that element.
4907 */
4908 info = htmlTagLookup(name);
4909 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004910 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4911 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004912 }
4913
4914 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004915 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004916 */
4917 if ((CUR == '/') && (NXT(1) == '>')) {
4918 SKIP(2);
4919 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4920 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004921 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004922 ctxt->instate = XML_PARSER_CONTENT;
4923#ifdef DEBUG_PUSH
4924 xmlGenericError(xmlGenericErrorContext,
4925 "HPP: entering CONTENT\n");
4926#endif
4927 break;
4928 }
4929
4930 if (CUR == '>') {
4931 NEXT;
4932 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004933 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4934 "Couldn't find end of Start Tag %s\n",
4935 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004936
4937 /*
4938 * end of parsing of this node.
4939 */
4940 if (xmlStrEqual(name, ctxt->name)) {
4941 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004942 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004943 }
4944
4945 ctxt->instate = XML_PARSER_CONTENT;
4946#ifdef DEBUG_PUSH
4947 xmlGenericError(xmlGenericErrorContext,
4948 "HPP: entering CONTENT\n");
4949#endif
4950 break;
4951 }
4952
4953 /*
4954 * Check for an Empty Element from DTD definition
4955 */
4956 if ((info != NULL) && (info->empty)) {
4957 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4958 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004959 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004960 }
4961 ctxt->instate = XML_PARSER_CONTENT;
4962#ifdef DEBUG_PUSH
4963 xmlGenericError(xmlGenericErrorContext,
4964 "HPP: entering CONTENT\n");
4965#endif
4966 break;
4967 }
4968 case XML_PARSER_CONTENT: {
4969 long cons;
4970 /*
4971 * Handle preparsed entities and charRef
4972 */
4973 if (ctxt->token != 0) {
4974 xmlChar chr[2] = { 0 , 0 } ;
4975
4976 chr[0] = (xmlChar) ctxt->token;
4977 htmlCheckParagraph(ctxt);
4978 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4979 ctxt->sax->characters(ctxt->userData, chr, 1);
4980 ctxt->token = 0;
4981 ctxt->checkIndex = 0;
4982 }
4983 if ((avail == 1) && (terminate)) {
4984 cur = in->cur[0];
4985 if ((cur != '<') && (cur != '&')) {
4986 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004987 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004988 if (ctxt->sax->ignorableWhitespace != NULL)
4989 ctxt->sax->ignorableWhitespace(
4990 ctxt->userData, &cur, 1);
4991 } else {
4992 htmlCheckParagraph(ctxt);
4993 if (ctxt->sax->characters != NULL)
4994 ctxt->sax->characters(
4995 ctxt->userData, &cur, 1);
4996 }
4997 }
4998 ctxt->token = 0;
4999 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005000 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005001 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005002 }
Owen Taylor3473f882001-02-23 17:55:21 +00005003 }
5004 if (avail < 2)
5005 goto done;
5006 cur = in->cur[0];
5007 next = in->cur[1];
5008 cons = ctxt->nbChars;
5009 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5010 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5011 /*
5012 * Handle SCRIPT/STYLE separately
5013 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005014 if (!terminate) {
5015 int idx;
5016 xmlChar val;
5017
5018 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5019 if (idx < 0)
5020 goto done;
5021 val = in->cur[idx + 2];
5022 if (val == 0) /* bad cut of input */
5023 goto done;
5024 }
Owen Taylor3473f882001-02-23 17:55:21 +00005025 htmlParseScript(ctxt);
5026 if ((cur == '<') && (next == '/')) {
5027 ctxt->instate = XML_PARSER_END_TAG;
5028 ctxt->checkIndex = 0;
5029#ifdef DEBUG_PUSH
5030 xmlGenericError(xmlGenericErrorContext,
5031 "HPP: entering END_TAG\n");
5032#endif
5033 break;
5034 }
5035 } else {
5036 /*
5037 * Sometimes DOCTYPE arrives in the middle of the document
5038 */
5039 if ((cur == '<') && (next == '!') &&
5040 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5041 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5042 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5043 (UPP(8) == 'E')) {
5044 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005045 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005046 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005047 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5048 "Misplaced DOCTYPE declaration\n",
5049 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005050 htmlParseDocTypeDecl(ctxt);
5051 } else if ((cur == '<') && (next == '!') &&
5052 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5053 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005054 (htmlParseLookupSequence(
5055 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005056 goto done;
5057#ifdef DEBUG_PUSH
5058 xmlGenericError(xmlGenericErrorContext,
5059 "HPP: Parsing Comment\n");
5060#endif
5061 htmlParseComment(ctxt);
5062 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005063 } else if ((cur == '<') && (next == '?')) {
5064 if ((!terminate) &&
5065 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5066 goto done;
5067#ifdef DEBUG_PUSH
5068 xmlGenericError(xmlGenericErrorContext,
5069 "HPP: Parsing PI\n");
5070#endif
5071 htmlParsePI(ctxt);
5072 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005073 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5074 goto done;
5075 } else if ((cur == '<') && (next == '/')) {
5076 ctxt->instate = XML_PARSER_END_TAG;
5077 ctxt->checkIndex = 0;
5078#ifdef DEBUG_PUSH
5079 xmlGenericError(xmlGenericErrorContext,
5080 "HPP: entering END_TAG\n");
5081#endif
5082 break;
5083 } else if (cur == '<') {
5084 ctxt->instate = XML_PARSER_START_TAG;
5085 ctxt->checkIndex = 0;
5086#ifdef DEBUG_PUSH
5087 xmlGenericError(xmlGenericErrorContext,
5088 "HPP: entering START_TAG\n");
5089#endif
5090 break;
5091 } else if (cur == '&') {
5092 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005093 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005094 goto done;
5095#ifdef DEBUG_PUSH
5096 xmlGenericError(xmlGenericErrorContext,
5097 "HPP: Parsing Reference\n");
5098#endif
5099 /* TODO: check generation of subtrees if noent !!! */
5100 htmlParseReference(ctxt);
5101 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005102 /*
5103 * check that the text sequence is complete
5104 * before handing out the data to the parser
5105 * to avoid problems with erroneous end of
5106 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005107 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005108 if ((!terminate) &&
5109 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5110 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005111 ctxt->checkIndex = 0;
5112#ifdef DEBUG_PUSH
5113 xmlGenericError(xmlGenericErrorContext,
5114 "HPP: Parsing char data\n");
5115#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005116 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005117 }
5118 }
5119 if (cons == ctxt->nbChars) {
5120 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005121 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5122 "detected an error in element content\n",
5123 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005124 }
5125 NEXT;
5126 break;
5127 }
5128
5129 break;
5130 }
5131 case XML_PARSER_END_TAG:
5132 if (avail < 2)
5133 goto done;
5134 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005135 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005136 goto done;
5137 htmlParseEndTag(ctxt);
5138 if (ctxt->nameNr == 0) {
5139 ctxt->instate = XML_PARSER_EPILOG;
5140 } else {
5141 ctxt->instate = XML_PARSER_CONTENT;
5142 }
5143 ctxt->checkIndex = 0;
5144#ifdef DEBUG_PUSH
5145 xmlGenericError(xmlGenericErrorContext,
5146 "HPP: entering CONTENT\n");
5147#endif
5148 break;
5149 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005150 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5151 "HPP: internal error, state == CDATA\n",
5152 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005153 ctxt->instate = XML_PARSER_CONTENT;
5154 ctxt->checkIndex = 0;
5155#ifdef DEBUG_PUSH
5156 xmlGenericError(xmlGenericErrorContext,
5157 "HPP: entering CONTENT\n");
5158#endif
5159 break;
5160 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005161 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5162 "HPP: internal error, state == DTD\n",
5163 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005164 ctxt->instate = XML_PARSER_CONTENT;
5165 ctxt->checkIndex = 0;
5166#ifdef DEBUG_PUSH
5167 xmlGenericError(xmlGenericErrorContext,
5168 "HPP: entering CONTENT\n");
5169#endif
5170 break;
5171 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005172 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5173 "HPP: internal error, state == COMMENT\n",
5174 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005175 ctxt->instate = XML_PARSER_CONTENT;
5176 ctxt->checkIndex = 0;
5177#ifdef DEBUG_PUSH
5178 xmlGenericError(xmlGenericErrorContext,
5179 "HPP: entering CONTENT\n");
5180#endif
5181 break;
5182 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005183 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5184 "HPP: internal error, state == PI\n",
5185 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005186 ctxt->instate = XML_PARSER_CONTENT;
5187 ctxt->checkIndex = 0;
5188#ifdef DEBUG_PUSH
5189 xmlGenericError(xmlGenericErrorContext,
5190 "HPP: entering CONTENT\n");
5191#endif
5192 break;
5193 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005194 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5195 "HPP: internal error, state == ENTITY_DECL\n",
5196 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005197 ctxt->instate = XML_PARSER_CONTENT;
5198 ctxt->checkIndex = 0;
5199#ifdef DEBUG_PUSH
5200 xmlGenericError(xmlGenericErrorContext,
5201 "HPP: entering CONTENT\n");
5202#endif
5203 break;
5204 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005205 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5206 "HPP: internal error, state == ENTITY_VALUE\n",
5207 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005208 ctxt->instate = XML_PARSER_CONTENT;
5209 ctxt->checkIndex = 0;
5210#ifdef DEBUG_PUSH
5211 xmlGenericError(xmlGenericErrorContext,
5212 "HPP: entering DTD\n");
5213#endif
5214 break;
5215 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005216 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5217 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5218 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005219 ctxt->instate = XML_PARSER_START_TAG;
5220 ctxt->checkIndex = 0;
5221#ifdef DEBUG_PUSH
5222 xmlGenericError(xmlGenericErrorContext,
5223 "HPP: entering START_TAG\n");
5224#endif
5225 break;
5226 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005227 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5228 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5229 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005230 ctxt->instate = XML_PARSER_CONTENT;
5231 ctxt->checkIndex = 0;
5232#ifdef DEBUG_PUSH
5233 xmlGenericError(xmlGenericErrorContext,
5234 "HPP: entering CONTENT\n");
5235#endif
5236 break;
5237 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005238 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5239 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5240 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005241 ctxt->instate = XML_PARSER_CONTENT;
5242 ctxt->checkIndex = 0;
5243#ifdef DEBUG_PUSH
5244 xmlGenericError(xmlGenericErrorContext,
5245 "HPP: entering CONTENT\n");
5246#endif
5247 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005248 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005249 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5250 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5251 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005252 ctxt->instate = XML_PARSER_CONTENT;
5253 ctxt->checkIndex = 0;
5254#ifdef DEBUG_PUSH
5255 xmlGenericError(xmlGenericErrorContext,
5256 "HPP: entering CONTENT\n");
5257#endif
5258 break;
5259
Owen Taylor3473f882001-02-23 17:55:21 +00005260 }
5261 }
5262done:
5263 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005264 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005265 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5266 /*
5267 * SAX: end of the document processing.
5268 */
5269 ctxt->instate = XML_PARSER_EOF;
5270 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5271 ctxt->sax->endDocument(ctxt->userData);
5272 }
5273 }
5274 if ((ctxt->myDoc != NULL) &&
5275 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5276 (ctxt->instate == XML_PARSER_EPILOG))) {
5277 xmlDtdPtr dtd;
5278 dtd = xmlGetIntSubset(ctxt->myDoc);
5279 if (dtd == NULL)
5280 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005281 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005282 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5283 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5284 }
5285#ifdef DEBUG_PUSH
5286 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5287#endif
5288 return(ret);
5289}
5290
5291/**
Owen Taylor3473f882001-02-23 17:55:21 +00005292 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005293 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005294 * @chunk: an char array
5295 * @size: the size in byte of the chunk
5296 * @terminate: last chunk indicator
5297 *
5298 * Parse a Chunk of memory
5299 *
5300 * Returns zero if no error, the xmlParserErrors otherwise.
5301 */
5302int
5303htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5304 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005305 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5306 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5307 "htmlParseChunk: context error\n", NULL, NULL);
5308 return(XML_ERR_INTERNAL_ERROR);
5309 }
Owen Taylor3473f882001-02-23 17:55:21 +00005310 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5311 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5312 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5313 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005314 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005315
Daniel Veillardd2755a82005-08-07 23:42:39 +00005316 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5317 if (res < 0) {
5318 ctxt->errNo = XML_PARSER_EOF;
5319 ctxt->disableSAX = 1;
5320 return (XML_PARSER_EOF);
5321 }
Owen Taylor3473f882001-02-23 17:55:21 +00005322 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5323 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005324 ctxt->input->end =
5325 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005326#ifdef DEBUG_PUSH
5327 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5328#endif
5329
Daniel Veillard14f752c2003-08-09 11:44:50 +00005330#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005331 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5332 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005333#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005334 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005335 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5336 xmlParserInputBufferPtr in = ctxt->input->buf;
5337 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5338 (in->raw != NULL)) {
5339 int nbchars;
5340
5341 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5342 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005343 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5344 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005345 return(XML_ERR_INVALID_ENCODING);
5346 }
5347 }
5348 }
Owen Taylor3473f882001-02-23 17:55:21 +00005349 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005350 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005351 if (terminate) {
5352 if ((ctxt->instate != XML_PARSER_EOF) &&
5353 (ctxt->instate != XML_PARSER_EPILOG) &&
5354 (ctxt->instate != XML_PARSER_MISC)) {
5355 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005356 ctxt->wellFormed = 0;
5357 }
5358 if (ctxt->instate != XML_PARSER_EOF) {
5359 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5360 ctxt->sax->endDocument(ctxt->userData);
5361 }
5362 ctxt->instate = XML_PARSER_EOF;
5363 }
5364 return((xmlParserErrors) ctxt->errNo);
5365}
5366
5367/************************************************************************
5368 * *
5369 * User entry points *
5370 * *
5371 ************************************************************************/
5372
5373/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005374 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005375 * @sax: a SAX handler
5376 * @user_data: The user data returned on SAX callbacks
5377 * @chunk: a pointer to an array of chars
5378 * @size: number of chars in the array
5379 * @filename: an optional file name or URI
5380 * @enc: an optional encoding
5381 *
5382 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005383 * The value of @filename is used for fetching external entities
5384 * and error/warning reports.
5385 *
5386 * Returns the new parser context or NULL
5387 */
5388htmlParserCtxtPtr
5389htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5390 const char *chunk, int size, const char *filename,
5391 xmlCharEncoding enc) {
5392 htmlParserCtxtPtr ctxt;
5393 htmlParserInputPtr inputStream;
5394 xmlParserInputBufferPtr buf;
5395
Daniel Veillardd0463562001-10-13 09:15:48 +00005396 xmlInitParser();
5397
Owen Taylor3473f882001-02-23 17:55:21 +00005398 buf = xmlAllocParserInputBuffer(enc);
5399 if (buf == NULL) return(NULL);
5400
Daniel Veillardf403d292003-10-05 13:51:35 +00005401 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005402 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005403 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005404 return(NULL);
5405 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005406 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5407 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005408 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005409 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005410 xmlFree(ctxt->sax);
5411 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5412 if (ctxt->sax == NULL) {
5413 xmlFree(buf);
5414 xmlFree(ctxt);
5415 return(NULL);
5416 }
5417 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5418 if (user_data != NULL)
5419 ctxt->userData = user_data;
5420 }
5421 if (filename == NULL) {
5422 ctxt->directory = NULL;
5423 } else {
5424 ctxt->directory = xmlParserGetDirectory(filename);
5425 }
5426
5427 inputStream = htmlNewInputStream(ctxt);
5428 if (inputStream == NULL) {
5429 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005430 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005431 return(NULL);
5432 }
5433
5434 if (filename == NULL)
5435 inputStream->filename = NULL;
5436 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005437 inputStream->filename = (char *)
5438 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005439 inputStream->buf = buf;
5440 inputStream->base = inputStream->buf->buffer->content;
5441 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005442 inputStream->end =
5443 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005444
5445 inputPush(ctxt, inputStream);
5446
5447 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5448 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005449 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5450 int cur = ctxt->input->cur - ctxt->input->base;
5451
Owen Taylor3473f882001-02-23 17:55:21 +00005452 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005453
5454 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5455 ctxt->input->cur = ctxt->input->base + cur;
5456 ctxt->input->end =
5457 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005458#ifdef DEBUG_PUSH
5459 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5460#endif
5461 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005462 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005463
5464 return(ctxt);
5465}
William M. Brack21e4ef22005-01-02 09:53:13 +00005466#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005467
5468/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005469 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005470 * @cur: a pointer to an array of xmlChar
5471 * @encoding: a free form C string describing the HTML document encoding, or NULL
5472 * @sax: the SAX handler block
5473 * @userData: if using SAX, this pointer will be provided on callbacks.
5474 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005475 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5476 * to handle parse events. If sax is NULL, fallback to the default DOM
5477 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005478 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005479 * Returns the resulting document tree unless SAX is NULL or the document is
5480 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005481 */
5482
5483htmlDocPtr
5484htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5485 htmlDocPtr ret;
5486 htmlParserCtxtPtr ctxt;
5487
Daniel Veillardd0463562001-10-13 09:15:48 +00005488 xmlInitParser();
5489
Owen Taylor3473f882001-02-23 17:55:21 +00005490 if (cur == NULL) return(NULL);
5491
5492
5493 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5494 if (ctxt == NULL) return(NULL);
5495 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005496 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005497 ctxt->sax = sax;
5498 ctxt->userData = userData;
5499 }
5500
5501 htmlParseDocument(ctxt);
5502 ret = ctxt->myDoc;
5503 if (sax != NULL) {
5504 ctxt->sax = NULL;
5505 ctxt->userData = NULL;
5506 }
5507 htmlFreeParserCtxt(ctxt);
5508
5509 return(ret);
5510}
5511
5512/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005513 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005514 * @cur: a pointer to an array of xmlChar
5515 * @encoding: a free form C string describing the HTML document encoding, or NULL
5516 *
5517 * parse an HTML in-memory document and build a tree.
5518 *
5519 * Returns the resulting document tree
5520 */
5521
5522htmlDocPtr
5523htmlParseDoc(xmlChar *cur, const char *encoding) {
5524 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5525}
5526
5527
5528/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005529 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005530 * @filename: the filename
5531 * @encoding: a free form C string describing the HTML document encoding, or NULL
5532 *
5533 * Create a parser context for a file content.
5534 * Automatic support for ZLIB/Compress compressed document is provided
5535 * by default if found at compile-time.
5536 *
5537 * Returns the new parser context or NULL
5538 */
5539htmlParserCtxtPtr
5540htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5541{
5542 htmlParserCtxtPtr ctxt;
5543 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005544 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005545 /* htmlCharEncoding enc; */
5546 xmlChar *content, *content_line = (xmlChar *) "charset=";
5547
Daniel Veillarda03e3652004-11-02 18:45:30 +00005548 if (filename == NULL)
5549 return(NULL);
5550
Daniel Veillardf403d292003-10-05 13:51:35 +00005551 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005552 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005553 return(NULL);
5554 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005555 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5556 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005557#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005558 if (xmlDefaultSAXHandler.error != NULL) {
5559 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5560 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005561#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005562 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005563 return(NULL);
5564 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005565
5566 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5567 xmlFree(canonicFilename);
5568 if (inputStream == NULL) {
5569 xmlFreeParserCtxt(ctxt);
5570 return(NULL);
5571 }
Owen Taylor3473f882001-02-23 17:55:21 +00005572
5573 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005574
Owen Taylor3473f882001-02-23 17:55:21 +00005575 /* set encoding */
5576 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005577 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005578 if (content) {
5579 strcpy ((char *)content, (char *)content_line);
5580 strcat ((char *)content, (char *)encoding);
5581 htmlCheckEncoding (ctxt, content);
5582 xmlFree (content);
5583 }
5584 }
5585
5586 return(ctxt);
5587}
5588
5589/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005590 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005591 * @filename: the filename
5592 * @encoding: a free form C string describing the HTML document encoding, or NULL
5593 * @sax: the SAX handler block
5594 * @userData: if using SAX, this pointer will be provided on callbacks.
5595 *
5596 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5597 * compressed document is provided by default if found at compile-time.
5598 * It use the given SAX function block to handle the parsing callback.
5599 * If sax is NULL, fallback to the default DOM tree building routines.
5600 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005601 * Returns the resulting document tree unless SAX is NULL or the document is
5602 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005603 */
5604
5605htmlDocPtr
5606htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5607 void *userData) {
5608 htmlDocPtr ret;
5609 htmlParserCtxtPtr ctxt;
5610 htmlSAXHandlerPtr oldsax = NULL;
5611
Daniel Veillardd0463562001-10-13 09:15:48 +00005612 xmlInitParser();
5613
Owen Taylor3473f882001-02-23 17:55:21 +00005614 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5615 if (ctxt == NULL) return(NULL);
5616 if (sax != NULL) {
5617 oldsax = ctxt->sax;
5618 ctxt->sax = sax;
5619 ctxt->userData = userData;
5620 }
5621
5622 htmlParseDocument(ctxt);
5623
5624 ret = ctxt->myDoc;
5625 if (sax != NULL) {
5626 ctxt->sax = oldsax;
5627 ctxt->userData = NULL;
5628 }
5629 htmlFreeParserCtxt(ctxt);
5630
5631 return(ret);
5632}
5633
5634/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005635 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005636 * @filename: the filename
5637 * @encoding: a free form C string describing the HTML document encoding, or NULL
5638 *
5639 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5640 * compressed document is provided by default if found at compile-time.
5641 *
5642 * Returns the resulting document tree
5643 */
5644
5645htmlDocPtr
5646htmlParseFile(const char *filename, const char *encoding) {
5647 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5648}
5649
5650/**
5651 * htmlHandleOmittedElem:
5652 * @val: int 0 or 1
5653 *
5654 * Set and return the previous value for handling HTML omitted tags.
5655 *
5656 * Returns the last value for 0 for no handling, 1 for auto insertion.
5657 */
5658
5659int
5660htmlHandleOmittedElem(int val) {
5661 int old = htmlOmittedDefaultValue;
5662
5663 htmlOmittedDefaultValue = val;
5664 return(old);
5665}
5666
Daniel Veillard930dfb62003-02-05 10:17:38 +00005667/**
5668 * htmlElementAllowedHere:
5669 * @parent: HTML parent element
5670 * @elt: HTML element
5671 *
5672 * Checks whether an HTML element may be a direct child of a parent element.
5673 * Note - doesn't check for deprecated elements
5674 *
5675 * Returns 1 if allowed; 0 otherwise.
5676 */
5677int
5678htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5679 const char** p ;
5680
5681 if ( ! elt || ! parent || ! parent->subelts )
5682 return 0 ;
5683
5684 for ( p = parent->subelts; *p; ++p )
5685 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5686 return 1 ;
5687
5688 return 0 ;
5689}
5690/**
5691 * htmlElementStatusHere:
5692 * @parent: HTML parent element
5693 * @elt: HTML element
5694 *
5695 * Checks whether an HTML element may be a direct child of a parent element.
5696 * and if so whether it is valid or deprecated.
5697 *
5698 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5699 */
5700htmlStatus
5701htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5702 if ( ! parent || ! elt )
5703 return HTML_INVALID ;
5704 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5705 return HTML_INVALID ;
5706
5707 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5708}
5709/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005710 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005711 * @elt: HTML element
5712 * @attr: HTML attribute
5713 * @legacy: whether to allow deprecated attributes
5714 *
5715 * Checks whether an attribute is valid for an element
5716 * Has full knowledge of Required and Deprecated attributes
5717 *
5718 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5719 */
5720htmlStatus
5721htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5722 const char** p ;
5723
5724 if ( !elt || ! attr )
5725 return HTML_INVALID ;
5726
5727 if ( elt->attrs_req )
5728 for ( p = elt->attrs_req; *p; ++p)
5729 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5730 return HTML_REQUIRED ;
5731
5732 if ( elt->attrs_opt )
5733 for ( p = elt->attrs_opt; *p; ++p)
5734 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5735 return HTML_VALID ;
5736
5737 if ( legacy && elt->attrs_depr )
5738 for ( p = elt->attrs_depr; *p; ++p)
5739 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5740 return HTML_DEPRECATED ;
5741
5742 return HTML_INVALID ;
5743}
5744/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005745 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005746 * @node: an htmlNodePtr in a tree
5747 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005748 * for Element nodes)
5749 *
5750 * Checks whether the tree node is valid. Experimental (the author
5751 * only uses the HTML enhancements in a SAX parser)
5752 *
5753 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5754 * legacy allowed) or htmlElementStatusHere (otherwise).
5755 * for Attribute nodes, a return from htmlAttrAllowed
5756 * for other nodes, HTML_NA (no checks performed)
5757 */
5758htmlStatus
5759htmlNodeStatus(const htmlNodePtr node, int legacy) {
5760 if ( ! node )
5761 return HTML_INVALID ;
5762
5763 switch ( node->type ) {
5764 case XML_ELEMENT_NODE:
5765 return legacy
5766 ? ( htmlElementAllowedHere (
5767 htmlTagLookup(node->parent->name) , node->name
5768 ) ? HTML_VALID : HTML_INVALID )
5769 : htmlElementStatusHere(
5770 htmlTagLookup(node->parent->name) ,
5771 htmlTagLookup(node->name) )
5772 ;
5773 case XML_ATTRIBUTE_NODE:
5774 return htmlAttrAllowed(
5775 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5776 default: return HTML_NA ;
5777 }
5778}
Daniel Veillard9475a352003-09-26 12:47:50 +00005779/************************************************************************
5780 * *
5781 * New set (2.6.0) of simpler and more flexible APIs *
5782 * *
5783 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. @str is evaluated more than once, so it must not have side
 * effects.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (terminated by the caller's semicolon), including
 * inside an unbraced if/else.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
		(xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
		xmlFree((char *)(str));				\
	} while (0)
5795
5796/**
5797 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005798 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005799 *
5800 * Reset a parser context
5801 */
5802void
5803htmlCtxtReset(htmlParserCtxtPtr ctxt)
5804{
5805 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005806 xmlDictPtr dict;
5807
5808 if (ctxt == NULL)
5809 return;
5810
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005811 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005812 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005813
5814 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5815 xmlFreeInputStream(input);
5816 }
5817 ctxt->inputNr = 0;
5818 ctxt->input = NULL;
5819
5820 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005821 if (ctxt->spaceTab != NULL) {
5822 ctxt->spaceTab[0] = -1;
5823 ctxt->space = &ctxt->spaceTab[0];
5824 } else {
5825 ctxt->space = NULL;
5826 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005827
5828
5829 ctxt->nodeNr = 0;
5830 ctxt->node = NULL;
5831
5832 ctxt->nameNr = 0;
5833 ctxt->name = NULL;
5834
5835 DICT_FREE(ctxt->version);
5836 ctxt->version = NULL;
5837 DICT_FREE(ctxt->encoding);
5838 ctxt->encoding = NULL;
5839 DICT_FREE(ctxt->directory);
5840 ctxt->directory = NULL;
5841 DICT_FREE(ctxt->extSubURI);
5842 ctxt->extSubURI = NULL;
5843 DICT_FREE(ctxt->extSubSystem);
5844 ctxt->extSubSystem = NULL;
5845 if (ctxt->myDoc != NULL)
5846 xmlFreeDoc(ctxt->myDoc);
5847 ctxt->myDoc = NULL;
5848
5849 ctxt->standalone = -1;
5850 ctxt->hasExternalSubset = 0;
5851 ctxt->hasPErefs = 0;
5852 ctxt->html = 1;
5853 ctxt->external = 0;
5854 ctxt->instate = XML_PARSER_START;
5855 ctxt->token = 0;
5856
5857 ctxt->wellFormed = 1;
5858 ctxt->nsWellFormed = 1;
5859 ctxt->valid = 1;
5860 ctxt->vctxt.userData = ctxt;
5861 ctxt->vctxt.error = xmlParserValidityError;
5862 ctxt->vctxt.warning = xmlParserValidityWarning;
5863 ctxt->record_info = 0;
5864 ctxt->nbChars = 0;
5865 ctxt->checkIndex = 0;
5866 ctxt->inSubset = 0;
5867 ctxt->errNo = XML_ERR_OK;
5868 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00005869 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00005870 ctxt->catalogs = NULL;
5871 xmlInitNodeInfoSeq(&ctxt->node_seq);
5872
5873 if (ctxt->attsDefault != NULL) {
5874 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5875 ctxt->attsDefault = NULL;
5876 }
5877 if (ctxt->attsSpecial != NULL) {
5878 xmlHashFree(ctxt->attsSpecial, NULL);
5879 ctxt->attsSpecial = NULL;
5880 }
5881}
5882
5883/**
5884 * htmlCtxtUseOptions:
5885 * @ctxt: an HTML parser context
5886 * @options: a combination of htmlParserOption(s)
5887 *
5888 * Applies the options to the parser context
5889 *
5890 * Returns 0 in case of success, the set of unknown or unimplemented options
5891 * in case of error.
5892 */
5893int
5894htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5895{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005896 if (ctxt == NULL)
5897 return(-1);
5898
Daniel Veillard9475a352003-09-26 12:47:50 +00005899 if (options & HTML_PARSE_NOWARNING) {
5900 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005901 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005902 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005903 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005904 }
5905 if (options & HTML_PARSE_NOERROR) {
5906 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005907 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005908 ctxt->sax->fatalError = NULL;
5909 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005910 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005911 }
5912 if (options & HTML_PARSE_PEDANTIC) {
5913 ctxt->pedantic = 1;
5914 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005915 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005916 } else
5917 ctxt->pedantic = 0;
5918 if (options & XML_PARSE_NOBLANKS) {
5919 ctxt->keepBlanks = 0;
5920 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5921 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005922 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005923 } else
5924 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005925 if (options & HTML_PARSE_RECOVER) {
5926 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00005927 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005928 } else
5929 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00005930 if (options & HTML_PARSE_COMPACT) {
5931 ctxt->options |= HTML_PARSE_COMPACT;
5932 options -= HTML_PARSE_COMPACT;
5933 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005934 ctxt->dictNames = 0;
5935 return (options);
5936}
5937
5938/**
5939 * htmlDoRead:
5940 * @ctxt: an HTML parser context
5941 * @URL: the base URL to use for the document
5942 * @encoding: the document encoding, or NULL
5943 * @options: a combination of htmlParserOption(s)
5944 * @reuse: keep the context for reuse
5945 *
5946 * Common front-end for the htmlRead functions
5947 *
5948 * Returns the resulting document tree or NULL
5949 */
5950static htmlDocPtr
5951htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5952 int options, int reuse)
5953{
5954 htmlDocPtr ret;
5955
5956 htmlCtxtUseOptions(ctxt, options);
5957 ctxt->html = 1;
5958 if (encoding != NULL) {
5959 xmlCharEncodingHandlerPtr hdlr;
5960
5961 hdlr = xmlFindCharEncodingHandler(encoding);
5962 if (hdlr != NULL)
5963 xmlSwitchToEncoding(ctxt, hdlr);
5964 }
5965 if ((URL != NULL) && (ctxt->input != NULL) &&
5966 (ctxt->input->filename == NULL))
5967 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5968 htmlParseDocument(ctxt);
5969 ret = ctxt->myDoc;
5970 ctxt->myDoc = NULL;
5971 if (!reuse) {
5972 if ((ctxt->dictNames) &&
5973 (ret != NULL) &&
5974 (ret->dict == ctxt->dict))
5975 ctxt->dict = NULL;
5976 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005977 }
5978 return (ret);
5979}
5980
5981/**
5982 * htmlReadDoc:
5983 * @cur: a pointer to a zero terminated string
5984 * @URL: the base URL to use for the document
5985 * @encoding: the document encoding, or NULL
5986 * @options: a combination of htmlParserOption(s)
5987 *
5988 * parse an XML in-memory document and build a tree.
5989 *
5990 * Returns the resulting document tree
5991 */
5992htmlDocPtr
5993htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5994{
5995 htmlParserCtxtPtr ctxt;
5996
5997 if (cur == NULL)
5998 return (NULL);
5999
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006000 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006001 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006002 if (ctxt == NULL)
6003 return (NULL);
6004 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6005}
6006
6007/**
6008 * htmlReadFile:
6009 * @filename: a file or URL
6010 * @encoding: the document encoding, or NULL
6011 * @options: a combination of htmlParserOption(s)
6012 *
6013 * parse an XML file from the filesystem or the network.
6014 *
6015 * Returns the resulting document tree
6016 */
6017htmlDocPtr
6018htmlReadFile(const char *filename, const char *encoding, int options)
6019{
6020 htmlParserCtxtPtr ctxt;
6021
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006022 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006023 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6024 if (ctxt == NULL)
6025 return (NULL);
6026 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6027}
6028
6029/**
6030 * htmlReadMemory:
6031 * @buffer: a pointer to a char array
6032 * @size: the size of the array
6033 * @URL: the base URL to use for the document
6034 * @encoding: the document encoding, or NULL
6035 * @options: a combination of htmlParserOption(s)
6036 *
6037 * parse an XML in-memory document and build a tree.
6038 *
6039 * Returns the resulting document tree
6040 */
6041htmlDocPtr
6042htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6043{
6044 htmlParserCtxtPtr ctxt;
6045
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006046 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006047 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6048 if (ctxt == NULL)
6049 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006050 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006051 if (ctxt->sax != NULL)
6052 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006053 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6054}
6055
6056/**
6057 * htmlReadFd:
6058 * @fd: an open file descriptor
6059 * @URL: the base URL to use for the document
6060 * @encoding: the document encoding, or NULL
6061 * @options: a combination of htmlParserOption(s)
6062 *
6063 * parse an XML from a file descriptor and build a tree.
6064 *
6065 * Returns the resulting document tree
6066 */
6067htmlDocPtr
6068htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6069{
6070 htmlParserCtxtPtr ctxt;
6071 xmlParserInputBufferPtr input;
6072 xmlParserInputPtr stream;
6073
6074 if (fd < 0)
6075 return (NULL);
6076
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006077 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006078 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6079 if (input == NULL)
6080 return (NULL);
6081 ctxt = xmlNewParserCtxt();
6082 if (ctxt == NULL) {
6083 xmlFreeParserInputBuffer(input);
6084 return (NULL);
6085 }
6086 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6087 if (stream == NULL) {
6088 xmlFreeParserInputBuffer(input);
6089 xmlFreeParserCtxt(ctxt);
6090 return (NULL);
6091 }
6092 inputPush(ctxt, stream);
6093 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6094}
6095
6096/**
6097 * htmlReadIO:
6098 * @ioread: an I/O read function
6099 * @ioclose: an I/O close function
6100 * @ioctx: an I/O handler
6101 * @URL: the base URL to use for the document
6102 * @encoding: the document encoding, or NULL
6103 * @options: a combination of htmlParserOption(s)
6104 *
6105 * parse an HTML document from I/O functions and source and build a tree.
6106 *
6107 * Returns the resulting document tree
6108 */
6109htmlDocPtr
6110htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6111 void *ioctx, const char *URL, const char *encoding, int options)
6112{
6113 htmlParserCtxtPtr ctxt;
6114 xmlParserInputBufferPtr input;
6115 xmlParserInputPtr stream;
6116
6117 if (ioread == NULL)
6118 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006119 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006120
6121 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6122 XML_CHAR_ENCODING_NONE);
6123 if (input == NULL)
6124 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006125 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006126 if (ctxt == NULL) {
6127 xmlFreeParserInputBuffer(input);
6128 return (NULL);
6129 }
6130 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6131 if (stream == NULL) {
6132 xmlFreeParserInputBuffer(input);
6133 xmlFreeParserCtxt(ctxt);
6134 return (NULL);
6135 }
6136 inputPush(ctxt, stream);
6137 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6138}
6139
6140/**
6141 * htmlCtxtReadDoc:
6142 * @ctxt: an HTML parser context
6143 * @cur: a pointer to a zero terminated string
6144 * @URL: the base URL to use for the document
6145 * @encoding: the document encoding, or NULL
6146 * @options: a combination of htmlParserOption(s)
6147 *
6148 * parse an XML in-memory document and build a tree.
6149 * This reuses the existing @ctxt parser context
6150 *
6151 * Returns the resulting document tree
6152 */
6153htmlDocPtr
6154htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6155 const char *URL, const char *encoding, int options)
6156{
6157 xmlParserInputPtr stream;
6158
6159 if (cur == NULL)
6160 return (NULL);
6161 if (ctxt == NULL)
6162 return (NULL);
6163
6164 htmlCtxtReset(ctxt);
6165
6166 stream = xmlNewStringInputStream(ctxt, cur);
6167 if (stream == NULL) {
6168 return (NULL);
6169 }
6170 inputPush(ctxt, stream);
6171 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6172}
6173
6174/**
6175 * htmlCtxtReadFile:
6176 * @ctxt: an HTML parser context
6177 * @filename: a file or URL
6178 * @encoding: the document encoding, or NULL
6179 * @options: a combination of htmlParserOption(s)
6180 *
6181 * parse an XML file from the filesystem or the network.
6182 * This reuses the existing @ctxt parser context
6183 *
6184 * Returns the resulting document tree
6185 */
6186htmlDocPtr
6187htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6188 const char *encoding, int options)
6189{
6190 xmlParserInputPtr stream;
6191
6192 if (filename == NULL)
6193 return (NULL);
6194 if (ctxt == NULL)
6195 return (NULL);
6196
6197 htmlCtxtReset(ctxt);
6198
Daniel Veillard29614c72004-11-26 10:47:26 +00006199 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006200 if (stream == NULL) {
6201 return (NULL);
6202 }
6203 inputPush(ctxt, stream);
6204 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6205}
6206
6207/**
6208 * htmlCtxtReadMemory:
6209 * @ctxt: an HTML parser context
6210 * @buffer: a pointer to a char array
6211 * @size: the size of the array
6212 * @URL: the base URL to use for the document
6213 * @encoding: the document encoding, or NULL
6214 * @options: a combination of htmlParserOption(s)
6215 *
6216 * parse an XML in-memory document and build a tree.
6217 * This reuses the existing @ctxt parser context
6218 *
6219 * Returns the resulting document tree
6220 */
6221htmlDocPtr
6222htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6223 const char *URL, const char *encoding, int options)
6224{
6225 xmlParserInputBufferPtr input;
6226 xmlParserInputPtr stream;
6227
6228 if (ctxt == NULL)
6229 return (NULL);
6230 if (buffer == NULL)
6231 return (NULL);
6232
6233 htmlCtxtReset(ctxt);
6234
6235 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6236 if (input == NULL) {
6237 return(NULL);
6238 }
6239
6240 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6241 if (stream == NULL) {
6242 xmlFreeParserInputBuffer(input);
6243 return(NULL);
6244 }
6245
6246 inputPush(ctxt, stream);
6247 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6248}
6249
6250/**
6251 * htmlCtxtReadFd:
6252 * @ctxt: an HTML parser context
6253 * @fd: an open file descriptor
6254 * @URL: the base URL to use for the document
6255 * @encoding: the document encoding, or NULL
6256 * @options: a combination of htmlParserOption(s)
6257 *
6258 * parse an XML from a file descriptor and build a tree.
6259 * This reuses the existing @ctxt parser context
6260 *
6261 * Returns the resulting document tree
6262 */
6263htmlDocPtr
6264htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6265 const char *URL, const char *encoding, int options)
6266{
6267 xmlParserInputBufferPtr input;
6268 xmlParserInputPtr stream;
6269
6270 if (fd < 0)
6271 return (NULL);
6272 if (ctxt == NULL)
6273 return (NULL);
6274
6275 htmlCtxtReset(ctxt);
6276
6277
6278 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6279 if (input == NULL)
6280 return (NULL);
6281 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6282 if (stream == NULL) {
6283 xmlFreeParserInputBuffer(input);
6284 return (NULL);
6285 }
6286 inputPush(ctxt, stream);
6287 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6288}
6289
6290/**
6291 * htmlCtxtReadIO:
6292 * @ctxt: an HTML parser context
6293 * @ioread: an I/O read function
6294 * @ioclose: an I/O close function
6295 * @ioctx: an I/O handler
6296 * @URL: the base URL to use for the document
6297 * @encoding: the document encoding, or NULL
6298 * @options: a combination of htmlParserOption(s)
6299 *
6300 * parse an HTML document from I/O functions and source and build a tree.
6301 * This reuses the existing @ctxt parser context
6302 *
6303 * Returns the resulting document tree
6304 */
6305htmlDocPtr
6306htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6307 xmlInputCloseCallback ioclose, void *ioctx,
6308 const char *URL,
6309 const char *encoding, int options)
6310{
6311 xmlParserInputBufferPtr input;
6312 xmlParserInputPtr stream;
6313
6314 if (ioread == NULL)
6315 return (NULL);
6316 if (ctxt == NULL)
6317 return (NULL);
6318
6319 htmlCtxtReset(ctxt);
6320
6321 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6322 XML_CHAR_ENCODING_NONE);
6323 if (input == NULL)
6324 return (NULL);
6325 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6326 if (stream == NULL) {
6327 xmlFreeParserInputBuffer(input);
6328 return (NULL);
6329 }
6330 inputPush(ctxt, stream);
6331 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6332}
6333
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006334#define bottom_HTMLparser
6335#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006336#endif /* LIBXML_HTML_ENABLED */