blob: 6b8b5624a69ec7940f69d1686858db30e7013de7 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size constants for the HTML parser. Their users live outside this
 * part of the file.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to enable debugging output. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * Module-wide flag, initialised to 1.
 * NOTE(review): presumably controls whether omitted default elements are
 * inserted automatically -- confirm against the code reading it.
 */
static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
208/*
209 * Macros for accessing the content. Those should be used only by the parser,
210 * and not exported.
211 *
212 * Dirty macros, i.e. one need to make assumption on the context to use them
213 *
214 * CUR_PTR return the current pointer to the xmlChar to be parsed.
215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217 * in UNICODE mode. This should be used internally by the parser
218 * only to compare to ASCII values otherwise it would break when
219 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
221 * to compare on ASCII based substring.
222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223 * it should be used only to compare on ASCII based substring.
224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000225 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000226 *
227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
228 *
229 * CURRENT Returns the current char value, with the full decoding of
230 * UTF-8 if we are using this mode. It returns an int.
231 * NEXT Skip to the next character, this does the proper decoding
232 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000233 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
235 */
236
/*
 * Raw accessors. All of them expect a variable named "ctxt" in scope and
 * work on single xmlChar units, so they are only meant for comparisons
 * against ASCII values.
 */
#define CUR_PTR ctxt->input->cur
#define CUR ((int) (*ctxt->input->cur))
#define CURRENT ((int) (*ctxt->input->cur))
#define NXT(val) ctxt->input->cur[(val)]

/* Same as CUR, but yields -1 while a token is pending in ctxt->token. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/* Uppercased variants of the current and the n'th next xmlChar. */
#define UPPER (toupper(*ctxt->input->cur))
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/*
 * Advance over val xmlChars, keeping nbChars and the column in sync.
 * The line counter is not touched, so the skipped text must not contain
 * a newline.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/*
 * Advance over one character made of l xmlChars: updates line/col based
 * on the character being left, clears the pending token and counts one
 * character in nbChars.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/* Imported from XML */
#define NEXT xmlNextChar(ctxt)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/*
 * Input buffer management. Both expand to a bare "if" statement and are
 * meant to be used as a statement of their own, followed by a semicolon.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Decode the current character; l receives its length in xmlChars. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v, of encoded length l, to buffer b at index i and
 * advance i. Expands to a bare if/else: use it as a statement of its own.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Owen Taylor3473f882001-02-23 17:55:21 +0000405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000410 }
411
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
413 *len = 1;
414 return((int) *ctxt->input->cur);
415}
416
417/**
Owen Taylor3473f882001-02-23 17:55:21 +0000418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000426static int
Owen Taylor3473f882001-02-23 17:55:21 +0000427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428 int res = 0;
429
William M. Brack76e95df2003-10-18 16:20:14 +0000430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433 xmlPopInput(ctxt);
434 } else {
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
438 ctxt->input->cur++;
439 ctxt->nbChars++;
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442 }
443 res++;
444 }
445 return(res);
446}
447
448
449
450/************************************************************************
451 * *
452 * The list of HTML elements and their properties *
453 * *
454 ************************************************************************/
455
456/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
459 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000460 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000461 * Depr: this element is deprecated
462 * DTD: 1 means that this element is valid only in the Loose DTD
463 * 2 means that this element is valid only in the Frameset DTD
464 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000465 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000466 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000467 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000468
469/* Definitions and a couple of vars for HTML Elements */
470
/*
 * Element groups. Each group macro expands to a comma separated list of
 * element names, and the matching NB_ macro gives the number of names in
 * that list.
 *
 * Groups built from other groups must separate them with commas:
 * without them the last name of one group and the first name of the next
 * are adjacent string literals, which the compiler concatenates into a
 * single bogus name.
 */
#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 15
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4

/* PCDATA expands to nothing, so no comma follows it. */
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
501#define html_cdata html_pcdata
502
503
504/* ... and for HTML Attributes */
505
/*
 * Attribute groups. Each group macro expands to a comma separated list of
 * attribute names, and the matching NB_ macro gives the number of names
 * in that list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists built from the groups above */
static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;
523
524
525/* Other declarations that should go inline ... */
526static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528 "tabindex", "onfocus", "onblur", NULL } ;
529static const char* target_attr[] = { "target", NULL } ;
530static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* alt_attr[] = { "alt", NULL } ;
532static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* href_attrs[] = { "href", NULL } ;
534static const char* clear_attrs[] = { "clear", NULL } ;
535static const char* inline_p[] = { INLINE, "p", NULL } ;
536static const char* flow_param[] = { FLOW, "param", NULL } ;
537static const char* applet_attrs[] = { COREATTRS , "codebase",
538 "archive", "alt", "name", "height", "width", "align",
539 "hspace", "vspace", NULL } ;
540static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
541 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
542static const char* basefont_attrs[] =
543 { "id", "size", "color", "face", NULL } ;
544static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
545static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
546static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
547static const char* body_depr[] = { "background", "bgcolor", "text",
548 "link", "vlink", "alink", NULL } ;
549static const char* button_attrs[] = { ATTRS, "name", "value", "type",
550 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
551
552
553static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
554static const char* col_elt[] = { "col", NULL } ;
555static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
556static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
557static const char* dl_contents[] = { "dt", "dd", NULL } ;
558static const char* compact_attr[] = { "compact", NULL } ;
559static const char* label_attr[] = { "label", NULL } ;
560static const char* fieldset_contents[] = { FLOW, "legend" } ;
561static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
562static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
563static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
564static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
565static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
566static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
567static const char* head_attrs[] = { I18N, "profile", NULL } ;
568static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
569static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
570static const char* version_attr[] = { "version", NULL } ;
571static const char* html_content[] = { "head", "body", "frameset", NULL } ;
572static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
573static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
574static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
575static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
576static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
577static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
578static const char* align_attr[] = { "align", NULL } ;
579static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
580static const char* map_contents[] = { BLOCK, "area", NULL } ;
581static const char* name_attr[] = { "name", NULL } ;
582static const char* action_attr[] = { "action", NULL } ;
583static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
584static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
585static const char* content_attr[] = { "content", NULL } ;
586static const char* type_attr[] = { "type", NULL } ;
587static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
588static const char* object_contents[] = { FLOW, "param", NULL } ;
589static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
590static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
591static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
592static const char* option_elt[] = { "option", NULL } ;
593static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
594static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
595static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
596static const char* width_attr[] = { "width", NULL } ;
597static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
598static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
599static const char* language_attr[] = { "language", NULL } ;
600static const char* select_content[] = { "optgroup", "option", NULL } ;
601static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
602static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
603static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
604static const char* table_depr[] = { "align", "bgcolor", NULL } ;
605static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
606static const char* tr_elt[] = { "tr", NULL } ;
607static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
608static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
609static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
610static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
611static const char* tr_contents[] = { "th", "td", NULL } ;
612static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
613static const char* li_elt[] = { "li", NULL } ;
614static const char* ul_depr[] = { "type", "compact", NULL} ;
615static const char* dir_attr[] = { "dir", NULL} ;
616
617#define DECL (const char**)
618
Daniel Veillard22090732001-07-16 00:06:07 +0000619static const htmlElemDesc
620html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000621{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
622 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
623},
624{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
625 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
626},
627{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
628 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
629},
630{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
631 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
632},
633{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
634 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
635},
636{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
637 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
638},
639{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
640 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
641},
642{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
643 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
644},
645{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
646 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
647},
648{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
649 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
650},
651{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
652 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
653},
654{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
655 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
656},
657{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
658 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
659},
660{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
661 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
662},
663{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
664 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
665},
666{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
667 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
668},
669{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
670 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
671},
672{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
676 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
677},
678{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
679 EMPTY , NULL , DECL col_attrs , NULL, NULL
680},
681{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
682 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
683},
684{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
685 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
686},
687{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
688 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
689},
690{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
691 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
692},
693{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
694 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
695},
696{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
697 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
698},
699{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
700 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
701},
702{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
703 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
704},
705{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
706 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
707},
708{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
709 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
710},
711{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
712 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
713},
714{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
715 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
716},
717{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
718 EMPTY, NULL, NULL, DECL frame_attrs, NULL
719},
720{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
721 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
722},
723{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
724 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
725},
726{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
727 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
728},
729{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
730 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
731},
732{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
742 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
743},
744{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
745 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
746},
747{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
748 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
749},
750{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
751 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
752},
753{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
754 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
755},
756{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
757 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
758},
759{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
760 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
761},
762{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
763 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
764},
765{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
766 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
767},
768{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
769 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
770},
771{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
772 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
773},
774{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
775 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
776},
777{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
778 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
779},
780{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
781 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
782},
783{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
784 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
785},
786{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
787 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
788},
789{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
790 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
791},
792{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
793 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
794},
795{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
796 DECL html_flow, "div", DECL html_attrs, NULL, NULL
797},
798{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
799 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
800},
801{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
802 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
803},
804{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
805 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
806},
807{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
808 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
809},
810{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
814 EMPTY, NULL, DECL param_attrs, NULL, name_attr
815},
816{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
817 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
818},
819{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
820 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
821},
822{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
823 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
824},
825{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
826 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
827},
828{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
829 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
830},
831{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
832 DECL select_content, NULL, DECL select_attrs, NULL, NULL
833},
834{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
838 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
839},
840{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
841 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
842},
843{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
847 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
848},
849{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "table", 0, 0, 0, 0, 0, 0, 0, "",
856 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
857},
858{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
859 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
860},
861{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
862 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
863},
864{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
865 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
866},
867{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
874 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
875},
876{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
877 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
878},
879{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
880 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
881},
882{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
883 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
884},
885{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
886 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
887},
888{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
889 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
890},
891{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893}
Owen Taylor3473f882001-02-23 17:55:21 +0000894};
895
/*
 * Start tags that imply the end of the current element.
 *
 * Layout: a sequence of NULL-terminated groups.  The first string of a
 * group is a start tag; the strings after it are the open elements that
 * this start tag closes.  A lone NULL ends the whole table.
 * htmlInitAutoClose() indexes the first string of every group.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL
};
956
/*
 * NULL-terminated list of HTML elements which are not supposed to have
 * CDATA content directly, and inside which a p element will be implied
 * (see htmlCheckParagraph()).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
970
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The table has no NULL terminator: htmlIsScriptAttribute() derives the
 * number of entries with sizeof.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
996
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* One row of the end-tag priority table: element name and its priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The row with a NULL name terminates the table; its priority is the
 * value htmlGetEndPriority() returns for any element not listed here.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001024
/*
 * Index into htmlStartClose: each used slot points at the first string
 * (the start tag) of one group; unused slots are NULL.  Filled once by
 * htmlInitAutoClose(), which sets the flag below when done.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1027
1028/************************************************************************
1029 * *
1030 * functions to handle HTML specific data *
1031 * *
1032 ************************************************************************/
1033
1034/**
1035 * htmlInitAutoClose:
1036 *
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1040 */
1041void
1042htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001043 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001044
1045 if (htmlStartCloseIndexinitialized) return;
1046
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048 indx = 0;
1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1050 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001051 while (htmlStartClose[i] != NULL) i++;
1052 i++;
1053 }
1054 htmlStartCloseIndexinitialized = 1;
1055}
1056
1057/**
1058 * htmlTagLookup:
1059 * @tag: The tag name in lowercase
1060 *
1061 * Lookup the HTML tag in the ElementTable
1062 *
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1064 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001065const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001066htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001067 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001068
1069 for (i = 0; i < (sizeof(html40ElementTable) /
1070 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001072 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001073 }
1074 return(NULL);
1075}
1076
1077/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1080 *
1081 * Return value: The "endtag" priority.
1082 **/
1083static int
1084htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001087 while ((htmlEndPriority[i].name != NULL) &&
1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001090
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001092}
1093
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001094
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001095/**
Owen Taylor3473f882001-02-23 17:55:21 +00001096 * htmlCheckAutoClose:
1097 * @newtag: The new tag name
1098 * @oldtag: The old tag name
1099 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001100 * Checks whether the new tag is one of the registered valid tags for
1101 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103 *
1104 * Returns 0 if no, 1 if yes.
1105 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001106static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001109 int i, indx;
1110 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001111
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 if (htmlStartCloseIndexinitialized == 0)
1113 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001114
1115 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 if (closed == NULL)
1119 return (0);
1120 if (xmlStrEqual(BAD_CAST * closed, newtag))
1121 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001122 }
1123
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001124 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001125 i++;
1126 while (htmlStartClose[i] != NULL) {
1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 return (1);
1129 }
1130 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001131 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001132 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001133}
1134
1135/**
1136 * htmlAutoCloseOnClose:
1137 * @ctxt: an HTML parser context
1138 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001139 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001140 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001141 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001142 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001143static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145{
1146 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001147 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001149 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001150
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001151 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001152
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001153 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154 break;
1155 /*
1156 * A missplaced endtag can only close elements with lower
1157 * or equal priority, so if we find an element with higher
1158 * priority before we find an element with
1159 * matching name, we just ignore this endtag
1160 */
1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001163 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164 if (i < 0)
1165 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001166
1167 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001168 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001169 if ((info != NULL) && (info->endTag == 3)) {
1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001172 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001173 }
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001176 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
1178}
1179
1180/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001181 * htmlAutoCloseOnEnd:
1182 * @ctxt: an HTML parser context
1183 *
1184 * Close all remaining tags at the end of the stream
1185 */
1186static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001190
William M. Brack899e64a2003-09-26 18:03:42 +00001191 if (ctxt->nameNr == 0)
1192 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 }
1198}
1199
1200/**
Owen Taylor3473f882001-02-23 17:55:21 +00001201 * htmlAutoClose:
1202 * @ctxt: an HTML parser context
1203 * @newtag: The new tag name or NULL
1204 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001205 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriates closes if possible/needed.
1209 * If newtag is NULL this mean we are at the end of the resource
1210 * and we should check
1211 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001212static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001216 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001220 }
1221 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001222 htmlAutoCloseOnEnd(ctxt);
1223 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001231 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001232 }
Owen Taylor3473f882001-02-23 17:55:21 +00001233}
1234
1235/**
1236 * htmlAutoCloseTag:
1237 * @doc: the HTML document
1238 * @name: The tag name
1239 * @elem: the HTML element
1240 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001241 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of it's children would autoclose the
1244 * given tag.
1245 *
1246 * Returns 1 if autoclose, 0 otherwise
1247 */
1248int
1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 if (xmlStrEqual(name, elem->name)) return(0);
1254 if (htmlCheckAutoClose(elem->name, name)) return(1);
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, name, child)) return(1);
1258 child = child->next;
1259 }
1260 return(0);
1261}
1262
1263/**
1264 * htmlIsAutoClosed:
1265 * @doc: the HTML document
1266 * @elem: the HTML element
1267 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001268 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of it's child
1271 *
1272 * Returns 1 if autoclosed, 0 otherwise
1273 */
1274int
1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276 htmlNodePtr child;
1277
1278 if (elem == NULL) return(1);
1279 child = elem->children;
1280 while (child != NULL) {
1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282 child = child->next;
1283 }
1284 return(0);
1285}
1286
1287/**
1288 * htmlCheckImplied:
1289 * @ctxt: an HTML parser context
1290 * @newtag: The new tag name
1291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001293 * called when a new tag has been detected and generates the
1294 * appropriates implicit tags if missing
1295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296static void
Owen Taylor3473f882001-02-23 17:55:21 +00001297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298 if (!htmlOmittedDefaultValue)
1299 return;
1300 if (xmlStrEqual(newtag, BAD_CAST"html"))
1301 return;
1302 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001303 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306 }
1307 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308 return;
1309 if ((ctxt->nameNr <= 1) &&
1310 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316 /*
1317 * dropped OBJECT ... i you put it first BODY will be
1318 * assumed !
1319 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326 int i;
1327 for (i = 0;i < ctxt->nameNr;i++) {
1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329 return;
1330 }
1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332 return;
1333 }
1334 }
1335
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001336 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339 }
1340}
1341
1342/**
1343 * htmlCheckParagraph
1344 * @ctxt: an HTML parser context
1345 *
1346 * Check whether a p element need to be implied before inserting
1347 * characters in the current element.
1348 *
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350 * in case of error.
1351 */
1352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001353static int
Owen Taylor3473f882001-02-23 17:55:21 +00001354htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355 const xmlChar *tag;
1356 int i;
1357
1358 if (ctxt == NULL)
1359 return(-1);
1360 tag = ctxt->name;
1361 if (tag == NULL) {
1362 htmlAutoClose(ctxt, BAD_CAST"p");
1363 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367 return(1);
1368 }
1369 if (!htmlOmittedDefaultValue)
1370 return(0);
1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001373 htmlAutoClose(ctxt, BAD_CAST"p");
1374 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001375 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378 return(1);
1379 }
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlIsScriptAttribute:
1386 * @name: an attribute name
1387 *
1388 * Check if an attribute is of content type Script
1389 *
1390 * Returns 1 is the attribute is a script 0 otherwise
1391 */
1392int
1393htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001394 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001395
1396 if (name == NULL)
1397 return(0);
1398 /*
1399 * all script attributes start with 'on'
1400 */
1401 if ((name[0] != 'o') || (name[1] != 'n'))
1402 return(0);
1403 for (i = 0;
1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405 i++) {
1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407 return(1);
1408 }
1409 return(0);
1410}
1411
1412/************************************************************************
1413 * *
1414 * The list of HTML predefined entities *
1415 * *
1416 ************************************************************************/
1417
1418
/*
 * html40EntitiesTable:
 *
 * One row per predefined HTML entity: the Unicode code point, the entity
 * name (what appears between '&' and ';'), and a human-readable
 * description.
 *
 * The rows are kept in ascending order of code point.
 * htmlEntityValueLookup() depends on that ordering, so any new row must
 * be inserted at its sorted position.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1701
1702/************************************************************************
1703 * *
1704 * Commodity functions to handle entities *
1705 * *
1706 ************************************************************************/
1707
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable holding the buffer
 *
 * Macro used to grow the current buffer: it doubles <buffer>_size and
 * reallocates @buffer to that new size with xmlRealloc().
 *
 * Hidden requirements at the expansion site:
 *   - a variable named <buffer>_size (built with ##) must be in scope;
 *   - a variable named ctxt must be in scope, it is handed to
 *     htmlErrMemory() when the reallocation fails.
 *
 * On allocation failure the macro reports the error, frees the old
 * buffer and executes return(NULL) in the *enclosing* function, so it is
 * only usable inside functions whose return type accepts NULL.
 *
 * The expansion is a plain { } block, not a do { } while (0) statement.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1722
1723/**
1724 * htmlEntityLookup:
1725 * @name: the entity name
1726 *
1727 * Lookup the given entity in EntitiesTable
1728 *
1729 * TODO: the linear scan is really ugly, an hash table is really needed.
1730 *
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001733const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001734htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001735 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001736
1737 for (i = 0;i < (sizeof(html40EntitiesTable)/
1738 sizeof(html40EntitiesTable[0]));i++) {
1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001740 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 }
1743 return(NULL);
1744}
1745
1746/**
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1749 *
1750 * Lookup the given entity in EntitiesTable
1751 *
1752 * TODO: the linear scan is really ugly, an hash table is really needed.
1753 *
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001756const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001757htmlEntityValueLookup(unsigned int value) {
1758 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001759
1760 for (i = 0;i < (sizeof(html40EntitiesTable)/
1761 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001762 if (html40EntitiesTable[i].value >= value) {
1763 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001764 break;
William M. Brack78637da2003-07-31 14:47:38 +00001765 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001766 }
Owen Taylor3473f882001-02-23 17:55:21 +00001767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
Daniel Veillardce682bc2004-11-05 17:22:25 +00001797 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 if (in == NULL) {
1799 /*
1800 * initialization nothing to do
1801 */
1802 *outlen = 0;
1803 *inlen = 0;
1804 return(0);
1805 }
1806 inend = in + (*inlen);
1807 outend = out + (*outlen);
1808 while (in < inend) {
1809 d = *in++;
1810 if (d < 0x80) { c= d; trailing= 0; }
1811 else if (d < 0xC0) {
1812 /* trailing byte in leading position */
1813 *outlen = out - outstart;
1814 *inlen = processed - instart;
1815 return(-2);
1816 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1817 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1818 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1819 else {
1820 /* no chance for this in Ascii */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 }
1825
1826 if (inend - in < trailing) {
1827 break;
1828 }
1829
1830 for ( ; trailing; trailing--) {
1831 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1832 break;
1833 c <<= 6;
1834 c |= d & 0x3F;
1835 }
1836
1837 /* assertion: c is a single UTF-4 value */
1838 if (c < 0x80) {
1839 if (out + 1 >= outend)
1840 break;
1841 *out++ = c;
1842 } else {
1843 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001844 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001845
1846 /*
1847 * Try to lookup a predefined HTML entity for it
1848 */
1849
1850 ent = htmlEntityValueLookup(c);
1851 if (ent == NULL) {
1852 /* no chance for this in Ascii */
1853 *outlen = out - outstart;
1854 *inlen = processed - instart;
1855 return(-2);
1856 }
1857 len = strlen(ent->name);
1858 if (out + 2 + len >= outend)
1859 break;
1860 *out++ = '&';
1861 memcpy(out, ent->name, len);
1862 out += len;
1863 *out++ = ';';
1864 }
1865 processed = in;
1866 }
1867 *outlen = out - outstart;
1868 *inlen = processed - instart;
1869 return(0);
1870}
1871
1872/**
1873 * htmlEncodeEntities:
1874 * @out: a pointer to an array of bytes to store the result
1875 * @outlen: the length of @out
1876 * @in: a pointer to an array of UTF-8 chars
1877 * @inlen: the length of @in
1878 * @quoteChar: the quote character to escape (' or ") or zero.
1879 *
1880 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1881 * plus HTML entities block of chars out.
1882 *
1883 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1884 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001885 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001886 * The value of @outlen after return is the number of octets consumed.
1887 */
1888int
1889htmlEncodeEntities(unsigned char* out, int *outlen,
1890 const unsigned char* in, int *inlen, int quoteChar) {
1891 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001892 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001893 const unsigned char* outstart = out;
1894 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001895 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001896 unsigned int c, d;
1897 int trailing;
1898
Daniel Veillardce682bc2004-11-05 17:22:25 +00001899 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1900 return(-1);
1901 outend = out + (*outlen);
1902 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001903 while (in < inend) {
1904 d = *in++;
1905 if (d < 0x80) { c= d; trailing= 0; }
1906 else if (d < 0xC0) {
1907 /* trailing byte in leading position */
1908 *outlen = out - outstart;
1909 *inlen = processed - instart;
1910 return(-2);
1911 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1912 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1913 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1914 else {
1915 /* no chance for this in Ascii */
1916 *outlen = out - outstart;
1917 *inlen = processed - instart;
1918 return(-2);
1919 }
1920
1921 if (inend - in < trailing)
1922 break;
1923
1924 while (trailing--) {
1925 if (((d= *in++) & 0xC0) != 0x80) {
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(-2);
1929 }
1930 c <<= 6;
1931 c |= d & 0x3F;
1932 }
1933
1934 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001935 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1936 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001937 if (out >= outend)
1938 break;
1939 *out++ = c;
1940 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001941 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001942 const char *cp;
1943 char nbuf[16];
1944 int len;
1945
1946 /*
1947 * Try to lookup a predefined HTML entity for it
1948 */
1949 ent = htmlEntityValueLookup(c);
1950 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001951 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001952 cp = nbuf;
1953 }
1954 else
1955 cp = ent->name;
1956 len = strlen(cp);
1957 if (out + 2 + len > outend)
1958 break;
1959 *out++ = '&';
1960 memcpy(out, cp, len);
1961 out += len;
1962 *out++ = ';';
1963 }
1964 processed = in;
1965 }
1966 *outlen = out - outstart;
1967 *inlen = processed - instart;
1968 return(0);
1969}
1970
Owen Taylor3473f882001-02-23 17:55:21 +00001971/************************************************************************
1972 * *
1973 * Commodity functions to handle streams *
1974 * *
1975 ************************************************************************/
1976
1977/**
Owen Taylor3473f882001-02-23 17:55:21 +00001978 * htmlNewInputStream:
1979 * @ctxt: an HTML parser context
1980 *
1981 * Create a new input stream structure
1982 * Returns the new input stream or NULL
1983 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001985htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1986 htmlParserInputPtr input;
1987
1988 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1989 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001990 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001991 return(NULL);
1992 }
1993 memset(input, 0, sizeof(htmlParserInput));
1994 input->filename = NULL;
1995 input->directory = NULL;
1996 input->base = NULL;
1997 input->cur = NULL;
1998 input->buf = NULL;
1999 input->line = 1;
2000 input->col = 1;
2001 input->buf = NULL;
2002 input->free = NULL;
2003 input->version = NULL;
2004 input->consumed = 0;
2005 input->length = 0;
2006 return(input);
2007}
2008
2009
2010/************************************************************************
2011 * *
2012 * Commodity functions, cleanup needed ? *
2013 * *
2014 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * areBlanks() compares element names against this list with
 * xmlStrEqual() to decide whether a run of blanks must be kept.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002029
2030/**
2031 * areBlanks:
2032 * @ctxt: an HTML parser context
2033 * @str: a xmlChar *
2034 * @len: the size of @str
2035 *
2036 * Is this a sequence of blank chars that one can ignore ?
2037 *
2038 * Returns 1 if ignorable 0 otherwise.
2039 */
2040
2041static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002042 unsigned int i;
2043 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002044 xmlNodePtr lastChild;
2045
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002046 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002047 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002048
2049 if (CUR == 0) return(1);
2050 if (CUR != '<') return(0);
2051 if (ctxt->name == NULL)
2052 return(1);
2053 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2054 return(1);
2055 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2056 return(1);
2057 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2058 return(1);
2059 if (ctxt->node == NULL) return(0);
2060 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002061 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2062 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002063 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002064 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2065 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002066 /* keep ws in constructs like ...<b> </b>...
2067 for all tags "b" allowing PCDATA */
2068 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2069 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2070 return(0);
2071 }
2072 }
Owen Taylor3473f882001-02-23 17:55:21 +00002073 } else if (xmlNodeIsText(lastChild)) {
2074 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002075 } else {
2076 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2077 for all tags "p" allowing PCDATA */
2078 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2079 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2080 return(0);
2081 }
2082 }
Owen Taylor3473f882001-02-23 17:55:21 +00002083 }
2084 return(1);
2085}
2086
2087/**
Owen Taylor3473f882001-02-23 17:55:21 +00002088 * htmlNewDocNoDtD:
2089 * @URI: URI for the dtd, or NULL
2090 * @ExternalID: the external ID of the DTD, or NULL
2091 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002092 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2093 * are NULL
2094 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002095 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002096 */
2097htmlDocPtr
2098htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2099 xmlDocPtr cur;
2100
2101 /*
2102 * Allocate a new document and fill the fields.
2103 */
2104 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2105 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002106 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002107 return(NULL);
2108 }
2109 memset(cur, 0, sizeof(xmlDoc));
2110
2111 cur->type = XML_HTML_DOCUMENT_NODE;
2112 cur->version = NULL;
2113 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002114 cur->doc = cur;
2115 cur->name = NULL;
2116 cur->children = NULL;
2117 cur->extSubset = NULL;
2118 cur->oldNs = NULL;
2119 cur->encoding = NULL;
2120 cur->standalone = 1;
2121 cur->compression = 0;
2122 cur->ids = NULL;
2123 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002124 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002125 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002126 if ((ExternalID != NULL) ||
2127 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002128 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002129 return(cur);
2130}
2131
2132/**
2133 * htmlNewDoc:
2134 * @URI: URI for the dtd, or NULL
2135 * @ExternalID: the external ID of the DTD, or NULL
2136 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002137 * Creates a new HTML document
2138 *
Owen Taylor3473f882001-02-23 17:55:21 +00002139 * Returns a new document
2140 */
2141htmlDocPtr
2142htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2143 if ((URI == NULL) && (ExternalID == NULL))
2144 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002145 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2146 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002147
2148 return(htmlNewDocNoDtD(URI, ExternalID));
2149}
2150
2151
2152/************************************************************************
2153 * *
2154 * The parser itself *
2155 * Relates to http://www.w3.org/TR/html40 *
2156 * *
2157 ************************************************************************/
2158
2159/************************************************************************
2160 * *
2161 * The parser itself *
2162 * *
2163 ************************************************************************/
2164
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002165static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002166
Owen Taylor3473f882001-02-23 17:55:21 +00002167/**
2168 * htmlParseHTMLName:
2169 * @ctxt: an HTML parser context
2170 *
2171 * parse an HTML tag or attribute name, note that we convert it to lowercase
2172 * since HTML names are not case-sensitive.
2173 *
2174 * Returns the Tag Name parsed or NULL
2175 */
2176
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002177static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002178htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002179 int i = 0;
2180 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2181
William M. Brackd1757ab2004-10-02 22:07:48 +00002182 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002183 (CUR != ':')) return(NULL);
2184
2185 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002186 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002187 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2188 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2189 else loc[i] = CUR;
2190 i++;
2191
2192 NEXT;
2193 }
2194
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002195 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002196}
2197
2198/**
2199 * htmlParseName:
2200 * @ctxt: an HTML parser context
2201 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002202 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002203 *
2204 * Returns the Name parsed or NULL
2205 */
2206
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002207static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002208htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002209 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002210 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002211 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002212
2213 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002214
2215 /*
2216 * Accelerator for simple ASCII names
2217 */
2218 in = ctxt->input->cur;
2219 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2220 ((*in >= 0x41) && (*in <= 0x5A)) ||
2221 (*in == '_') || (*in == ':')) {
2222 in++;
2223 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2224 ((*in >= 0x41) && (*in <= 0x5A)) ||
2225 ((*in >= 0x30) && (*in <= 0x39)) ||
2226 (*in == '_') || (*in == '-') ||
2227 (*in == ':') || (*in == '.'))
2228 in++;
2229 if ((*in > 0) && (*in < 0x80)) {
2230 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002231 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002232 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002233 ctxt->nbChars += count;
2234 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002235 return(ret);
2236 }
2237 }
2238 return(htmlParseNameComplex(ctxt));
2239}
2240
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002241static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002242htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002243 int len = 0, l;
2244 int c;
2245 int count = 0;
2246
2247 /*
2248 * Handler for more complex cases
2249 */
2250 GROW;
2251 c = CUR_CHAR(l);
2252 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2253 (!IS_LETTER(c) && (c != '_') &&
2254 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002255 return(NULL);
2256 }
2257
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002258 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2259 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2260 (c == '.') || (c == '-') ||
2261 (c == '_') || (c == ':') ||
2262 (IS_COMBINING(c)) ||
2263 (IS_EXTENDER(c)))) {
2264 if (count++ > 100) {
2265 count = 0;
2266 GROW;
2267 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002268 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002269 NEXTL(l);
2270 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002271 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002272 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002273}
2274
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002275
Owen Taylor3473f882001-02-23 17:55:21 +00002276/**
2277 * htmlParseHTMLAttribute:
2278 * @ctxt: an HTML parser context
2279 * @stop: a char stop value
2280 *
2281 * parse an HTML attribute value till the stop (quote), if
2282 * stop is 0 then it stops at the first space
2283 *
2284 * Returns the attribute parsed or NULL
2285 */
2286
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002287static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002288htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2289 xmlChar *buffer = NULL;
2290 int buffer_size = 0;
2291 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002292 const xmlChar *name = NULL;
2293 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002294 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002295
2296 /*
2297 * allocate a translation buffer.
2298 */
2299 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002300 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002301 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002302 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002303 return(NULL);
2304 }
2305 out = buffer;
2306
2307 /*
2308 * Ok loop until we reach one of the ending chars
2309 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002310 while ((CUR != 0) && (CUR != stop)) {
2311 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002312 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002313 if (CUR == '&') {
2314 if (NXT(1) == '#') {
2315 unsigned int c;
2316 int bits;
2317
2318 c = htmlParseCharRef(ctxt);
2319 if (c < 0x80)
2320 { *out++ = c; bits= -6; }
2321 else if (c < 0x800)
2322 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2323 else if (c < 0x10000)
2324 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2325 else
2326 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2327
2328 for ( ; bits >= 0; bits-= 6) {
2329 *out++ = ((c >> bits) & 0x3F) | 0x80;
2330 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002331
2332 if (out - buffer > buffer_size - 100) {
2333 int indx = out - buffer;
2334
2335 growBuffer(buffer);
2336 out = &buffer[indx];
2337 }
Owen Taylor3473f882001-02-23 17:55:21 +00002338 } else {
2339 ent = htmlParseEntityRef(ctxt, &name);
2340 if (name == NULL) {
2341 *out++ = '&';
2342 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002343 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002344
2345 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002346 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002347 }
2348 } else if (ent == NULL) {
2349 *out++ = '&';
2350 cur = name;
2351 while (*cur != 0) {
2352 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002354
2355 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002356 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002357 }
2358 *out++ = *cur++;
2359 }
Owen Taylor3473f882001-02-23 17:55:21 +00002360 } else {
2361 unsigned int c;
2362 int bits;
2363
2364 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002365 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002366
2367 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002368 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002369 }
2370 c = (xmlChar)ent->value;
2371 if (c < 0x80)
2372 { *out++ = c; bits= -6; }
2373 else if (c < 0x800)
2374 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2375 else if (c < 0x10000)
2376 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2377 else
2378 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2379
2380 for ( ; bits >= 0; bits-= 6) {
2381 *out++ = ((c >> bits) & 0x3F) | 0x80;
2382 }
Owen Taylor3473f882001-02-23 17:55:21 +00002383 }
2384 }
2385 } else {
2386 unsigned int c;
2387 int bits, l;
2388
2389 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002390 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002391
2392 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002393 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002394 }
2395 c = CUR_CHAR(l);
2396 if (c < 0x80)
2397 { *out++ = c; bits= -6; }
2398 else if (c < 0x800)
2399 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2400 else if (c < 0x10000)
2401 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2402 else
2403 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2404
2405 for ( ; bits >= 0; bits-= 6) {
2406 *out++ = ((c >> bits) & 0x3F) | 0x80;
2407 }
2408 NEXT;
2409 }
2410 }
2411 *out++ = 0;
2412 return(buffer);
2413}
2414
2415/**
Owen Taylor3473f882001-02-23 17:55:21 +00002416 * htmlParseEntityRef:
2417 * @ctxt: an HTML parser context
2418 * @str: location to store the entity name
2419 *
2420 * parse an HTML ENTITY references
2421 *
2422 * [68] EntityRef ::= '&' Name ';'
2423 *
2424 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2425 * if non-NULL *str will have to be freed by the caller.
2426 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002427const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002428htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2429 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002430 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002431
2432 if (str != NULL) *str = NULL;
2433 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002434
2435 if (CUR == '&') {
2436 NEXT;
2437 name = htmlParseName(ctxt);
2438 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002439 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2440 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002441 } else {
2442 GROW;
2443 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002444 if (str != NULL)
2445 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002446
2447 /*
2448 * Lookup the entity in the table.
2449 */
2450 ent = htmlEntityLookup(name);
2451 if (ent != NULL) /* OK that's ugly !!! */
2452 NEXT;
2453 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002454 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2455 "htmlParseEntityRef: expecting ';'\n",
2456 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002457 if (str != NULL)
2458 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002459 }
2460 }
2461 }
2462 return(ent);
2463}
2464
2465/**
2466 * htmlParseAttValue:
2467 * @ctxt: an HTML parser context
2468 *
2469 * parse a value for an attribute
2470 * Note: the parser won't do substitution of entities here, this
2471 * will be handled later in xmlStringGetNodeList, unless it was
2472 * asked for ctxt->replaceEntities != 0
2473 *
2474 * Returns the AttValue parsed or NULL.
2475 */
2476
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002477static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002478htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2479 xmlChar *ret = NULL;
2480
2481 if (CUR == '"') {
2482 NEXT;
2483 ret = htmlParseHTMLAttribute(ctxt, '"');
2484 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002485 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2486 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002487 } else
2488 NEXT;
2489 } else if (CUR == '\'') {
2490 NEXT;
2491 ret = htmlParseHTMLAttribute(ctxt, '\'');
2492 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002493 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2494 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002495 } else
2496 NEXT;
2497 } else {
2498 /*
2499 * That's an HTMLism, the attribute value may not be quoted
2500 */
2501 ret = htmlParseHTMLAttribute(ctxt, 0);
2502 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002503 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2504 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002505 }
2506 }
2507 return(ret);
2508}
2509
2510/**
2511 * htmlParseSystemLiteral:
2512 * @ctxt: an HTML parser context
2513 *
2514 * parse an HTML Literal
2515 *
2516 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2517 *
2518 * Returns the SystemLiteral parsed or NULL
2519 */
2520
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002521static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002522htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2523 const xmlChar *q;
2524 xmlChar *ret = NULL;
2525
2526 if (CUR == '"') {
2527 NEXT;
2528 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002529 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002530 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002531 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002532 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2533 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002534 } else {
2535 ret = xmlStrndup(q, CUR_PTR - q);
2536 NEXT;
2537 }
2538 } else if (CUR == '\'') {
2539 NEXT;
2540 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002541 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002542 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002543 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002544 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2545 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002546 } else {
2547 ret = xmlStrndup(q, CUR_PTR - q);
2548 NEXT;
2549 }
2550 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002551 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2552 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002553 }
2554
2555 return(ret);
2556}
2557
2558/**
2559 * htmlParsePubidLiteral:
2560 * @ctxt: an HTML parser context
2561 *
2562 * parse an HTML public literal
2563 *
2564 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2565 *
2566 * Returns the PubidLiteral parsed or NULL.
2567 */
2568
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002569static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002570htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2571 const xmlChar *q;
2572 xmlChar *ret = NULL;
2573 /*
2574 * Name ::= (Letter | '_') (NameChar)*
2575 */
2576 if (CUR == '"') {
2577 NEXT;
2578 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002579 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002580 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002581 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2582 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002583 } else {
2584 ret = xmlStrndup(q, CUR_PTR - q);
2585 NEXT;
2586 }
2587 } else if (CUR == '\'') {
2588 NEXT;
2589 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002590 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002591 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002592 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002593 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2594 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002595 } else {
2596 ret = xmlStrndup(q, CUR_PTR - q);
2597 NEXT;
2598 }
2599 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002600 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2601 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002602 }
2603
2604 return(ret);
2605}
2606
2607/**
2608 * htmlParseScript:
2609 * @ctxt: an HTML parser context
2610 *
2611 * parse the content of an HTML SCRIPT or STYLE element
2612 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2613 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2614 * http://www.w3.org/TR/html4/types.html#type-script
2615 * http://www.w3.org/TR/html4/types.html#h-6.15
2616 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2617 *
2618 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2619 * element and the value of intrinsic event attributes. User agents must
2620 * not evaluate script data as HTML markup but instead must pass it on as
2621 * data to a script engine.
2622 * NOTES:
2623 * - The content is passed like CDATA
2624 * - the attributes for style and scripting "onXXX" are also described
2625 * as CDATA but SGML allows entities references in attributes so their
2626 * processing is identical as other attributes
2627 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002628static void
Owen Taylor3473f882001-02-23 17:55:21 +00002629htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002630 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002631 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002632 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002633
2634 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002635 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002636 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002637 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2638 (NXT(3) == '-')) {
2639 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2640 if (ctxt->sax->cdataBlock!= NULL) {
2641 /*
2642 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2643 */
2644 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002645 } else if (ctxt->sax->characters != NULL) {
2646 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002647 }
2648 }
2649 nbchar = 0;
2650 htmlParseComment(ctxt);
Daniel Veillard358fef42005-07-13 16:37:38 +00002651 cur = CUR_CHAR(l);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002652 continue;
2653 } else if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002654 /*
2655 * One should break here, the specification is clear:
2656 * Authors should therefore escape "</" within the content.
2657 * Escape mechanisms are specific to each scripting or
2658 * style sheet language.
2659 *
2660 * In recovery mode, only break if end tag match the
2661 * current tag, effectively ignoring all tags inside the
2662 * script/style block and treating the entire block as
2663 * CDATA.
2664 */
2665 if (ctxt->recovery) {
2666 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2667 xmlStrlen(ctxt->name)) == 0)
2668 {
2669 break; /* while */
2670 } else {
2671 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2672 "Element %s embbeds close tag\n",
2673 ctxt->name, NULL);
2674 }
2675 } else {
2676 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2677 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2678 {
2679 break; /* while */
2680 }
2681 }
Owen Taylor3473f882001-02-23 17:55:21 +00002682 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002683 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002684 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2685 if (ctxt->sax->cdataBlock!= NULL) {
2686 /*
2687 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2688 */
2689 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002690 } else if (ctxt->sax->characters != NULL) {
2691 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002692 }
2693 nbchar = 0;
2694 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002695 NEXTL(l);
2696 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002697 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002698
William M. Brack76e95df2003-10-18 16:20:14 +00002699 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002700 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2701 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002702 NEXT;
2703 }
2704
2705 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2706 if (ctxt->sax->cdataBlock!= NULL) {
2707 /*
2708 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2709 */
2710 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002711 } else if (ctxt->sax->characters != NULL) {
2712 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002713 }
2714 }
2715}
2716
2717
2718/**
2719 * htmlParseCharData:
2720 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002721 *
2722 * parse a CharData section.
2723 * if we are within a CDATA section ']]>' marks an end of section.
2724 *
2725 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2726 */
2727
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002728static void
2729htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002730 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2731 int nbchar = 0;
2732 int cur, l;
2733
2734 SHRINK;
2735 cur = CUR_CHAR(l);
2736 while (((cur != '<') || (ctxt->token == '<')) &&
2737 ((cur != '&') || (ctxt->token == '&')) &&
2738 (IS_CHAR(cur))) {
2739 COPY_BUF(l,buf,nbchar,cur);
2740 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2741 /*
2742 * Ok the segment is to be consumed as chars.
2743 */
2744 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2745 if (areBlanks(ctxt, buf, nbchar)) {
2746 if (ctxt->sax->ignorableWhitespace != NULL)
2747 ctxt->sax->ignorableWhitespace(ctxt->userData,
2748 buf, nbchar);
2749 } else {
2750 htmlCheckParagraph(ctxt);
2751 if (ctxt->sax->characters != NULL)
2752 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2753 }
2754 }
2755 nbchar = 0;
2756 }
2757 NEXTL(l);
2758 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002759 if (cur == 0) {
2760 SHRINK;
2761 GROW;
2762 cur = CUR_CHAR(l);
2763 }
Owen Taylor3473f882001-02-23 17:55:21 +00002764 }
2765 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002766 buf[nbchar] = 0;
2767
Owen Taylor3473f882001-02-23 17:55:21 +00002768 /*
2769 * Ok the segment is to be consumed as chars.
2770 */
2771 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2772 if (areBlanks(ctxt, buf, nbchar)) {
2773 if (ctxt->sax->ignorableWhitespace != NULL)
2774 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2775 } else {
2776 htmlCheckParagraph(ctxt);
2777 if (ctxt->sax->characters != NULL)
2778 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2779 }
2780 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002781 } else {
2782 /*
2783 * Loop detection
2784 */
2785 if (cur == 0)
2786 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002787 }
2788}
2789
2790/**
2791 * htmlParseExternalID:
2792 * @ctxt: an HTML parser context
2793 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002794 *
2795 * Parse an External ID or a Public ID
2796 *
Owen Taylor3473f882001-02-23 17:55:21 +00002797 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2798 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2799 *
2800 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2801 *
2802 * Returns the function returns SystemLiteral and in the second
2803 * case publicID receives PubidLiteral, is strict is off
2804 * it is possible to return NULL and have publicID set.
2805 */
2806
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002807static xmlChar *
2808htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002809 xmlChar *URI = NULL;
2810
2811 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2812 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2813 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2814 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002815 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002816 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2817 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002818 }
2819 SKIP_BLANKS;
2820 URI = htmlParseSystemLiteral(ctxt);
2821 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002822 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2823 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002824 }
2825 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2826 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2827 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2828 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002829 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002830 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2831 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002832 }
2833 SKIP_BLANKS;
2834 *publicID = htmlParsePubidLiteral(ctxt);
2835 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002836 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2837 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2838 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002839 }
2840 SKIP_BLANKS;
2841 if ((CUR == '"') || (CUR == '\'')) {
2842 URI = htmlParseSystemLiteral(ctxt);
2843 }
2844 }
2845 return(URI);
2846}
2847
2848/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002849 * xmlParsePI:
2850 * @ctxt: an XML parser context
2851 *
2852 * parse an XML Processing Instruction.
2853 *
2854 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2855 */
2856static void
2857htmlParsePI(htmlParserCtxtPtr ctxt) {
2858 xmlChar *buf = NULL;
2859 int len = 0;
2860 int size = HTML_PARSER_BUFFER_SIZE;
2861 int cur, l;
2862 const xmlChar *target;
2863 xmlParserInputState state;
2864 int count = 0;
2865
2866 if ((RAW == '<') && (NXT(1) == '?')) {
2867 state = ctxt->instate;
2868 ctxt->instate = XML_PARSER_PI;
2869 /*
2870 * this is a Processing Instruction.
2871 */
2872 SKIP(2);
2873 SHRINK;
2874
2875 /*
2876 * Parse the target name and check for special support like
2877 * namespace.
2878 */
2879 target = htmlParseName(ctxt);
2880 if (target != NULL) {
2881 if (RAW == '>') {
2882 SKIP(1);
2883
2884 /*
2885 * SAX: PI detected.
2886 */
2887 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2888 (ctxt->sax->processingInstruction != NULL))
2889 ctxt->sax->processingInstruction(ctxt->userData,
2890 target, NULL);
2891 ctxt->instate = state;
2892 return;
2893 }
2894 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2895 if (buf == NULL) {
2896 htmlErrMemory(ctxt, NULL);
2897 ctxt->instate = state;
2898 return;
2899 }
2900 cur = CUR;
2901 if (!IS_BLANK(cur)) {
2902 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2903 "ParsePI: PI %s space expected\n", target, NULL);
2904 }
2905 SKIP_BLANKS;
2906 cur = CUR_CHAR(l);
2907 while (IS_CHAR(cur) && (cur != '>')) {
2908 if (len + 5 >= size) {
2909 xmlChar *tmp;
2910
2911 size *= 2;
2912 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2913 if (tmp == NULL) {
2914 htmlErrMemory(ctxt, NULL);
2915 xmlFree(buf);
2916 ctxt->instate = state;
2917 return;
2918 }
2919 buf = tmp;
2920 }
2921 count++;
2922 if (count > 50) {
2923 GROW;
2924 count = 0;
2925 }
2926 COPY_BUF(l,buf,len,cur);
2927 NEXTL(l);
2928 cur = CUR_CHAR(l);
2929 if (cur == 0) {
2930 SHRINK;
2931 GROW;
2932 cur = CUR_CHAR(l);
2933 }
2934 }
2935 buf[len] = 0;
2936 if (cur != '>') {
2937 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2938 "ParsePI: PI %s never end ...\n", target, NULL);
2939 } else {
2940 SKIP(1);
2941
2942 /*
2943 * SAX: PI detected.
2944 */
2945 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2946 (ctxt->sax->processingInstruction != NULL))
2947 ctxt->sax->processingInstruction(ctxt->userData,
2948 target, buf);
2949 }
2950 xmlFree(buf);
2951 } else {
2952 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2953 "PI is not started correctly", NULL, NULL);
2954 }
2955 ctxt->instate = state;
2956 }
2957}
2958
2959/**
Owen Taylor3473f882001-02-23 17:55:21 +00002960 * htmlParseComment:
2961 * @ctxt: an HTML parser context
2962 *
2963 * Parse an XML (SGML) comment <!-- .... -->
2964 *
2965 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2966 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002967static void
Owen Taylor3473f882001-02-23 17:55:21 +00002968htmlParseComment(htmlParserCtxtPtr ctxt) {
2969 xmlChar *buf = NULL;
2970 int len;
2971 int size = HTML_PARSER_BUFFER_SIZE;
2972 int q, ql;
2973 int r, rl;
2974 int cur, l;
2975 xmlParserInputState state;
2976
2977 /*
2978 * Check that there is a comment right here.
2979 */
2980 if ((RAW != '<') || (NXT(1) != '!') ||
2981 (NXT(2) != '-') || (NXT(3) != '-')) return;
2982
2983 state = ctxt->instate;
2984 ctxt->instate = XML_PARSER_COMMENT;
2985 SHRINK;
2986 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002987 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002988 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002989 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002990 ctxt->instate = state;
2991 return;
2992 }
2993 q = CUR_CHAR(ql);
2994 NEXTL(ql);
2995 r = CUR_CHAR(rl);
2996 NEXTL(rl);
2997 cur = CUR_CHAR(l);
2998 len = 0;
2999 while (IS_CHAR(cur) &&
3000 ((cur != '>') ||
3001 (r != '-') || (q != '-'))) {
3002 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003003 xmlChar *tmp;
3004
Owen Taylor3473f882001-02-23 17:55:21 +00003005 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003006 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3007 if (tmp == NULL) {
3008 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003009 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003010 ctxt->instate = state;
3011 return;
3012 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003013 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003014 }
3015 COPY_BUF(ql,buf,len,q);
3016 q = r;
3017 ql = rl;
3018 r = cur;
3019 rl = l;
3020 NEXTL(l);
3021 cur = CUR_CHAR(l);
3022 if (cur == 0) {
3023 SHRINK;
3024 GROW;
3025 cur = CUR_CHAR(l);
3026 }
3027 }
3028 buf[len] = 0;
3029 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003030 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3031 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003032 xmlFree(buf);
3033 } else {
3034 NEXT;
3035 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3036 (!ctxt->disableSAX))
3037 ctxt->sax->comment(ctxt->userData, buf);
3038 xmlFree(buf);
3039 }
3040 ctxt->instate = state;
3041}
3042
3043/**
3044 * htmlParseCharRef:
3045 * @ctxt: an HTML parser context
3046 *
3047 * parse Reference declarations
3048 *
3049 * [66] CharRef ::= '&#' [0-9]+ ';' |
3050 * '&#x' [0-9a-fA-F]+ ';'
3051 *
3052 * Returns the value parsed (as an int)
3053 */
3054int
3055htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3056 int val = 0;
3057
Daniel Veillarda03e3652004-11-02 18:45:30 +00003058 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3059 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3060 "htmlParseCharRef: context error\n",
3061 NULL, NULL);
3062 return(0);
3063 }
Owen Taylor3473f882001-02-23 17:55:21 +00003064 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003065 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003066 SKIP(3);
3067 while (CUR != ';') {
3068 if ((CUR >= '0') && (CUR <= '9'))
3069 val = val * 16 + (CUR - '0');
3070 else if ((CUR >= 'a') && (CUR <= 'f'))
3071 val = val * 16 + (CUR - 'a') + 10;
3072 else if ((CUR >= 'A') && (CUR <= 'F'))
3073 val = val * 16 + (CUR - 'A') + 10;
3074 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003075 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3076 "htmlParseCharRef: invalid hexadecimal value\n",
3077 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003078 return(0);
3079 }
3080 NEXT;
3081 }
3082 if (CUR == ';')
3083 NEXT;
3084 } else if ((CUR == '&') && (NXT(1) == '#')) {
3085 SKIP(2);
3086 while (CUR != ';') {
3087 if ((CUR >= '0') && (CUR <= '9'))
3088 val = val * 10 + (CUR - '0');
3089 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003090 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3091 "htmlParseCharRef: invalid decimal value\n",
3092 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003093 return(0);
3094 }
3095 NEXT;
3096 }
3097 if (CUR == ';')
3098 NEXT;
3099 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003100 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3101 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003102 }
3103 /*
3104 * Check the value IS_CHAR ...
3105 */
3106 if (IS_CHAR(val)) {
3107 return(val);
3108 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003109 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3110 "htmlParseCharRef: invalid xmlChar value %d\n",
3111 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003112 }
3113 return(0);
3114}
3115
3116
3117/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003118 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003119 * @ctxt: an HTML parser context
3120 *
3121 * parse a DOCTYPE declaration
3122 *
3123 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3124 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3125 */
3126
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003127static void
Owen Taylor3473f882001-02-23 17:55:21 +00003128htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003129 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003130 xmlChar *ExternalID = NULL;
3131 xmlChar *URI = NULL;
3132
3133 /*
3134 * We know that '<!DOCTYPE' has been detected.
3135 */
3136 SKIP(9);
3137
3138 SKIP_BLANKS;
3139
3140 /*
3141 * Parse the DOCTYPE name.
3142 */
3143 name = htmlParseName(ctxt);
3144 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003145 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3146 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3147 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003148 }
3149 /*
3150 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3151 */
3152
3153 SKIP_BLANKS;
3154
3155 /*
3156 * Check for SystemID and ExternalID
3157 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003158 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003159 SKIP_BLANKS;
3160
3161 /*
3162 * We should be at the end of the DOCTYPE declaration.
3163 */
3164 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003165 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3166 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003167 /* We shouldn't try to resynchronize ... */
3168 }
3169 NEXT;
3170
3171 /*
3172 * Create or update the document accordingly to the DOCTYPE
3173 */
3174 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3175 (!ctxt->disableSAX))
3176 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3177
3178 /*
3179 * Cleanup, since we don't use all those identifiers
3180 */
3181 if (URI != NULL) xmlFree(URI);
3182 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003183}
3184
3185/**
3186 * htmlParseAttribute:
3187 * @ctxt: an HTML parser context
3188 * @value: a xmlChar ** used to store the value of the attribute
3189 *
3190 * parse an attribute
3191 *
3192 * [41] Attribute ::= Name Eq AttValue
3193 *
3194 * [25] Eq ::= S? '=' S?
3195 *
3196 * With namespace:
3197 *
3198 * [NS 11] Attribute ::= QName Eq AttValue
3199 *
3200 * Also the case QName == xmlns:??? is handled independently as a namespace
3201 * definition.
3202 *
3203 * Returns the attribute name, and the value in *value.
3204 */
3205
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003206static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003207htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003208 const xmlChar *name;
3209 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003210
3211 *value = NULL;
3212 name = htmlParseHTMLName(ctxt);
3213 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003214 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3215 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003216 return(NULL);
3217 }
3218
3219 /*
3220 * read the value
3221 */
3222 SKIP_BLANKS;
3223 if (CUR == '=') {
3224 NEXT;
3225 SKIP_BLANKS;
3226 val = htmlParseAttValue(ctxt);
3227 /******
3228 } else {
3229 * TODO : some attribute must have values, some may not
3230 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3231 ctxt->sax->warning(ctxt->userData,
3232 "No value for attribute %s\n", name); */
3233 }
3234
3235 *value = val;
3236 return(name);
3237}
3238
3239/**
3240 * htmlCheckEncoding:
3241 * @ctxt: an HTML parser context
3242 * @attvalue: the attribute value
3243 *
3244 * Checks an http-equiv attribute from a Meta tag to detect
3245 * the encoding
3246 * If a new encoding is detected the parser is switched to decode
3247 * it and pass UTF8
3248 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003249static void
Owen Taylor3473f882001-02-23 17:55:21 +00003250htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3251 const xmlChar *encoding;
3252
3253 if ((ctxt == NULL) || (attvalue == NULL))
3254 return;
3255
3256 /* do not change encoding */
3257 if (ctxt->input->encoding != NULL)
3258 return;
3259
3260 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3261 if (encoding != NULL) {
3262 encoding += 8;
3263 } else {
3264 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3265 if (encoding != NULL)
3266 encoding += 9;
3267 }
3268 if (encoding != NULL) {
3269 xmlCharEncoding enc;
3270 xmlCharEncodingHandlerPtr handler;
3271
3272 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3273
3274 if (ctxt->input->encoding != NULL)
3275 xmlFree((xmlChar *) ctxt->input->encoding);
3276 ctxt->input->encoding = xmlStrdup(encoding);
3277
3278 enc = xmlParseCharEncoding((const char *) encoding);
3279 /*
3280 * registered set of known encodings
3281 */
3282 if (enc != XML_CHAR_ENCODING_ERROR) {
3283 xmlSwitchEncoding(ctxt, enc);
3284 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3285 } else {
3286 /*
3287 * fallback for unknown encodings
3288 */
3289 handler = xmlFindCharEncodingHandler((const char *) encoding);
3290 if (handler != NULL) {
3291 xmlSwitchToEncoding(ctxt, handler);
3292 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3293 } else {
3294 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3295 }
3296 }
3297
3298 if ((ctxt->input->buf != NULL) &&
3299 (ctxt->input->buf->encoder != NULL) &&
3300 (ctxt->input->buf->raw != NULL) &&
3301 (ctxt->input->buf->buffer != NULL)) {
3302 int nbchars;
3303 int processed;
3304
3305 /*
3306 * convert as much as possible to the parser reading buffer.
3307 */
3308 processed = ctxt->input->cur - ctxt->input->base;
3309 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3310 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3311 ctxt->input->buf->buffer,
3312 ctxt->input->buf->raw);
3313 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003314 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3315 "htmlCheckEncoding: encoder error\n",
3316 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003317 }
3318 ctxt->input->base =
3319 ctxt->input->cur = ctxt->input->buf->buffer->content;
3320 }
3321 }
3322}
3323
3324/**
3325 * htmlCheckMeta:
3326 * @ctxt: an HTML parser context
3327 * @atts: the attributes values
3328 *
3329 * Checks an attributes from a Meta tag
3330 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003331static void
Owen Taylor3473f882001-02-23 17:55:21 +00003332htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3333 int i;
3334 const xmlChar *att, *value;
3335 int http = 0;
3336 const xmlChar *content = NULL;
3337
3338 if ((ctxt == NULL) || (atts == NULL))
3339 return;
3340
3341 i = 0;
3342 att = atts[i++];
3343 while (att != NULL) {
3344 value = atts[i++];
3345 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3346 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3347 http = 1;
3348 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3349 content = value;
3350 att = atts[i++];
3351 }
3352 if ((http) && (content != NULL))
3353 htmlCheckEncoding(ctxt, content);
3354
3355}
3356
3357/**
3358 * htmlParseStartTag:
3359 * @ctxt: an HTML parser context
3360 *
3361 * parse a start of tag either for rule element or
3362 * EmptyElement. In both case we don't parse the tag closing chars.
3363 *
3364 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3365 *
3366 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3367 *
3368 * With namespace:
3369 *
3370 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3371 *
3372 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3373 *
Daniel Veillard597f1c12005-07-03 23:00:18 +00003374 * Returns 0 in case of success and -1 in case of error.
Owen Taylor3473f882001-02-23 17:55:21 +00003375 */
3376
Daniel Veillard597f1c12005-07-03 23:00:18 +00003377static int
Owen Taylor3473f882001-02-23 17:55:21 +00003378htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003379 const xmlChar *name;
3380 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003381 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003382 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003383 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003384 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003385 int meta = 0;
3386 int i;
3387
Daniel Veillarda03e3652004-11-02 18:45:30 +00003388 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3389 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3390 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003391 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003392 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003393 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003394 NEXT;
3395
3396 GROW;
3397 name = htmlParseHTMLName(ctxt);
3398 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003399 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3400 "htmlParseStartTag: invalid element name\n",
3401 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003402 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003403 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003404 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003405 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003406 }
3407 if (xmlStrEqual(name, BAD_CAST"meta"))
3408 meta = 1;
3409
3410 /*
3411 * Check for auto-closure of HTML elements.
3412 */
3413 htmlAutoClose(ctxt, name);
3414
3415 /*
3416 * Check for implied HTML elements.
3417 */
3418 htmlCheckImplied(ctxt, name);
3419
3420 /*
3421 * Avoid html at any level > 0, head at any level != 1
3422 * or any attempt to recurse body
3423 */
3424 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003425 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3426 "htmlParseStartTag: misplaced <html> tag\n",
3427 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003428 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003429 }
3430 if ((ctxt->nameNr != 1) &&
3431 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003432 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3433 "htmlParseStartTag: misplaced <head> tag\n",
3434 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003435 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003436 }
3437 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003438 int indx;
3439 for (indx = 0;indx < ctxt->nameNr;indx++) {
3440 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003441 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3442 "htmlParseStartTag: misplaced <body> tag\n",
3443 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003444 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3445 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003446 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003447 }
3448 }
3449 }
3450
3451 /*
3452 * Now parse the attributes, it ends up with the ending
3453 *
3454 * (S Attribute)* S?
3455 */
3456 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003457 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003458 (CUR != '>') &&
3459 ((CUR != '/') || (NXT(1) != '>'))) {
3460 long cons = ctxt->nbChars;
3461
3462 GROW;
3463 attname = htmlParseAttribute(ctxt, &attvalue);
3464 if (attname != NULL) {
3465
3466 /*
3467 * Well formedness requires at most one declaration of an attribute
3468 */
3469 for (i = 0; i < nbatts;i += 2) {
3470 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003471 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3472 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003473 if (attvalue != NULL)
3474 xmlFree(attvalue);
3475 goto failed;
3476 }
3477 }
3478
3479 /*
3480 * Add the pair to atts
3481 */
3482 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003483 maxatts = 22; /* allow for 10 attrs by default */
3484 atts = (const xmlChar **)
3485 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003486 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003487 htmlErrMemory(ctxt, NULL);
3488 if (attvalue != NULL)
3489 xmlFree(attvalue);
3490 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003491 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003492 ctxt->atts = atts;
3493 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003494 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003495 const xmlChar **n;
3496
Owen Taylor3473f882001-02-23 17:55:21 +00003497 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003498 n = (const xmlChar **) xmlRealloc((void *) atts,
3499 maxatts * sizeof(const xmlChar *));
3500 if (n == NULL) {
3501 htmlErrMemory(ctxt, NULL);
3502 if (attvalue != NULL)
3503 xmlFree(attvalue);
3504 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003505 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003506 atts = n;
3507 ctxt->atts = atts;
3508 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003509 }
3510 atts[nbatts++] = attname;
3511 atts[nbatts++] = attvalue;
3512 atts[nbatts] = NULL;
3513 atts[nbatts + 1] = NULL;
3514 }
3515 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003516 if (attvalue != NULL)
3517 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003518 /* Dump the bogus attribute string up to the next blank or
3519 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003520 while ((IS_CHAR_CH(CUR)) &&
3521 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003522 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003523 NEXT;
3524 }
3525
3526failed:
3527 SKIP_BLANKS;
3528 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003529 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3530 "htmlParseStartTag: problem parsing attributes\n",
3531 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003532 break;
3533 }
3534 }
3535
3536 /*
3537 * Handle specific association to the META tag
3538 */
3539 if (meta)
3540 htmlCheckMeta(ctxt, atts);
3541
3542 /*
3543 * SAX: Start of Element !
3544 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003545 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003546 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3547 if (nbatts != 0)
3548 ctxt->sax->startElement(ctxt->userData, name, atts);
3549 else
3550 ctxt->sax->startElement(ctxt->userData, name, NULL);
3551 }
Owen Taylor3473f882001-02-23 17:55:21 +00003552
3553 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003554 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003555 if (atts[i] != NULL)
3556 xmlFree((xmlChar *) atts[i]);
3557 }
Owen Taylor3473f882001-02-23 17:55:21 +00003558 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003559
3560 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003561}
3562
3563/**
3564 * htmlParseEndTag:
3565 * @ctxt: an HTML parser context
3566 *
3567 * parse an end of tag
3568 *
3569 * [42] ETag ::= '</' Name S? '>'
3570 *
3571 * With namespace
3572 *
3573 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003574 *
3575 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003576 */
3577
Daniel Veillardf420ac52001-07-04 16:04:09 +00003578static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003579htmlParseEndTag(htmlParserCtxtPtr ctxt)
3580{
3581 const xmlChar *name;
3582 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003583 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003584
3585 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003586 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3587 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003588 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003589 }
3590 SKIP(2);
3591
3592 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003593 if (name == NULL)
3594 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003595
3596 /*
3597 * We should definitely be at the ending "S? '>'" part
3598 */
3599 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003600 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003601 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3602 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003603 if (ctxt->recovery) {
3604 /*
3605 * We're not at the ending > !!
3606 * Error, unless in recover mode where we search forwards
3607 * until we find a >
3608 */
3609 while (CUR != '\0' && CUR != '>') NEXT;
3610 NEXT;
3611 }
Owen Taylor3473f882001-02-23 17:55:21 +00003612 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003613 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003614
3615 /*
3616 * If the name read is not one of the element in the parsing stack
3617 * then return, it's just an error.
3618 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003619 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3620 if (xmlStrEqual(name, ctxt->nameTab[i]))
3621 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003622 }
3623 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003624 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3625 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003626 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003627 }
3628
3629
3630 /*
3631 * Check for auto-closure of HTML elements.
3632 */
3633
3634 htmlAutoCloseOnClose(ctxt, name);
3635
3636 /*
3637 * Well formedness constraints, opening and closing must match.
3638 * With the exception that the autoclose may have popped stuff out
3639 * of the stack.
3640 */
3641 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003642 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003643 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3644 "Opening and ending tag mismatch: %s and %s\n",
3645 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003646 }
3647 }
3648
3649 /*
3650 * SAX: End of Tag
3651 */
3652 oldname = ctxt->name;
3653 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003654 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3655 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003656 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003657 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003658 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003659 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003660 }
3661
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003662 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003663}
3664
3665
3666/**
3667 * htmlParseReference:
3668 * @ctxt: an HTML parser context
3669 *
3670 * parse and handle entity references in content,
3671 * this will end-up in a call to character() since this is either a
3672 * CharRef, or a predefined entity.
3673 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003674static void
Owen Taylor3473f882001-02-23 17:55:21 +00003675htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003676 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003677 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003678 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003679 if (CUR != '&') return;
3680
3681 if (NXT(1) == '#') {
3682 unsigned int c;
3683 int bits, i = 0;
3684
3685 c = htmlParseCharRef(ctxt);
3686 if (c == 0)
3687 return;
3688
3689 if (c < 0x80) { out[i++]= c; bits= -6; }
3690 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3691 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3692 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3693
3694 for ( ; bits >= 0; bits-= 6) {
3695 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3696 }
3697 out[i] = 0;
3698
3699 htmlCheckParagraph(ctxt);
3700 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3701 ctxt->sax->characters(ctxt->userData, out, i);
3702 } else {
3703 ent = htmlParseEntityRef(ctxt, &name);
3704 if (name == NULL) {
3705 htmlCheckParagraph(ctxt);
3706 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3707 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3708 return;
3709 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003710 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003711 htmlCheckParagraph(ctxt);
3712 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3713 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3714 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3715 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3716 }
3717 } else {
3718 unsigned int c;
3719 int bits, i = 0;
3720
3721 c = ent->value;
3722 if (c < 0x80)
3723 { out[i++]= c; bits= -6; }
3724 else if (c < 0x800)
3725 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3726 else if (c < 0x10000)
3727 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3728 else
3729 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3730
3731 for ( ; bits >= 0; bits-= 6) {
3732 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3733 }
3734 out[i] = 0;
3735
3736 htmlCheckParagraph(ctxt);
3737 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3738 ctxt->sax->characters(ctxt->userData, out, i);
3739 }
Owen Taylor3473f882001-02-23 17:55:21 +00003740 }
3741}
3742
3743/**
3744 * htmlParseContent:
3745 * @ctxt: an HTML parser context
3746 * @name: the node name
3747 *
3748 * Parse a content: comment, sub-element, reference or text.
3749 *
3750 */
3751
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003752static void
Owen Taylor3473f882001-02-23 17:55:21 +00003753htmlParseContent(htmlParserCtxtPtr ctxt) {
3754 xmlChar *currentNode;
3755 int depth;
3756
3757 currentNode = xmlStrdup(ctxt->name);
3758 depth = ctxt->nameNr;
3759 while (1) {
3760 long cons = ctxt->nbChars;
3761
3762 GROW;
3763 /*
3764 * Our tag or one of it's parent or children is ending.
3765 */
3766 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003767 if (htmlParseEndTag(ctxt) &&
3768 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3769 if (currentNode != NULL)
3770 xmlFree(currentNode);
3771 return;
3772 }
3773 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003774 }
3775
3776 /*
3777 * Has this node been popped out during parsing of
3778 * the next element
3779 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003780 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3781 (!xmlStrEqual(currentNode, ctxt->name)))
3782 {
Owen Taylor3473f882001-02-23 17:55:21 +00003783 if (currentNode != NULL) xmlFree(currentNode);
3784 return;
3785 }
3786
Daniel Veillardf9533d12001-03-03 10:04:57 +00003787 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3788 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003789 /*
3790 * Handle SCRIPT/STYLE separately
3791 */
3792 htmlParseScript(ctxt);
3793 } else {
3794 /*
3795 * Sometimes DOCTYPE arrives in the middle of the document
3796 */
3797 if ((CUR == '<') && (NXT(1) == '!') &&
3798 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3799 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3800 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3801 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003802 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3803 "Misplaced DOCTYPE declaration\n",
3804 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003805 htmlParseDocTypeDecl(ctxt);
3806 }
3807
3808 /*
3809 * First case : a comment
3810 */
3811 if ((CUR == '<') && (NXT(1) == '!') &&
3812 (NXT(2) == '-') && (NXT(3) == '-')) {
3813 htmlParseComment(ctxt);
3814 }
3815
3816 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003817 * Second case : a Processing Instruction.
3818 */
3819 else if ((CUR == '<') && (NXT(1) == '?')) {
3820 htmlParsePI(ctxt);
3821 }
3822
3823 /*
3824 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003825 */
3826 else if (CUR == '<') {
3827 htmlParseElement(ctxt);
3828 }
3829
3830 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003831 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003832 * parsing returns it's Name, create the node
3833 */
3834 else if (CUR == '&') {
3835 htmlParseReference(ctxt);
3836 }
3837
3838 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003839 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003840 */
3841 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003842 htmlAutoCloseOnEnd(ctxt);
3843 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003844 }
3845
3846 /*
3847 * Last case, text. Note that References are handled directly.
3848 */
3849 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003850 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003851 }
3852
3853 if (cons == ctxt->nbChars) {
3854 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003855 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3856 "detected an error in element content\n",
3857 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003858 }
3859 break;
3860 }
3861 }
3862 GROW;
3863 }
3864 if (currentNode != NULL) xmlFree(currentNode);
3865}
3866
3867/**
3868 * htmlParseElement:
3869 * @ctxt: an HTML parser context
3870 *
3871 * parse an HTML element, this is highly recursive
3872 *
3873 * [39] element ::= EmptyElemTag | STag content ETag
3874 *
3875 * [41] Attribute ::= Name Eq AttValue
3876 */
3877
3878void
3879htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003880 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003881 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003882 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003883 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003884 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003885 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003886 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003887
Daniel Veillarda03e3652004-11-02 18:45:30 +00003888 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3889 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00003890 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003891 return;
3892 }
Owen Taylor3473f882001-02-23 17:55:21 +00003893 /* Capture start position */
3894 if (ctxt->record_info) {
3895 node_info.begin_pos = ctxt->input->consumed +
3896 (CUR_PTR - ctxt->input->base);
3897 node_info.begin_line = ctxt->input->line;
3898 }
3899
Daniel Veillard597f1c12005-07-03 23:00:18 +00003900 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003901 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003902 if (failed || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003903 if (CUR == '>')
3904 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003905 return;
3906 }
Owen Taylor3473f882001-02-23 17:55:21 +00003907
3908 /*
3909 * Lookup the info for that element.
3910 */
3911 info = htmlTagLookup(name);
3912 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003913 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3914 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003915 }
3916
3917 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003918 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003919 */
3920 if ((CUR == '/') && (NXT(1) == '>')) {
3921 SKIP(2);
3922 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3923 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003924 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003925 return;
3926 }
3927
3928 if (CUR == '>') {
3929 NEXT;
3930 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003931 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3932 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003933
3934 /*
3935 * end of parsing of this node.
3936 */
3937 if (xmlStrEqual(name, ctxt->name)) {
3938 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003939 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003940 }
3941
3942 /*
3943 * Capture end position and add node
3944 */
3945 if ( currentNode != NULL && ctxt->record_info ) {
3946 node_info.end_pos = ctxt->input->consumed +
3947 (CUR_PTR - ctxt->input->base);
3948 node_info.end_line = ctxt->input->line;
3949 node_info.node = ctxt->node;
3950 xmlParserAddNodeInfo(ctxt, &node_info);
3951 }
3952 return;
3953 }
3954
3955 /*
3956 * Check for an Empty Element from DTD definition
3957 */
3958 if ((info != NULL) && (info->empty)) {
3959 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3960 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003961 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003962 return;
3963 }
3964
3965 /*
3966 * Parse the content of the element:
3967 */
3968 currentNode = xmlStrdup(ctxt->name);
3969 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003970 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003971 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003972 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003973 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003974 if (ctxt->nameNr < depth) break;
3975 }
3976
Owen Taylor3473f882001-02-23 17:55:21 +00003977 /*
3978 * Capture end position and add node
3979 */
3980 if ( currentNode != NULL && ctxt->record_info ) {
3981 node_info.end_pos = ctxt->input->consumed +
3982 (CUR_PTR - ctxt->input->base);
3983 node_info.end_line = ctxt->input->line;
3984 node_info.node = ctxt->node;
3985 xmlParserAddNodeInfo(ctxt, &node_info);
3986 }
William M. Brack76e95df2003-10-18 16:20:14 +00003987 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003988 htmlAutoCloseOnEnd(ctxt);
3989 }
3990
Owen Taylor3473f882001-02-23 17:55:21 +00003991 if (currentNode != NULL)
3992 xmlFree(currentNode);
3993}
3994
3995/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003996 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003997 * @ctxt: an HTML parser context
3998 *
3999 * parse an HTML document (and build a tree if using the standard SAX
4000 * interface).
4001 *
4002 * Returns 0, -1 in case of error. the parser context is augmented
4003 * as a result of the parsing.
4004 */
4005
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004006int
Owen Taylor3473f882001-02-23 17:55:21 +00004007htmlParseDocument(htmlParserCtxtPtr ctxt) {
4008 xmlDtdPtr dtd;
4009
Daniel Veillardd0463562001-10-13 09:15:48 +00004010 xmlInitParser();
4011
Owen Taylor3473f882001-02-23 17:55:21 +00004012 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004013
Daniel Veillarda03e3652004-11-02 18:45:30 +00004014 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4015 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4016 "htmlParseDocument: context error\n", NULL, NULL);
4017 return(XML_ERR_INTERNAL_ERROR);
4018 }
4019 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004020 GROW;
4021 /*
4022 * SAX: beginning of the document processing.
4023 */
4024 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4025 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4026
4027 /*
4028 * Wipe out everything which is before the first '<'
4029 */
4030 SKIP_BLANKS;
4031 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004032 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4033 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004034 }
4035
4036 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4037 ctxt->sax->startDocument(ctxt->userData);
4038
4039
4040 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004041 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004042 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004043 while (((CUR == '<') && (NXT(1) == '!') &&
4044 (NXT(2) == '-') && (NXT(3) == '-')) ||
4045 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004046 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004047 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004048 SKIP_BLANKS;
4049 }
4050
4051
4052 /*
4053 * Then possibly doc type declaration(s) and more Misc
4054 * (doctypedecl Misc*)?
4055 */
4056 if ((CUR == '<') && (NXT(1) == '!') &&
4057 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4058 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4059 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4060 (UPP(8) == 'E')) {
4061 htmlParseDocTypeDecl(ctxt);
4062 }
4063 SKIP_BLANKS;
4064
4065 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004066 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004067 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004068 while (((CUR == '<') && (NXT(1) == '!') &&
4069 (NXT(2) == '-') && (NXT(3) == '-')) ||
4070 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004071 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004072 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004073 SKIP_BLANKS;
4074 }
4075
4076 /*
4077 * Time to start parsing the tree itself
4078 */
4079 htmlParseContent(ctxt);
4080
4081 /*
4082 * autoclose
4083 */
4084 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004085 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004086
4087
4088 /*
4089 * SAX: end of the document processing.
4090 */
4091 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4092 ctxt->sax->endDocument(ctxt->userData);
4093
4094 if (ctxt->myDoc != NULL) {
4095 dtd = xmlGetIntSubset(ctxt->myDoc);
4096 if (dtd == NULL)
4097 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004098 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004099 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4100 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4101 }
4102 if (! ctxt->wellFormed) return(-1);
4103 return(0);
4104}
4105
4106
4107/************************************************************************
4108 * *
4109 * Parser contexts handling *
4110 * *
4111 ************************************************************************/
4112
4113/**
William M. Brackedb65a72004-02-06 07:36:04 +00004114 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004115 * @ctxt: an HTML parser context
4116 *
4117 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004118 *
4119 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004120 */
4121
Daniel Veillardf403d292003-10-05 13:51:35 +00004122static int
Owen Taylor3473f882001-02-23 17:55:21 +00004123htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4124{
4125 htmlSAXHandler *sax;
4126
Daniel Veillardf403d292003-10-05 13:51:35 +00004127 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004128 memset(ctxt, 0, sizeof(htmlParserCtxt));
4129
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004130 ctxt->dict = xmlDictCreate();
4131 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004132 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4133 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004134 }
Owen Taylor3473f882001-02-23 17:55:21 +00004135 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4136 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004137 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4138 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004139 }
4140 else
4141 memset(sax, 0, sizeof(htmlSAXHandler));
4142
4143 /* Allocate the Input stack */
4144 ctxt->inputTab = (htmlParserInputPtr *)
4145 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4146 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004147 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004148 ctxt->inputNr = 0;
4149 ctxt->inputMax = 0;
4150 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004151 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004152 }
4153 ctxt->inputNr = 0;
4154 ctxt->inputMax = 5;
4155 ctxt->input = NULL;
4156 ctxt->version = NULL;
4157 ctxt->encoding = NULL;
4158 ctxt->standalone = -1;
4159 ctxt->instate = XML_PARSER_START;
4160
4161 /* Allocate the Node stack */
4162 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4163 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004164 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004165 ctxt->nodeNr = 0;
4166 ctxt->nodeMax = 0;
4167 ctxt->node = NULL;
4168 ctxt->inputNr = 0;
4169 ctxt->inputMax = 0;
4170 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004171 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004172 }
4173 ctxt->nodeNr = 0;
4174 ctxt->nodeMax = 10;
4175 ctxt->node = NULL;
4176
4177 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004178 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004179 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004180 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004181 ctxt->nameNr = 0;
4182 ctxt->nameMax = 10;
4183 ctxt->name = NULL;
4184 ctxt->nodeNr = 0;
4185 ctxt->nodeMax = 0;
4186 ctxt->node = NULL;
4187 ctxt->inputNr = 0;
4188 ctxt->inputMax = 0;
4189 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004190 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004191 }
4192 ctxt->nameNr = 0;
4193 ctxt->nameMax = 10;
4194 ctxt->name = NULL;
4195
Daniel Veillard092643b2003-09-25 14:29:29 +00004196 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004197 else {
4198 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004199 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004200 }
4201 ctxt->userData = ctxt;
4202 ctxt->myDoc = NULL;
4203 ctxt->wellFormed = 1;
4204 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004205 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004206 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004207 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004208 ctxt->vctxt.userData = ctxt;
4209 ctxt->vctxt.error = xmlParserValidityError;
4210 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004211 ctxt->record_info = 0;
4212 ctxt->validate = 0;
4213 ctxt->nbChars = 0;
4214 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004215 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004216 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004217 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004218}
4219
4220/**
4221 * htmlFreeParserCtxt:
4222 * @ctxt: an HTML parser context
4223 *
4224 * Free all the memory used by a parser context. However the parsed
4225 * document in ctxt->myDoc is not freed.
4226 */
4227
4228void
4229htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4230{
4231 xmlFreeParserCtxt(ctxt);
4232}
4233
4234/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004235 * htmlNewParserCtxt:
4236 *
4237 * Allocate and initialize a new parser context.
4238 *
4239 * Returns the xmlParserCtxtPtr or NULL
4240 */
4241
4242static htmlParserCtxtPtr
4243htmlNewParserCtxt(void)
4244{
4245 xmlParserCtxtPtr ctxt;
4246
4247 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4248 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004249 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004250 return(NULL);
4251 }
4252 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004253 if (htmlInitParserCtxt(ctxt) < 0) {
4254 htmlFreeParserCtxt(ctxt);
4255 return(NULL);
4256 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004257 return(ctxt);
4258}
4259
4260/**
4261 * htmlCreateMemoryParserCtxt:
4262 * @buffer: a pointer to a char array
4263 * @size: the size of the array
4264 *
4265 * Create a parser context for an HTML in-memory document.
4266 *
4267 * Returns the new parser context or NULL
4268 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004269htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004270htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4271 xmlParserCtxtPtr ctxt;
4272 xmlParserInputPtr input;
4273 xmlParserInputBufferPtr buf;
4274
4275 if (buffer == NULL)
4276 return(NULL);
4277 if (size <= 0)
4278 return(NULL);
4279
4280 ctxt = htmlNewParserCtxt();
4281 if (ctxt == NULL)
4282 return(NULL);
4283
4284 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4285 if (buf == NULL) return(NULL);
4286
4287 input = xmlNewInputStream(ctxt);
4288 if (input == NULL) {
4289 xmlFreeParserCtxt(ctxt);
4290 return(NULL);
4291 }
4292
4293 input->filename = NULL;
4294 input->buf = buf;
4295 input->base = input->buf->buffer->content;
4296 input->cur = input->buf->buffer->content;
4297 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4298
4299 inputPush(ctxt, input);
4300 return(ctxt);
4301}
4302
4303/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004304 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004305 * @cur: a pointer to an array of xmlChar
4306 * @encoding: a free form C string describing the HTML document encoding, or NULL
4307 *
4308 * Create a parser context for an HTML document.
4309 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004310 * TODO: check the need to add encoding handling there
4311 *
Owen Taylor3473f882001-02-23 17:55:21 +00004312 * Returns the new parser context or NULL
4313 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004314static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004315htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004316 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004317 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004318
Daniel Veillard1d995272002-07-22 16:43:32 +00004319 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004320 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004321 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004322 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4323
4324 if (encoding != NULL) {
4325 xmlCharEncoding enc;
4326 xmlCharEncodingHandlerPtr handler;
4327
4328 if (ctxt->input->encoding != NULL)
4329 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004330 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004331
4332 enc = xmlParseCharEncoding(encoding);
4333 /*
4334 * registered set of known encodings
4335 */
4336 if (enc != XML_CHAR_ENCODING_ERROR) {
4337 xmlSwitchEncoding(ctxt, enc);
4338 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004339 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4340 "Unsupported encoding %s\n",
4341 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004342 }
4343 } else {
4344 /*
4345 * fallback for unknown encodings
4346 */
4347 handler = xmlFindCharEncodingHandler((const char *) encoding);
4348 if (handler != NULL) {
4349 xmlSwitchToEncoding(ctxt, handler);
4350 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004351 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4352 "Unsupported encoding %s\n",
4353 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004354 }
4355 }
4356 }
4357 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004358}
4359
Daniel Veillard73b013f2003-09-30 12:36:01 +00004360#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004361/************************************************************************
4362 * *
4363 * Progressive parsing interfaces *
4364 * *
4365 ************************************************************************/
4366
4367/**
4368 * htmlParseLookupSequence:
4369 * @ctxt: an HTML parser context
4370 * @first: the first char to lookup
4371 * @next: the next char to lookup or zero
4372 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004373 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004374 *
4375 * Try to find if a sequence (first, next, third) or just (first next) or
4376 * (first) is available in the input stream.
4377 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4378 * to avoid rescanning sequences of bytes, it DOES change the state of the
4379 * parser, do not use liberally.
4380 * This is basically similar to xmlParseLookupSequence()
4381 *
4382 * Returns the index to the current parsing point if the full sequence
4383 * is available, -1 otherwise.
4384 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004385static int
Owen Taylor3473f882001-02-23 17:55:21 +00004386htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004387 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004388 int base, len;
4389 htmlParserInputPtr in;
4390 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004391 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004392
4393 in = ctxt->input;
4394 if (in == NULL) return(-1);
4395 base = in->cur - in->base;
4396 if (base < 0) return(-1);
4397 if (ctxt->checkIndex > base)
4398 base = ctxt->checkIndex;
4399 if (in->buf == NULL) {
4400 buf = in->base;
4401 len = in->length;
4402 } else {
4403 buf = in->buf->buffer->content;
4404 len = in->buf->buffer->use;
4405 }
4406 /* take into account the sequence length */
4407 if (third) len -= 2;
4408 else if (next) len --;
4409 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004410 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004411 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4412 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4413 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004414 /* do not increment past <! - some people use <!--> */
4415 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004416 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004417 }
4418 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004419 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004420 return(-1);
4421 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4422 (buf[base + 2] == '>')) {
4423 incomment = 0;
4424 base += 2;
4425 }
4426 continue;
4427 }
Owen Taylor3473f882001-02-23 17:55:21 +00004428 if (buf[base] == first) {
4429 if (third != 0) {
4430 if ((buf[base + 1] != next) ||
4431 (buf[base + 2] != third)) continue;
4432 } else if (next != 0) {
4433 if (buf[base + 1] != next) continue;
4434 }
4435 ctxt->checkIndex = 0;
4436#ifdef DEBUG_PUSH
4437 if (next == 0)
4438 xmlGenericError(xmlGenericErrorContext,
4439 "HPP: lookup '%c' found at %d\n",
4440 first, base);
4441 else if (third == 0)
4442 xmlGenericError(xmlGenericErrorContext,
4443 "HPP: lookup '%c%c' found at %d\n",
4444 first, next, base);
4445 else
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: lookup '%c%c%c' found at %d\n",
4448 first, next, third, base);
4449#endif
4450 return(base - (in->cur - in->base));
4451 }
4452 }
4453 ctxt->checkIndex = base;
4454#ifdef DEBUG_PUSH
4455 if (next == 0)
4456 xmlGenericError(xmlGenericErrorContext,
4457 "HPP: lookup '%c' failed\n", first);
4458 else if (third == 0)
4459 xmlGenericError(xmlGenericErrorContext,
4460 "HPP: lookup '%c%c' failed\n", first, next);
4461 else
4462 xmlGenericError(xmlGenericErrorContext,
4463 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4464#endif
4465 return(-1);
4466}
4467
4468/**
4469 * htmlParseTryOrFinish:
4470 * @ctxt: an HTML parser context
4471 * @terminate: last chunk indicator
4472 *
4473 * Try to progress on parsing
4474 *
4475 * Returns zero if no parsing was possible
4476 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004477static int
Owen Taylor3473f882001-02-23 17:55:21 +00004478htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4479 int ret = 0;
4480 htmlParserInputPtr in;
4481 int avail = 0;
4482 xmlChar cur, next;
4483
4484#ifdef DEBUG_PUSH
4485 switch (ctxt->instate) {
4486 case XML_PARSER_EOF:
4487 xmlGenericError(xmlGenericErrorContext,
4488 "HPP: try EOF\n"); break;
4489 case XML_PARSER_START:
4490 xmlGenericError(xmlGenericErrorContext,
4491 "HPP: try START\n"); break;
4492 case XML_PARSER_MISC:
4493 xmlGenericError(xmlGenericErrorContext,
4494 "HPP: try MISC\n");break;
4495 case XML_PARSER_COMMENT:
4496 xmlGenericError(xmlGenericErrorContext,
4497 "HPP: try COMMENT\n");break;
4498 case XML_PARSER_PROLOG:
4499 xmlGenericError(xmlGenericErrorContext,
4500 "HPP: try PROLOG\n");break;
4501 case XML_PARSER_START_TAG:
4502 xmlGenericError(xmlGenericErrorContext,
4503 "HPP: try START_TAG\n");break;
4504 case XML_PARSER_CONTENT:
4505 xmlGenericError(xmlGenericErrorContext,
4506 "HPP: try CONTENT\n");break;
4507 case XML_PARSER_CDATA_SECTION:
4508 xmlGenericError(xmlGenericErrorContext,
4509 "HPP: try CDATA_SECTION\n");break;
4510 case XML_PARSER_END_TAG:
4511 xmlGenericError(xmlGenericErrorContext,
4512 "HPP: try END_TAG\n");break;
4513 case XML_PARSER_ENTITY_DECL:
4514 xmlGenericError(xmlGenericErrorContext,
4515 "HPP: try ENTITY_DECL\n");break;
4516 case XML_PARSER_ENTITY_VALUE:
4517 xmlGenericError(xmlGenericErrorContext,
4518 "HPP: try ENTITY_VALUE\n");break;
4519 case XML_PARSER_ATTRIBUTE_VALUE:
4520 xmlGenericError(xmlGenericErrorContext,
4521 "HPP: try ATTRIBUTE_VALUE\n");break;
4522 case XML_PARSER_DTD:
4523 xmlGenericError(xmlGenericErrorContext,
4524 "HPP: try DTD\n");break;
4525 case XML_PARSER_EPILOG:
4526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: try EPILOG\n");break;
4528 case XML_PARSER_PI:
4529 xmlGenericError(xmlGenericErrorContext,
4530 "HPP: try PI\n");break;
4531 case XML_PARSER_SYSTEM_LITERAL:
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: try SYSTEM_LITERAL\n");break;
4534 }
4535#endif
4536
4537 while (1) {
4538
4539 in = ctxt->input;
4540 if (in == NULL) break;
4541 if (in->buf == NULL)
4542 avail = in->length - (in->cur - in->base);
4543 else
4544 avail = in->buf->buffer->use - (in->cur - in->base);
4545 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004546 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004547 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4548 /*
4549 * SAX: end of the document processing.
4550 */
4551 ctxt->instate = XML_PARSER_EOF;
4552 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4553 ctxt->sax->endDocument(ctxt->userData);
4554 }
4555 }
4556 if (avail < 1)
4557 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004558 cur = in->cur[0];
4559 if (cur == 0) {
4560 SKIP(1);
4561 continue;
4562 }
4563
Owen Taylor3473f882001-02-23 17:55:21 +00004564 switch (ctxt->instate) {
4565 case XML_PARSER_EOF:
4566 /*
4567 * Document parsing is done !
4568 */
4569 goto done;
4570 case XML_PARSER_START:
4571 /*
4572 * Very first chars read from the document flow.
4573 */
4574 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004575 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004576 SKIP_BLANKS;
4577 if (in->buf == NULL)
4578 avail = in->length - (in->cur - in->base);
4579 else
4580 avail = in->buf->buffer->use - (in->cur - in->base);
4581 }
4582 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4583 ctxt->sax->setDocumentLocator(ctxt->userData,
4584 &xmlDefaultSAXLocator);
4585 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4586 (!ctxt->disableSAX))
4587 ctxt->sax->startDocument(ctxt->userData);
4588
4589 cur = in->cur[0];
4590 next = in->cur[1];
4591 if ((cur == '<') && (next == '!') &&
4592 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4593 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4594 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4595 (UPP(8) == 'E')) {
4596 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004597 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004598 goto done;
4599#ifdef DEBUG_PUSH
4600 xmlGenericError(xmlGenericErrorContext,
4601 "HPP: Parsing internal subset\n");
4602#endif
4603 htmlParseDocTypeDecl(ctxt);
4604 ctxt->instate = XML_PARSER_PROLOG;
4605#ifdef DEBUG_PUSH
4606 xmlGenericError(xmlGenericErrorContext,
4607 "HPP: entering PROLOG\n");
4608#endif
4609 } else {
4610 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004611#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004612 xmlGenericError(xmlGenericErrorContext,
4613 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004614#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004615 }
Owen Taylor3473f882001-02-23 17:55:21 +00004616 break;
4617 case XML_PARSER_MISC:
4618 SKIP_BLANKS;
4619 if (in->buf == NULL)
4620 avail = in->length - (in->cur - in->base);
4621 else
4622 avail = in->buf->buffer->use - (in->cur - in->base);
4623 if (avail < 2)
4624 goto done;
4625 cur = in->cur[0];
4626 next = in->cur[1];
4627 if ((cur == '<') && (next == '!') &&
4628 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4629 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004630 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004631 goto done;
4632#ifdef DEBUG_PUSH
4633 xmlGenericError(xmlGenericErrorContext,
4634 "HPP: Parsing Comment\n");
4635#endif
4636 htmlParseComment(ctxt);
4637 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004638 } else if ((cur == '<') && (next == '?')) {
4639 if ((!terminate) &&
4640 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4641 goto done;
4642#ifdef DEBUG_PUSH
4643 xmlGenericError(xmlGenericErrorContext,
4644 "HPP: Parsing PI\n");
4645#endif
4646 htmlParsePI(ctxt);
4647 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004648 } else if ((cur == '<') && (next == '!') &&
4649 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4650 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4651 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4652 (UPP(8) == 'E')) {
4653 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004654 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004655 goto done;
4656#ifdef DEBUG_PUSH
4657 xmlGenericError(xmlGenericErrorContext,
4658 "HPP: Parsing internal subset\n");
4659#endif
4660 htmlParseDocTypeDecl(ctxt);
4661 ctxt->instate = XML_PARSER_PROLOG;
4662#ifdef DEBUG_PUSH
4663 xmlGenericError(xmlGenericErrorContext,
4664 "HPP: entering PROLOG\n");
4665#endif
4666 } else if ((cur == '<') && (next == '!') &&
4667 (avail < 9)) {
4668 goto done;
4669 } else {
4670 ctxt->instate = XML_PARSER_START_TAG;
4671#ifdef DEBUG_PUSH
4672 xmlGenericError(xmlGenericErrorContext,
4673 "HPP: entering START_TAG\n");
4674#endif
4675 }
4676 break;
4677 case XML_PARSER_PROLOG:
4678 SKIP_BLANKS;
4679 if (in->buf == NULL)
4680 avail = in->length - (in->cur - in->base);
4681 else
4682 avail = in->buf->buffer->use - (in->cur - in->base);
4683 if (avail < 2)
4684 goto done;
4685 cur = in->cur[0];
4686 next = in->cur[1];
4687 if ((cur == '<') && (next == '!') &&
4688 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4689 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004690 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004691 goto done;
4692#ifdef DEBUG_PUSH
4693 xmlGenericError(xmlGenericErrorContext,
4694 "HPP: Parsing Comment\n");
4695#endif
4696 htmlParseComment(ctxt);
4697 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004698 } else if ((cur == '<') && (next == '?')) {
4699 if ((!terminate) &&
4700 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4701 goto done;
4702#ifdef DEBUG_PUSH
4703 xmlGenericError(xmlGenericErrorContext,
4704 "HPP: Parsing PI\n");
4705#endif
4706 htmlParsePI(ctxt);
4707 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004708 } else if ((cur == '<') && (next == '!') &&
4709 (avail < 4)) {
4710 goto done;
4711 } else {
4712 ctxt->instate = XML_PARSER_START_TAG;
4713#ifdef DEBUG_PUSH
4714 xmlGenericError(xmlGenericErrorContext,
4715 "HPP: entering START_TAG\n");
4716#endif
4717 }
4718 break;
4719 case XML_PARSER_EPILOG:
4720 if (in->buf == NULL)
4721 avail = in->length - (in->cur - in->base);
4722 else
4723 avail = in->buf->buffer->use - (in->cur - in->base);
4724 if (avail < 1)
4725 goto done;
4726 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004727 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004728 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004729 goto done;
4730 }
4731 if (avail < 2)
4732 goto done;
4733 next = in->cur[1];
4734 if ((cur == '<') && (next == '!') &&
4735 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4736 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004737 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004738 goto done;
4739#ifdef DEBUG_PUSH
4740 xmlGenericError(xmlGenericErrorContext,
4741 "HPP: Parsing Comment\n");
4742#endif
4743 htmlParseComment(ctxt);
4744 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004745 } else if ((cur == '<') && (next == '?')) {
4746 if ((!terminate) &&
4747 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4748 goto done;
4749#ifdef DEBUG_PUSH
4750 xmlGenericError(xmlGenericErrorContext,
4751 "HPP: Parsing PI\n");
4752#endif
4753 htmlParsePI(ctxt);
4754 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004755 } else if ((cur == '<') && (next == '!') &&
4756 (avail < 4)) {
4757 goto done;
4758 } else {
4759 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004760 ctxt->wellFormed = 0;
4761 ctxt->instate = XML_PARSER_EOF;
4762#ifdef DEBUG_PUSH
4763 xmlGenericError(xmlGenericErrorContext,
4764 "HPP: entering EOF\n");
4765#endif
4766 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4767 ctxt->sax->endDocument(ctxt->userData);
4768 goto done;
4769 }
4770 break;
4771 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004772 const xmlChar *name, *oldname;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004773 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00004774 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004775
4776 if (avail < 2)
4777 goto done;
4778 cur = in->cur[0];
4779 if (cur != '<') {
4780 ctxt->instate = XML_PARSER_CONTENT;
4781#ifdef DEBUG_PUSH
4782 xmlGenericError(xmlGenericErrorContext,
4783 "HPP: entering CONTENT\n");
4784#endif
4785 break;
4786 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004787 if (in->cur[1] == '/') {
4788 ctxt->instate = XML_PARSER_END_TAG;
4789 ctxt->checkIndex = 0;
4790#ifdef DEBUG_PUSH
4791 xmlGenericError(xmlGenericErrorContext,
4792 "HPP: entering END_TAG\n");
4793#endif
4794 break;
4795 }
Owen Taylor3473f882001-02-23 17:55:21 +00004796 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004797 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004798 goto done;
4799
Daniel Veillard597f1c12005-07-03 23:00:18 +00004800 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004801 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004802 if (failed ||
Owen Taylor3473f882001-02-23 17:55:21 +00004803 (name == NULL)) {
4804 if (CUR == '>')
4805 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004806 break;
4807 }
Owen Taylor3473f882001-02-23 17:55:21 +00004808
4809 /*
4810 * Lookup the info for that element.
4811 */
4812 info = htmlTagLookup(name);
4813 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004814 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4815 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004816 }
4817
4818 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004819 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004820 */
4821 if ((CUR == '/') && (NXT(1) == '>')) {
4822 SKIP(2);
4823 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4824 ctxt->sax->endElement(ctxt->userData, name);
4825 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004826 ctxt->instate = XML_PARSER_CONTENT;
4827#ifdef DEBUG_PUSH
4828 xmlGenericError(xmlGenericErrorContext,
4829 "HPP: entering CONTENT\n");
4830#endif
4831 break;
4832 }
4833
4834 if (CUR == '>') {
4835 NEXT;
4836 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004837 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4838 "Couldn't find end of Start Tag %s\n",
4839 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004840
4841 /*
4842 * end of parsing of this node.
4843 */
4844 if (xmlStrEqual(name, ctxt->name)) {
4845 nodePop(ctxt);
4846 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004847 }
4848
4849 ctxt->instate = XML_PARSER_CONTENT;
4850#ifdef DEBUG_PUSH
4851 xmlGenericError(xmlGenericErrorContext,
4852 "HPP: entering CONTENT\n");
4853#endif
4854 break;
4855 }
4856
4857 /*
4858 * Check for an Empty Element from DTD definition
4859 */
4860 if ((info != NULL) && (info->empty)) {
4861 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4862 ctxt->sax->endElement(ctxt->userData, name);
4863 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004864 }
4865 ctxt->instate = XML_PARSER_CONTENT;
4866#ifdef DEBUG_PUSH
4867 xmlGenericError(xmlGenericErrorContext,
4868 "HPP: entering CONTENT\n");
4869#endif
4870 break;
4871 }
4872 case XML_PARSER_CONTENT: {
4873 long cons;
4874 /*
4875 * Handle preparsed entities and charRef
4876 */
4877 if (ctxt->token != 0) {
4878 xmlChar chr[2] = { 0 , 0 } ;
4879
4880 chr[0] = (xmlChar) ctxt->token;
4881 htmlCheckParagraph(ctxt);
4882 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4883 ctxt->sax->characters(ctxt->userData, chr, 1);
4884 ctxt->token = 0;
4885 ctxt->checkIndex = 0;
4886 }
4887 if ((avail == 1) && (terminate)) {
4888 cur = in->cur[0];
4889 if ((cur != '<') && (cur != '&')) {
4890 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004891 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004892 if (ctxt->sax->ignorableWhitespace != NULL)
4893 ctxt->sax->ignorableWhitespace(
4894 ctxt->userData, &cur, 1);
4895 } else {
4896 htmlCheckParagraph(ctxt);
4897 if (ctxt->sax->characters != NULL)
4898 ctxt->sax->characters(
4899 ctxt->userData, &cur, 1);
4900 }
4901 }
4902 ctxt->token = 0;
4903 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004904 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004905 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004906 }
Owen Taylor3473f882001-02-23 17:55:21 +00004907 }
4908 if (avail < 2)
4909 goto done;
4910 cur = in->cur[0];
4911 next = in->cur[1];
4912 cons = ctxt->nbChars;
4913 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4914 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4915 /*
4916 * Handle SCRIPT/STYLE separately
4917 */
4918 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004919 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004920 goto done;
4921 htmlParseScript(ctxt);
4922 if ((cur == '<') && (next == '/')) {
4923 ctxt->instate = XML_PARSER_END_TAG;
4924 ctxt->checkIndex = 0;
4925#ifdef DEBUG_PUSH
4926 xmlGenericError(xmlGenericErrorContext,
4927 "HPP: entering END_TAG\n");
4928#endif
4929 break;
4930 }
4931 } else {
4932 /*
4933 * Sometimes DOCTYPE arrives in the middle of the document
4934 */
4935 if ((cur == '<') && (next == '!') &&
4936 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4937 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4938 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4939 (UPP(8) == 'E')) {
4940 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004941 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004942 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004943 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4944 "Misplaced DOCTYPE declaration\n",
4945 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004946 htmlParseDocTypeDecl(ctxt);
4947 } else if ((cur == '<') && (next == '!') &&
4948 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4949 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004950 (htmlParseLookupSequence(
4951 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004952 goto done;
4953#ifdef DEBUG_PUSH
4954 xmlGenericError(xmlGenericErrorContext,
4955 "HPP: Parsing Comment\n");
4956#endif
4957 htmlParseComment(ctxt);
4958 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004959 } else if ((cur == '<') && (next == '?')) {
4960 if ((!terminate) &&
4961 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4962 goto done;
4963#ifdef DEBUG_PUSH
4964 xmlGenericError(xmlGenericErrorContext,
4965 "HPP: Parsing PI\n");
4966#endif
4967 htmlParsePI(ctxt);
4968 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00004969 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4970 goto done;
4971 } else if ((cur == '<') && (next == '/')) {
4972 ctxt->instate = XML_PARSER_END_TAG;
4973 ctxt->checkIndex = 0;
4974#ifdef DEBUG_PUSH
4975 xmlGenericError(xmlGenericErrorContext,
4976 "HPP: entering END_TAG\n");
4977#endif
4978 break;
4979 } else if (cur == '<') {
4980 ctxt->instate = XML_PARSER_START_TAG;
4981 ctxt->checkIndex = 0;
4982#ifdef DEBUG_PUSH
4983 xmlGenericError(xmlGenericErrorContext,
4984 "HPP: entering START_TAG\n");
4985#endif
4986 break;
4987 } else if (cur == '&') {
4988 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004989 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004990 goto done;
4991#ifdef DEBUG_PUSH
4992 xmlGenericError(xmlGenericErrorContext,
4993 "HPP: Parsing Reference\n");
4994#endif
4995 /* TODO: check generation of subtrees if noent !!! */
4996 htmlParseReference(ctxt);
4997 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004998 /*
4999 * check that the text sequence is complete
5000 * before handing out the data to the parser
5001 * to avoid problems with erroneous end of
5002 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005003 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005004 if ((!terminate) &&
5005 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5006 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005007 ctxt->checkIndex = 0;
5008#ifdef DEBUG_PUSH
5009 xmlGenericError(xmlGenericErrorContext,
5010 "HPP: Parsing char data\n");
5011#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005012 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005013 }
5014 }
5015 if (cons == ctxt->nbChars) {
5016 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005017 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5018 "detected an error in element content\n",
5019 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005020 }
5021 NEXT;
5022 break;
5023 }
5024
5025 break;
5026 }
5027 case XML_PARSER_END_TAG:
5028 if (avail < 2)
5029 goto done;
5030 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005031 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005032 goto done;
5033 htmlParseEndTag(ctxt);
5034 if (ctxt->nameNr == 0) {
5035 ctxt->instate = XML_PARSER_EPILOG;
5036 } else {
5037 ctxt->instate = XML_PARSER_CONTENT;
5038 }
5039 ctxt->checkIndex = 0;
5040#ifdef DEBUG_PUSH
5041 xmlGenericError(xmlGenericErrorContext,
5042 "HPP: entering CONTENT\n");
5043#endif
5044 break;
5045 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005046 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5047 "HPP: internal error, state == CDATA\n",
5048 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005049 ctxt->instate = XML_PARSER_CONTENT;
5050 ctxt->checkIndex = 0;
5051#ifdef DEBUG_PUSH
5052 xmlGenericError(xmlGenericErrorContext,
5053 "HPP: entering CONTENT\n");
5054#endif
5055 break;
5056 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005057 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5058 "HPP: internal error, state == DTD\n",
5059 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005060 ctxt->instate = XML_PARSER_CONTENT;
5061 ctxt->checkIndex = 0;
5062#ifdef DEBUG_PUSH
5063 xmlGenericError(xmlGenericErrorContext,
5064 "HPP: entering CONTENT\n");
5065#endif
5066 break;
5067 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005068 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5069 "HPP: internal error, state == COMMENT\n",
5070 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005071 ctxt->instate = XML_PARSER_CONTENT;
5072 ctxt->checkIndex = 0;
5073#ifdef DEBUG_PUSH
5074 xmlGenericError(xmlGenericErrorContext,
5075 "HPP: entering CONTENT\n");
5076#endif
5077 break;
5078 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005079 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5080 "HPP: internal error, state == PI\n",
5081 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005082 ctxt->instate = XML_PARSER_CONTENT;
5083 ctxt->checkIndex = 0;
5084#ifdef DEBUG_PUSH
5085 xmlGenericError(xmlGenericErrorContext,
5086 "HPP: entering CONTENT\n");
5087#endif
5088 break;
5089 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005090 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5091 "HPP: internal error, state == ENTITY_DECL\n",
5092 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005093 ctxt->instate = XML_PARSER_CONTENT;
5094 ctxt->checkIndex = 0;
5095#ifdef DEBUG_PUSH
5096 xmlGenericError(xmlGenericErrorContext,
5097 "HPP: entering CONTENT\n");
5098#endif
5099 break;
5100 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005101 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5102 "HPP: internal error, state == ENTITY_VALUE\n",
5103 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005104 ctxt->instate = XML_PARSER_CONTENT;
5105 ctxt->checkIndex = 0;
5106#ifdef DEBUG_PUSH
5107 xmlGenericError(xmlGenericErrorContext,
5108 "HPP: entering DTD\n");
5109#endif
5110 break;
5111 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005112 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5113 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5114 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005115 ctxt->instate = XML_PARSER_START_TAG;
5116 ctxt->checkIndex = 0;
5117#ifdef DEBUG_PUSH
5118 xmlGenericError(xmlGenericErrorContext,
5119 "HPP: entering START_TAG\n");
5120#endif
5121 break;
5122 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005123 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5124 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5125 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005126 ctxt->instate = XML_PARSER_CONTENT;
5127 ctxt->checkIndex = 0;
5128#ifdef DEBUG_PUSH
5129 xmlGenericError(xmlGenericErrorContext,
5130 "HPP: entering CONTENT\n");
5131#endif
5132 break;
5133 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005134 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5135 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5136 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005137 ctxt->instate = XML_PARSER_CONTENT;
5138 ctxt->checkIndex = 0;
5139#ifdef DEBUG_PUSH
5140 xmlGenericError(xmlGenericErrorContext,
5141 "HPP: entering CONTENT\n");
5142#endif
5143 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005144 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005145 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5146 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5147 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005148 ctxt->instate = XML_PARSER_CONTENT;
5149 ctxt->checkIndex = 0;
5150#ifdef DEBUG_PUSH
5151 xmlGenericError(xmlGenericErrorContext,
5152 "HPP: entering CONTENT\n");
5153#endif
5154 break;
5155
Owen Taylor3473f882001-02-23 17:55:21 +00005156 }
5157 }
5158done:
5159 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005160 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005161 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5162 /*
5163 * SAX: end of the document processing.
5164 */
5165 ctxt->instate = XML_PARSER_EOF;
5166 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5167 ctxt->sax->endDocument(ctxt->userData);
5168 }
5169 }
5170 if ((ctxt->myDoc != NULL) &&
5171 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5172 (ctxt->instate == XML_PARSER_EPILOG))) {
5173 xmlDtdPtr dtd;
5174 dtd = xmlGetIntSubset(ctxt->myDoc);
5175 if (dtd == NULL)
5176 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005177 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005178 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5179 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5180 }
5181#ifdef DEBUG_PUSH
5182 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5183#endif
5184 return(ret);
5185}
5186
5187/**
Owen Taylor3473f882001-02-23 17:55:21 +00005188 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005189 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005190 * @chunk: an char array
5191 * @size: the size in byte of the chunk
5192 * @terminate: last chunk indicator
5193 *
5194 * Parse a Chunk of memory
5195 *
5196 * Returns zero if no error, the xmlParserErrors otherwise.
5197 */
5198int
5199htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5200 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005201 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5202 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5203 "htmlParseChunk: context error\n", NULL, NULL);
5204 return(XML_ERR_INTERNAL_ERROR);
5205 }
Owen Taylor3473f882001-02-23 17:55:21 +00005206 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5207 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5208 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5209 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005210 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005211
Daniel Veillardd2755a82005-08-07 23:42:39 +00005212 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5213 if (res < 0) {
5214 ctxt->errNo = XML_PARSER_EOF;
5215 ctxt->disableSAX = 1;
5216 return (XML_PARSER_EOF);
5217 }
Owen Taylor3473f882001-02-23 17:55:21 +00005218 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5219 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005220 ctxt->input->end =
5221 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005222#ifdef DEBUG_PUSH
5223 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5224#endif
5225
Daniel Veillard14f752c2003-08-09 11:44:50 +00005226#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005227 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5228 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005229#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005230 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005231 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5232 xmlParserInputBufferPtr in = ctxt->input->buf;
5233 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5234 (in->raw != NULL)) {
5235 int nbchars;
5236
5237 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5238 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005239 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5240 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005241 return(XML_ERR_INVALID_ENCODING);
5242 }
5243 }
5244 }
Owen Taylor3473f882001-02-23 17:55:21 +00005245 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005246 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005247 if (terminate) {
5248 if ((ctxt->instate != XML_PARSER_EOF) &&
5249 (ctxt->instate != XML_PARSER_EPILOG) &&
5250 (ctxt->instate != XML_PARSER_MISC)) {
5251 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005252 ctxt->wellFormed = 0;
5253 }
5254 if (ctxt->instate != XML_PARSER_EOF) {
5255 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5256 ctxt->sax->endDocument(ctxt->userData);
5257 }
5258 ctxt->instate = XML_PARSER_EOF;
5259 }
5260 return((xmlParserErrors) ctxt->errNo);
5261}
5262
5263/************************************************************************
5264 * *
5265 * User entry points *
5266 * *
5267 ************************************************************************/
5268
5269/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005270 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005271 * @sax: a SAX handler
5272 * @user_data: The user data returned on SAX callbacks
5273 * @chunk: a pointer to an array of chars
5274 * @size: number of chars in the array
5275 * @filename: an optional file name or URI
5276 * @enc: an optional encoding
5277 *
5278 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005279 * The value of @filename is used for fetching external entities
5280 * and error/warning reports.
5281 *
5282 * Returns the new parser context or NULL
5283 */
5284htmlParserCtxtPtr
5285htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5286 const char *chunk, int size, const char *filename,
5287 xmlCharEncoding enc) {
5288 htmlParserCtxtPtr ctxt;
5289 htmlParserInputPtr inputStream;
5290 xmlParserInputBufferPtr buf;
5291
Daniel Veillardd0463562001-10-13 09:15:48 +00005292 xmlInitParser();
5293
Owen Taylor3473f882001-02-23 17:55:21 +00005294 buf = xmlAllocParserInputBuffer(enc);
5295 if (buf == NULL) return(NULL);
5296
Daniel Veillardf403d292003-10-05 13:51:35 +00005297 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005298 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005299 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005300 return(NULL);
5301 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005302 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5303 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005304 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005305 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005306 xmlFree(ctxt->sax);
5307 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5308 if (ctxt->sax == NULL) {
5309 xmlFree(buf);
5310 xmlFree(ctxt);
5311 return(NULL);
5312 }
5313 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5314 if (user_data != NULL)
5315 ctxt->userData = user_data;
5316 }
5317 if (filename == NULL) {
5318 ctxt->directory = NULL;
5319 } else {
5320 ctxt->directory = xmlParserGetDirectory(filename);
5321 }
5322
5323 inputStream = htmlNewInputStream(ctxt);
5324 if (inputStream == NULL) {
5325 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005326 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005327 return(NULL);
5328 }
5329
5330 if (filename == NULL)
5331 inputStream->filename = NULL;
5332 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005333 inputStream->filename = (char *)
5334 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005335 inputStream->buf = buf;
5336 inputStream->base = inputStream->buf->buffer->content;
5337 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005338 inputStream->end =
5339 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005340
5341 inputPush(ctxt, inputStream);
5342
5343 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5344 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005345 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5346 int cur = ctxt->input->cur - ctxt->input->base;
5347
Owen Taylor3473f882001-02-23 17:55:21 +00005348 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005349
5350 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5351 ctxt->input->cur = ctxt->input->base + cur;
5352 ctxt->input->end =
5353 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005354#ifdef DEBUG_PUSH
5355 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5356#endif
5357 }
5358
5359 return(ctxt);
5360}
William M. Brack21e4ef22005-01-02 09:53:13 +00005361#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005362
5363/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005364 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005365 * @cur: a pointer to an array of xmlChar
5366 * @encoding: a free form C string describing the HTML document encoding, or NULL
5367 * @sax: the SAX handler block
5368 * @userData: if using SAX, this pointer will be provided on callbacks.
5369 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005370 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5371 * to handle parse events. If sax is NULL, fallback to the default DOM
5372 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005373 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005374 * Returns the resulting document tree unless SAX is NULL or the document is
5375 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005376 */
5377
5378htmlDocPtr
5379htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5380 htmlDocPtr ret;
5381 htmlParserCtxtPtr ctxt;
5382
Daniel Veillardd0463562001-10-13 09:15:48 +00005383 xmlInitParser();
5384
Owen Taylor3473f882001-02-23 17:55:21 +00005385 if (cur == NULL) return(NULL);
5386
5387
5388 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5389 if (ctxt == NULL) return(NULL);
5390 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005391 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005392 ctxt->sax = sax;
5393 ctxt->userData = userData;
5394 }
5395
5396 htmlParseDocument(ctxt);
5397 ret = ctxt->myDoc;
5398 if (sax != NULL) {
5399 ctxt->sax = NULL;
5400 ctxt->userData = NULL;
5401 }
5402 htmlFreeParserCtxt(ctxt);
5403
5404 return(ret);
5405}
5406
5407/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005408 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005409 * @cur: a pointer to an array of xmlChar
5410 * @encoding: a free form C string describing the HTML document encoding, or NULL
5411 *
5412 * parse an HTML in-memory document and build a tree.
5413 *
5414 * Returns the resulting document tree
5415 */
5416
5417htmlDocPtr
5418htmlParseDoc(xmlChar *cur, const char *encoding) {
5419 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5420}
5421
5422
5423/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005424 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005425 * @filename: the filename
5426 * @encoding: a free form C string describing the HTML document encoding, or NULL
5427 *
5428 * Create a parser context for a file content.
5429 * Automatic support for ZLIB/Compress compressed document is provided
5430 * by default if found at compile-time.
5431 *
5432 * Returns the new parser context or NULL
5433 */
5434htmlParserCtxtPtr
5435htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5436{
5437 htmlParserCtxtPtr ctxt;
5438 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005439 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005440 /* htmlCharEncoding enc; */
5441 xmlChar *content, *content_line = (xmlChar *) "charset=";
5442
Daniel Veillarda03e3652004-11-02 18:45:30 +00005443 if (filename == NULL)
5444 return(NULL);
5445
Daniel Veillardf403d292003-10-05 13:51:35 +00005446 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005447 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005448 return(NULL);
5449 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005450 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5451 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005452#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005453 if (xmlDefaultSAXHandler.error != NULL) {
5454 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5455 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005456#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005457 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005458 return(NULL);
5459 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005460
5461 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5462 xmlFree(canonicFilename);
5463 if (inputStream == NULL) {
5464 xmlFreeParserCtxt(ctxt);
5465 return(NULL);
5466 }
Owen Taylor3473f882001-02-23 17:55:21 +00005467
5468 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005469
Owen Taylor3473f882001-02-23 17:55:21 +00005470 /* set encoding */
5471 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005472 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005473 if (content) {
5474 strcpy ((char *)content, (char *)content_line);
5475 strcat ((char *)content, (char *)encoding);
5476 htmlCheckEncoding (ctxt, content);
5477 xmlFree (content);
5478 }
5479 }
5480
5481 return(ctxt);
5482}
5483
5484/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005485 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005486 * @filename: the filename
5487 * @encoding: a free form C string describing the HTML document encoding, or NULL
5488 * @sax: the SAX handler block
5489 * @userData: if using SAX, this pointer will be provided on callbacks.
5490 *
5491 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5492 * compressed document is provided by default if found at compile-time.
5493 * It use the given SAX function block to handle the parsing callback.
5494 * If sax is NULL, fallback to the default DOM tree building routines.
5495 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005496 * Returns the resulting document tree unless SAX is NULL or the document is
5497 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005498 */
5499
5500htmlDocPtr
5501htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5502 void *userData) {
5503 htmlDocPtr ret;
5504 htmlParserCtxtPtr ctxt;
5505 htmlSAXHandlerPtr oldsax = NULL;
5506
Daniel Veillardd0463562001-10-13 09:15:48 +00005507 xmlInitParser();
5508
Owen Taylor3473f882001-02-23 17:55:21 +00005509 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5510 if (ctxt == NULL) return(NULL);
5511 if (sax != NULL) {
5512 oldsax = ctxt->sax;
5513 ctxt->sax = sax;
5514 ctxt->userData = userData;
5515 }
5516
5517 htmlParseDocument(ctxt);
5518
5519 ret = ctxt->myDoc;
5520 if (sax != NULL) {
5521 ctxt->sax = oldsax;
5522 ctxt->userData = NULL;
5523 }
5524 htmlFreeParserCtxt(ctxt);
5525
5526 return(ret);
5527}
5528
5529/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005530 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005531 * @filename: the filename
5532 * @encoding: a free form C string describing the HTML document encoding, or NULL
5533 *
5534 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5535 * compressed document is provided by default if found at compile-time.
5536 *
5537 * Returns the resulting document tree
5538 */
5539
5540htmlDocPtr
5541htmlParseFile(const char *filename, const char *encoding) {
5542 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5543}
5544
5545/**
5546 * htmlHandleOmittedElem:
5547 * @val: int 0 or 1
5548 *
5549 * Set and return the previous value for handling HTML omitted tags.
5550 *
5551 * Returns the last value for 0 for no handling, 1 for auto insertion.
5552 */
5553
5554int
5555htmlHandleOmittedElem(int val) {
5556 int old = htmlOmittedDefaultValue;
5557
5558 htmlOmittedDefaultValue = val;
5559 return(old);
5560}
5561
Daniel Veillard930dfb62003-02-05 10:17:38 +00005562/**
5563 * htmlElementAllowedHere:
5564 * @parent: HTML parent element
5565 * @elt: HTML element
5566 *
5567 * Checks whether an HTML element may be a direct child of a parent element.
5568 * Note - doesn't check for deprecated elements
5569 *
5570 * Returns 1 if allowed; 0 otherwise.
5571 */
5572int
5573htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5574 const char** p ;
5575
5576 if ( ! elt || ! parent || ! parent->subelts )
5577 return 0 ;
5578
5579 for ( p = parent->subelts; *p; ++p )
5580 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5581 return 1 ;
5582
5583 return 0 ;
5584}
5585/**
5586 * htmlElementStatusHere:
5587 * @parent: HTML parent element
5588 * @elt: HTML element
5589 *
5590 * Checks whether an HTML element may be a direct child of a parent element.
5591 * and if so whether it is valid or deprecated.
5592 *
5593 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5594 */
5595htmlStatus
5596htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5597 if ( ! parent || ! elt )
5598 return HTML_INVALID ;
5599 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5600 return HTML_INVALID ;
5601
5602 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5603}
5604/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005605 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005606 * @elt: HTML element
5607 * @attr: HTML attribute
5608 * @legacy: whether to allow deprecated attributes
5609 *
5610 * Checks whether an attribute is valid for an element
5611 * Has full knowledge of Required and Deprecated attributes
5612 *
5613 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5614 */
5615htmlStatus
5616htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5617 const char** p ;
5618
5619 if ( !elt || ! attr )
5620 return HTML_INVALID ;
5621
5622 if ( elt->attrs_req )
5623 for ( p = elt->attrs_req; *p; ++p)
5624 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5625 return HTML_REQUIRED ;
5626
5627 if ( elt->attrs_opt )
5628 for ( p = elt->attrs_opt; *p; ++p)
5629 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5630 return HTML_VALID ;
5631
5632 if ( legacy && elt->attrs_depr )
5633 for ( p = elt->attrs_depr; *p; ++p)
5634 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5635 return HTML_DEPRECATED ;
5636
5637 return HTML_INVALID ;
5638}
5639/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005640 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005641 * @node: an htmlNodePtr in a tree
5642 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005643 * for Element nodes)
5644 *
5645 * Checks whether the tree node is valid. Experimental (the author
5646 * only uses the HTML enhancements in a SAX parser)
5647 *
5648 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5649 * legacy allowed) or htmlElementStatusHere (otherwise).
5650 * for Attribute nodes, a return from htmlAttrAllowed
5651 * for other nodes, HTML_NA (no checks performed)
5652 */
5653htmlStatus
5654htmlNodeStatus(const htmlNodePtr node, int legacy) {
5655 if ( ! node )
5656 return HTML_INVALID ;
5657
5658 switch ( node->type ) {
5659 case XML_ELEMENT_NODE:
5660 return legacy
5661 ? ( htmlElementAllowedHere (
5662 htmlTagLookup(node->parent->name) , node->name
5663 ) ? HTML_VALID : HTML_INVALID )
5664 : htmlElementStatusHere(
5665 htmlTagLookup(node->parent->name) ,
5666 htmlTagLookup(node->name) )
5667 ;
5668 case XML_ATTRIBUTE_NODE:
5669 return htmlAttrAllowed(
5670 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5671 default: return HTML_NA ;
5672 }
5673}
Daniel Veillard9475a352003-09-26 12:47:50 +00005674/************************************************************************
5675 * *
5676 * New set (2.6.0) of simpler and more flexible APIs *
5677 * *
5678 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is used; a NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the expansion is one
 * statement and "DICT_FREE(x);" can be used safely inside an unbraced
 * if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5690
5691/**
5692 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005693 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005694 *
5695 * Reset a parser context
5696 */
5697void
5698htmlCtxtReset(htmlParserCtxtPtr ctxt)
5699{
5700 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005701 xmlDictPtr dict;
5702
5703 if (ctxt == NULL)
5704 return;
5705
5706 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005707
5708 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5709 xmlFreeInputStream(input);
5710 }
5711 ctxt->inputNr = 0;
5712 ctxt->input = NULL;
5713
5714 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005715 if (ctxt->spaceTab != NULL) {
5716 ctxt->spaceTab[0] = -1;
5717 ctxt->space = &ctxt->spaceTab[0];
5718 } else {
5719 ctxt->space = NULL;
5720 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005721
5722
5723 ctxt->nodeNr = 0;
5724 ctxt->node = NULL;
5725
5726 ctxt->nameNr = 0;
5727 ctxt->name = NULL;
5728
5729 DICT_FREE(ctxt->version);
5730 ctxt->version = NULL;
5731 DICT_FREE(ctxt->encoding);
5732 ctxt->encoding = NULL;
5733 DICT_FREE(ctxt->directory);
5734 ctxt->directory = NULL;
5735 DICT_FREE(ctxt->extSubURI);
5736 ctxt->extSubURI = NULL;
5737 DICT_FREE(ctxt->extSubSystem);
5738 ctxt->extSubSystem = NULL;
5739 if (ctxt->myDoc != NULL)
5740 xmlFreeDoc(ctxt->myDoc);
5741 ctxt->myDoc = NULL;
5742
5743 ctxt->standalone = -1;
5744 ctxt->hasExternalSubset = 0;
5745 ctxt->hasPErefs = 0;
5746 ctxt->html = 1;
5747 ctxt->external = 0;
5748 ctxt->instate = XML_PARSER_START;
5749 ctxt->token = 0;
5750
5751 ctxt->wellFormed = 1;
5752 ctxt->nsWellFormed = 1;
5753 ctxt->valid = 1;
5754 ctxt->vctxt.userData = ctxt;
5755 ctxt->vctxt.error = xmlParserValidityError;
5756 ctxt->vctxt.warning = xmlParserValidityWarning;
5757 ctxt->record_info = 0;
5758 ctxt->nbChars = 0;
5759 ctxt->checkIndex = 0;
5760 ctxt->inSubset = 0;
5761 ctxt->errNo = XML_ERR_OK;
5762 ctxt->depth = 0;
5763 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5764 ctxt->catalogs = NULL;
5765 xmlInitNodeInfoSeq(&ctxt->node_seq);
5766
5767 if (ctxt->attsDefault != NULL) {
5768 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5769 ctxt->attsDefault = NULL;
5770 }
5771 if (ctxt->attsSpecial != NULL) {
5772 xmlHashFree(ctxt->attsSpecial, NULL);
5773 ctxt->attsSpecial = NULL;
5774 }
5775}
5776
5777/**
5778 * htmlCtxtUseOptions:
5779 * @ctxt: an HTML parser context
5780 * @options: a combination of htmlParserOption(s)
5781 *
5782 * Applies the options to the parser context
5783 *
5784 * Returns 0 in case of success, the set of unknown or unimplemented options
5785 * in case of error.
5786 */
5787int
5788htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5789{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005790 if (ctxt == NULL)
5791 return(-1);
5792
Daniel Veillard9475a352003-09-26 12:47:50 +00005793 if (options & HTML_PARSE_NOWARNING) {
5794 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005795 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005796 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005797 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005798 }
5799 if (options & HTML_PARSE_NOERROR) {
5800 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005801 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005802 ctxt->sax->fatalError = NULL;
5803 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005804 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005805 }
5806 if (options & HTML_PARSE_PEDANTIC) {
5807 ctxt->pedantic = 1;
5808 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005809 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005810 } else
5811 ctxt->pedantic = 0;
5812 if (options & XML_PARSE_NOBLANKS) {
5813 ctxt->keepBlanks = 0;
5814 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5815 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005816 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005817 } else
5818 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005819 if (options & HTML_PARSE_RECOVER) {
5820 ctxt->recovery = 1;
5821 } else
5822 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00005823 if (options & HTML_PARSE_COMPACT) {
5824 ctxt->options |= HTML_PARSE_COMPACT;
5825 options -= HTML_PARSE_COMPACT;
5826 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005827 ctxt->dictNames = 0;
5828 return (options);
5829}
5830
5831/**
5832 * htmlDoRead:
5833 * @ctxt: an HTML parser context
5834 * @URL: the base URL to use for the document
5835 * @encoding: the document encoding, or NULL
5836 * @options: a combination of htmlParserOption(s)
5837 * @reuse: keep the context for reuse
5838 *
5839 * Common front-end for the htmlRead functions
5840 *
5841 * Returns the resulting document tree or NULL
5842 */
5843static htmlDocPtr
5844htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5845 int options, int reuse)
5846{
5847 htmlDocPtr ret;
5848
5849 htmlCtxtUseOptions(ctxt, options);
5850 ctxt->html = 1;
5851 if (encoding != NULL) {
5852 xmlCharEncodingHandlerPtr hdlr;
5853
5854 hdlr = xmlFindCharEncodingHandler(encoding);
5855 if (hdlr != NULL)
5856 xmlSwitchToEncoding(ctxt, hdlr);
5857 }
5858 if ((URL != NULL) && (ctxt->input != NULL) &&
5859 (ctxt->input->filename == NULL))
5860 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5861 htmlParseDocument(ctxt);
5862 ret = ctxt->myDoc;
5863 ctxt->myDoc = NULL;
5864 if (!reuse) {
5865 if ((ctxt->dictNames) &&
5866 (ret != NULL) &&
5867 (ret->dict == ctxt->dict))
5868 ctxt->dict = NULL;
5869 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005870 }
5871 return (ret);
5872}
5873
5874/**
5875 * htmlReadDoc:
5876 * @cur: a pointer to a zero terminated string
5877 * @URL: the base URL to use for the document
5878 * @encoding: the document encoding, or NULL
5879 * @options: a combination of htmlParserOption(s)
5880 *
5881 * parse an XML in-memory document and build a tree.
5882 *
5883 * Returns the resulting document tree
5884 */
5885htmlDocPtr
5886htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5887{
5888 htmlParserCtxtPtr ctxt;
5889
5890 if (cur == NULL)
5891 return (NULL);
5892
5893 ctxt = xmlCreateDocParserCtxt(cur);
5894 if (ctxt == NULL)
5895 return (NULL);
5896 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5897}
5898
5899/**
5900 * htmlReadFile:
5901 * @filename: a file or URL
5902 * @encoding: the document encoding, or NULL
5903 * @options: a combination of htmlParserOption(s)
5904 *
5905 * parse an XML file from the filesystem or the network.
5906 *
5907 * Returns the resulting document tree
5908 */
5909htmlDocPtr
5910htmlReadFile(const char *filename, const char *encoding, int options)
5911{
5912 htmlParserCtxtPtr ctxt;
5913
5914 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5915 if (ctxt == NULL)
5916 return (NULL);
5917 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5918}
5919
5920/**
5921 * htmlReadMemory:
5922 * @buffer: a pointer to a char array
5923 * @size: the size of the array
5924 * @URL: the base URL to use for the document
5925 * @encoding: the document encoding, or NULL
5926 * @options: a combination of htmlParserOption(s)
5927 *
5928 * parse an XML in-memory document and build a tree.
5929 *
5930 * Returns the resulting document tree
5931 */
5932htmlDocPtr
5933htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5934{
5935 htmlParserCtxtPtr ctxt;
5936
5937 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5938 if (ctxt == NULL)
5939 return (NULL);
William M. Brackd43cdcd2004-08-03 15:13:29 +00005940 if (ctxt->sax != NULL)
5941 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00005942 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5943}
5944
5945/**
5946 * htmlReadFd:
5947 * @fd: an open file descriptor
5948 * @URL: the base URL to use for the document
5949 * @encoding: the document encoding, or NULL
5950 * @options: a combination of htmlParserOption(s)
5951 *
5952 * parse an XML from a file descriptor and build a tree.
5953 *
5954 * Returns the resulting document tree
5955 */
5956htmlDocPtr
5957htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5958{
5959 htmlParserCtxtPtr ctxt;
5960 xmlParserInputBufferPtr input;
5961 xmlParserInputPtr stream;
5962
5963 if (fd < 0)
5964 return (NULL);
5965
5966 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5967 if (input == NULL)
5968 return (NULL);
5969 ctxt = xmlNewParserCtxt();
5970 if (ctxt == NULL) {
5971 xmlFreeParserInputBuffer(input);
5972 return (NULL);
5973 }
5974 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5975 if (stream == NULL) {
5976 xmlFreeParserInputBuffer(input);
5977 xmlFreeParserCtxt(ctxt);
5978 return (NULL);
5979 }
5980 inputPush(ctxt, stream);
5981 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5982}
5983
5984/**
5985 * htmlReadIO:
5986 * @ioread: an I/O read function
5987 * @ioclose: an I/O close function
5988 * @ioctx: an I/O handler
5989 * @URL: the base URL to use for the document
5990 * @encoding: the document encoding, or NULL
5991 * @options: a combination of htmlParserOption(s)
5992 *
5993 * parse an HTML document from I/O functions and source and build a tree.
5994 *
5995 * Returns the resulting document tree
5996 */
5997htmlDocPtr
5998htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5999 void *ioctx, const char *URL, const char *encoding, int options)
6000{
6001 htmlParserCtxtPtr ctxt;
6002 xmlParserInputBufferPtr input;
6003 xmlParserInputPtr stream;
6004
6005 if (ioread == NULL)
6006 return (NULL);
6007
6008 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6009 XML_CHAR_ENCODING_NONE);
6010 if (input == NULL)
6011 return (NULL);
6012 ctxt = xmlNewParserCtxt();
6013 if (ctxt == NULL) {
6014 xmlFreeParserInputBuffer(input);
6015 return (NULL);
6016 }
6017 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6018 if (stream == NULL) {
6019 xmlFreeParserInputBuffer(input);
6020 xmlFreeParserCtxt(ctxt);
6021 return (NULL);
6022 }
6023 inputPush(ctxt, stream);
6024 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6025}
6026
6027/**
6028 * htmlCtxtReadDoc:
6029 * @ctxt: an HTML parser context
6030 * @cur: a pointer to a zero terminated string
6031 * @URL: the base URL to use for the document
6032 * @encoding: the document encoding, or NULL
6033 * @options: a combination of htmlParserOption(s)
6034 *
6035 * parse an XML in-memory document and build a tree.
6036 * This reuses the existing @ctxt parser context
6037 *
6038 * Returns the resulting document tree
6039 */
6040htmlDocPtr
6041htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6042 const char *URL, const char *encoding, int options)
6043{
6044 xmlParserInputPtr stream;
6045
6046 if (cur == NULL)
6047 return (NULL);
6048 if (ctxt == NULL)
6049 return (NULL);
6050
6051 htmlCtxtReset(ctxt);
6052
6053 stream = xmlNewStringInputStream(ctxt, cur);
6054 if (stream == NULL) {
6055 return (NULL);
6056 }
6057 inputPush(ctxt, stream);
6058 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6059}
6060
6061/**
6062 * htmlCtxtReadFile:
6063 * @ctxt: an HTML parser context
6064 * @filename: a file or URL
6065 * @encoding: the document encoding, or NULL
6066 * @options: a combination of htmlParserOption(s)
6067 *
6068 * parse an XML file from the filesystem or the network.
6069 * This reuses the existing @ctxt parser context
6070 *
6071 * Returns the resulting document tree
6072 */
6073htmlDocPtr
6074htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6075 const char *encoding, int options)
6076{
6077 xmlParserInputPtr stream;
6078
6079 if (filename == NULL)
6080 return (NULL);
6081 if (ctxt == NULL)
6082 return (NULL);
6083
6084 htmlCtxtReset(ctxt);
6085
Daniel Veillard29614c72004-11-26 10:47:26 +00006086 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006087 if (stream == NULL) {
6088 return (NULL);
6089 }
6090 inputPush(ctxt, stream);
6091 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6092}
6093
6094/**
6095 * htmlCtxtReadMemory:
6096 * @ctxt: an HTML parser context
6097 * @buffer: a pointer to a char array
6098 * @size: the size of the array
6099 * @URL: the base URL to use for the document
6100 * @encoding: the document encoding, or NULL
6101 * @options: a combination of htmlParserOption(s)
6102 *
6103 * parse an XML in-memory document and build a tree.
6104 * This reuses the existing @ctxt parser context
6105 *
6106 * Returns the resulting document tree
6107 */
6108htmlDocPtr
6109htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6110 const char *URL, const char *encoding, int options)
6111{
6112 xmlParserInputBufferPtr input;
6113 xmlParserInputPtr stream;
6114
6115 if (ctxt == NULL)
6116 return (NULL);
6117 if (buffer == NULL)
6118 return (NULL);
6119
6120 htmlCtxtReset(ctxt);
6121
6122 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6123 if (input == NULL) {
6124 return(NULL);
6125 }
6126
6127 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6128 if (stream == NULL) {
6129 xmlFreeParserInputBuffer(input);
6130 return(NULL);
6131 }
6132
6133 inputPush(ctxt, stream);
6134 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6135}
6136
6137/**
6138 * htmlCtxtReadFd:
6139 * @ctxt: an HTML parser context
6140 * @fd: an open file descriptor
6141 * @URL: the base URL to use for the document
6142 * @encoding: the document encoding, or NULL
6143 * @options: a combination of htmlParserOption(s)
6144 *
6145 * parse an XML from a file descriptor and build a tree.
6146 * This reuses the existing @ctxt parser context
6147 *
6148 * Returns the resulting document tree
6149 */
6150htmlDocPtr
6151htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6152 const char *URL, const char *encoding, int options)
6153{
6154 xmlParserInputBufferPtr input;
6155 xmlParserInputPtr stream;
6156
6157 if (fd < 0)
6158 return (NULL);
6159 if (ctxt == NULL)
6160 return (NULL);
6161
6162 htmlCtxtReset(ctxt);
6163
6164
6165 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6166 if (input == NULL)
6167 return (NULL);
6168 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6169 if (stream == NULL) {
6170 xmlFreeParserInputBuffer(input);
6171 return (NULL);
6172 }
6173 inputPush(ctxt, stream);
6174 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6175}
6176
6177/**
6178 * htmlCtxtReadIO:
6179 * @ctxt: an HTML parser context
6180 * @ioread: an I/O read function
6181 * @ioclose: an I/O close function
6182 * @ioctx: an I/O handler
6183 * @URL: the base URL to use for the document
6184 * @encoding: the document encoding, or NULL
6185 * @options: a combination of htmlParserOption(s)
6186 *
6187 * parse an HTML document from I/O functions and source and build a tree.
6188 * This reuses the existing @ctxt parser context
6189 *
6190 * Returns the resulting document tree
6191 */
6192htmlDocPtr
6193htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6194 xmlInputCloseCallback ioclose, void *ioctx,
6195 const char *URL,
6196 const char *encoding, int options)
6197{
6198 xmlParserInputBufferPtr input;
6199 xmlParserInputPtr stream;
6200
6201 if (ioread == NULL)
6202 return (NULL);
6203 if (ctxt == NULL)
6204 return (NULL);
6205
6206 htmlCtxtReset(ctxt);
6207
6208 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6209 XML_CHAR_ENCODING_NONE);
6210 if (input == NULL)
6211 return (NULL);
6212 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6213 if (stream == NULL) {
6214 xmlFreeParserInputBuffer(input);
6215 return (NULL);
6216 }
6217 inputPush(ctxt, stream);
6218 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6219}
6220
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006221#define bottom_HTMLparser
6222#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006223#endif /* LIBXML_HTML_ENABLED */