blob: f2d1bd2d435c2496d7871fd2412fe0bc3586b00c [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. Every macro expects a variable named ctxt (the parser
 * context) to be in scope where it is expanded.
 *
 * Byte-level macros: they read raw xmlChar values without any decoding, so
 * their result should only be compared against ASCII values.
 *
 *   CUR_PTR    the current input pointer.
 *   CUR        the xmlChar under the input pointer, as an int.
 *   CURRENT    same expansion as CUR.
 *   RAW        -1 when ctxt->token is non-zero, otherwise the xmlChar under
 *              the input pointer.
 *   NXT(n)     the xmlChar n positions after the input pointer.
 *   UPPER      toupper() of the xmlChar under the input pointer.
 *   UPP(n)     toupper() of NXT(n).
 *   SKIP(n)    advance the input pointer, the column and nbChars by n. It
 *              does no line accounting, so use it only on ASCII strings
 *              without newlines.
 *
 * Character-level macros:
 *
 *   NEXT               xmlNextChar(ctxt).
 *   NEXTL(l)           step over one character that is l xmlChars long,
 *                      updating line/column and clearing ctxt->token.
 *   CUR_CHAR(l)        htmlCurrentChar(), storing the char length in l.
 *   CUR_SCHAR(s, l)    xmlStringCurrentChar() on string s, storing the char
 *                      length in l.
 *   COPY_BUF(l,b,i,v)  store char v (l xmlChars long) in buffer b at index
 *                      i and advance i.
 *
 * Input buffer management:
 *
 *   SHRINK        call xmlParserInputShrink() once more than
 *                 2 * INPUT_CHUNK has been consumed and fewer than
 *                 2 * INPUT_CHUNK remain.
 *   GROW          when not in progressive mode and fewer than INPUT_CHUNK
 *                 xmlChars remain, call xmlParserInputGrow().
 *   SKIP_BLANKS   htmlSkipBlankChars(ctxt).
 *
 * NOTE(review): SHRINK, GROW and COPY_BUF expand to a bare if statement and
 * SKIP to a comma expression; use them only as complete statements and
 * never directly in front of an else.
 */

/* Raw access to the input buffer */
#define CUR_PTR ctxt->input->cur

#define NXT(val) ctxt->input->cur[(val)]

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Moving forward in the input */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NEXT xmlNextChar(ctxt)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Input buffer management */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Decoded character access */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Owen Taylor3473f882001-02-23 17:55:21 +0000405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000410 }
411
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
413 *len = 1;
414 return((int) *ctxt->input->cur);
415}
416
417/**
Owen Taylor3473f882001-02-23 17:55:21 +0000418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000426static int
Owen Taylor3473f882001-02-23 17:55:21 +0000427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428 int res = 0;
429
William M. Brack76e95df2003-10-18 16:20:14 +0000430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433 xmlPopInput(ctxt);
434 } else {
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
438 ctxt->input->cur++;
439 ctxt->nbChars++;
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442 }
443 res++;
444 }
445 return(res);
446}
447
448
449
450/************************************************************************
451 * *
452 * The list of HTML elements and their properties *
453 * *
454 ************************************************************************/
455
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000468
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of string literals
 * WITHOUT a trailing comma; the matching NB_ macro is the number of
 * entries. Groups must always be joined with an explicit comma: two
 * adjacent string literals are silently concatenated by the compiler
 * (e.g. "small" "em" becomes the single name "smallem").
 * PCDATA and MODIFIER are empty placeholders, so no comma follows them.
 * The NB_ sums are parenthesized so they can be used inside expressions.
 */
#define PCDATA
#define NB_PCDATA	0
#define FONTSTYLE	"tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE	8
#define PHRASE		"em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE	10
#define SPECIAL		"a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL	15
#define FORMCTRL	"input", "select", "textarea", "label", "button"
#define NB_FORMCTRL	5
#define INLINE		PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE	(NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define HEADING		"h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING	6
#define LIST		"ul", "ol", "dir", "menu"
#define NB_LIST		4
#define BLOCK		HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK	(NB_HEADING + NB_LIST + 14)
#define MODIFIER
#define NB_MODIFIER	0
#define FLOW		BLOCK,INLINE
#define NB_FLOW		(NB_BLOCK + NB_INLINE)
#define EMPTY		NULL


/* All tables below are NULL-terminated lists of names. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS	"id", "class", "style", "title"
#define NB_COREATTRS	4
#define I18N		"lang", "dir"
#define NB_I18N		2
#define EVENTS		"onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS	9
#define ATTRS		COREATTRS,I18N,EVENTS
#define NB_ATTRS	(NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN	"align", "char", "charoff"
#define NB_CELLHALIGN	3
#define CELLVALIGN	"valign"
#define NB_CELLVALIGN	1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000617
618#define DECL (const char**)
619
Daniel Veillard22090732001-07-16 00:06:07 +0000620static const htmlElemDesc
621html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000622{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
623 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
624},
625{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
626 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
627},
628{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
630},
631{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
632 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
633},
634{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
635 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
636},
637{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
638 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
639},
640{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
641 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
642},
643{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
644 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
645},
646{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
647 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
648},
649{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
650 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
651},
652{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
653 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
654},
655{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
656 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
657},
658{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
659 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
660},
661{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
662 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
663},
664{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
665 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
666},
667{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
669},
670{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
671 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
672},
673{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
674 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
675},
676{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
677 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
678},
679{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
680 EMPTY , NULL , DECL col_attrs , NULL, NULL
681},
682{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
683 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
684},
685{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
686 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
687},
688{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
689 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
690},
691{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
692 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
693},
694{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
695 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
696},
697{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
698 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
699},
700{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
701 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
702},
703{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
704 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
705},
706{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
707 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
708},
709{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
710 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
711},
712{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
713 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
714},
715{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
716 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
717},
718{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
719 EMPTY, NULL, NULL, DECL frame_attrs, NULL
720},
721{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
722 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
723},
724{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
725 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
726},
727{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
728 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
729},
730{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
731 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
732},
733{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
734 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
735},
736{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
737 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
738},
739{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
740 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
741},
742{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
743 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
744},
745{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
746 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
747},
748{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
749 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
750},
751{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
752 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
753},
754{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
755 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
756},
757{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
758 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
759},
760{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
761 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
762},
763{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
764 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
765},
766{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
767 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
768},
769{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
770 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
771},
772{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
773 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
774},
775{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
776 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
777},
778{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
779 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
780},
781{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
782 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
783},
784{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
785 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
786},
787{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
788 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
789},
790{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
791 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
792},
793{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
794 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
795},
796{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
797 DECL html_flow, "div", DECL html_attrs, NULL, NULL
798},
799{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
800 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
801},
802{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
803 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
804},
805{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
806 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
807},
808{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
809 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
810},
811{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813},
814{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
815 EMPTY, NULL, DECL param_attrs, NULL, name_attr
816},
817{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
818 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
819},
820{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
821 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
822},
823{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
824 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
825},
826{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
828},
829{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
830 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
831},
832{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
833 DECL select_content, NULL, DECL select_attrs, NULL, NULL
834},
835{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
836 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837},
838{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
839 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840},
841{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
842 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
843},
844{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
845 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
846},
847{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
848 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
849},
850{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852},
853{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
854 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
855},
856{ "table", 0, 0, 0, 0, 0, 0, 0, "",
857 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
858},
859{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
860 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
861},
862{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
863 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
864},
865{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
866 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
867},
868{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
869 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
870},
871{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
872 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
873},
874{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
875 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
876},
877{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
878 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
879},
880{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
881 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
882},
883{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
884 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
885},
886{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
887 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
888},
889{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
890 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
891},
892{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
894}
Owen Taylor3473f882001-02-23 17:55:21 +0000895};
896
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups. The first name of a
 * group is a start tag; the names after it are the open elements that
 * this start tag closes (see htmlCheckAutoClose()). An extra NULL after
 * the last group ends the whole table.
 */
static const char * const htmlStartClose[] = {
    /* start tag   elements it closes */
    "form",        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                   "dl", "ul", "ol", "menu", "dir", "address", "pre",
                   "listing", "xmp", "head", NULL,
    "head",        "p", NULL,
    "title",       "p", NULL,
    "body",        "head", "style", "link", "title", "p", NULL,
    "frameset",    "head", "style", "link", "title", "p", NULL,
    "li",          "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                   "pre", "listing", "xmp", "head", "li", NULL,
    "hr",          "p", "head", NULL,
    "h1",          "p", "head", NULL,
    "h2",          "p", "head", NULL,
    "h3",          "p", "head", NULL,
    "h4",          "p", "head", NULL,
    "h5",          "p", "head", NULL,
    "h6",          "p", "head", NULL,
    "dir",         "p", "head", NULL,
    "address",     "p", "head", "ul", NULL,
    "pre",         "p", "head", "ul", NULL,
    "listing",     "p", "head", NULL,
    "xmp",         "p", "head", NULL,
    "blockquote",  "p", "head", NULL,
    "dl",          "p", "dt", "menu", "dir", "address", "pre", "listing",
                   "xmp", "head", NULL,
    "dt",          "p", "menu", "dir", "address", "pre", "listing", "xmp",
                   "head", "dd", NULL,
    "dd",          "p", "menu", "dir", "address", "pre", "listing", "xmp",
                   "head", "dt", NULL,
    "ul",          "p", "head", "ol", "menu", "dir", "address", "pre",
                   "listing", "xmp", NULL,
    "ol",          "p", "head", "ul", NULL,
    "menu",        "p", "head", "ul", NULL,
    "p",           "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",         "p", "head", NULL,
    "noscript",    "p", "head", NULL,
    "center",      "font", "b", "i", "p", "head", NULL,
    "a",           "a", NULL,
    "caption",     "p", NULL,
    "colgroup",    "caption", "colgroup", "col", "p", NULL,
    "col",         "caption", "col", "p", NULL,
    "table",       "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                   "listing", "xmp", "a", NULL,
    "th",          "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",          "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",          "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",       "caption", "col", "colgroup", NULL,
    "tfoot",       "th", "td", "tr", "caption", "col", "colgroup", "thead",
                   "tbody", "p", NULL,
    "tbody",       "th", "td", "tr", "caption", "col", "colgroup", "thead",
                   "tfoot", "tbody", "p", NULL,
    "optgroup",    "option", NULL,
    "option",      "option", NULL,
    "fieldset",    "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                   "pre", "listing", "xmp", "a", NULL,
    NULL
};
957
/*
 * HTML elements which are not supposed to carry character data
 * themselves: htmlCheckParagraph() implies a "p" element when the
 * current element is one of these.
 *
 * TODO: extend the list from the HTML SGML DTD rules on implied
 *       paragraphs.
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
970
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic events).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * The array has no NULL terminator; its users size it with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};
996
/*
 * End-tag priorities, used by the HTML parser to cope with broken
 * pages. Every element gets a priority; when an unexpected end tag is
 * met, it may only close open elements whose priority is lower than or
 * equal to its own (see htmlAutoCloseOnClose()).
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* the higher, the harder to close implicitly */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001024
/* Pointers to the first name of each htmlStartClose group; filled in by
 * htmlInitAutoClose(), unused slots stay NULL. */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized;
1027
1028/************************************************************************
1029 * *
1030 * functions to handle HTML specific data *
1031 * *
1032 ************************************************************************/
1033
1034/**
1035 * htmlInitAutoClose:
1036 *
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1040 */
1041void
1042htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001043 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001044
1045 if (htmlStartCloseIndexinitialized) return;
1046
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048 indx = 0;
1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001050 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001051 while (htmlStartClose[i] != NULL) i++;
1052 i++;
1053 }
1054 htmlStartCloseIndexinitialized = 1;
1055}
1056
1057/**
1058 * htmlTagLookup:
1059 * @tag: The tag name in lowercase
1060 *
1061 * Lookup the HTML tag in the ElementTable
1062 *
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1064 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001065const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001066htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001067 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001068
1069 for (i = 0; i < (sizeof(html40ElementTable) /
1070 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001072 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001073 }
1074 return(NULL);
1075}
1076
1077/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1080 *
1081 * Return value: The "endtag" priority.
1082 **/
1083static int
1084htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001087 while ((htmlEndPriority[i].name != NULL) &&
1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001090
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001092}
1093
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001094
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001095/**
Owen Taylor3473f882001-02-23 17:55:21 +00001096 * htmlCheckAutoClose:
1097 * @newtag: The new tag name
1098 * @oldtag: The old tag name
1099 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001100 * Checks whether the new tag is one of the registered valid tags for
1101 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103 *
1104 * Returns 0 if no, 1 if yes.
1105 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001106static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001109 int i, indx;
1110 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001111
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 if (htmlStartCloseIndexinitialized == 0)
1113 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001114
1115 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001117 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001118 if (closed == NULL)
1119 return (0);
1120 if (xmlStrEqual(BAD_CAST * closed, newtag))
1121 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001122 }
1123
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001124 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001125 i++;
1126 while (htmlStartClose[i] != NULL) {
1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 return (1);
1129 }
1130 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001131 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001132 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001133}
1134
1135/**
1136 * htmlAutoCloseOnClose:
1137 * @ctxt: an HTML parser context
1138 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001139 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001140 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001141 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001142 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001143static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145{
1146 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001147 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001149 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001150
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001151 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001152
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001153 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154 break;
1155 /*
1156 * A missplaced endtag can only close elements with lower
1157 * or equal priority, so if we find an element with higher
1158 * priority before we find an element with
1159 * matching name, we just ignore this endtag
1160 */
1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001163 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164 if (i < 0)
1165 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001166
1167 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001168 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001169 if ((info != NULL) && (info->endTag == 3)) {
1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001172 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001173 }
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001176 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001177 }
1178}
1179
1180/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001181 * htmlAutoCloseOnEnd:
1182 * @ctxt: an HTML parser context
1183 *
1184 * Close all remaining tags at the end of the stream
1185 */
1186static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001189 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001190
William M. Brack899e64a2003-09-26 18:03:42 +00001191 if (ctxt->nameNr == 0)
1192 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001197 }
1198}
1199
1200/**
Owen Taylor3473f882001-02-23 17:55:21 +00001201 * htmlAutoClose:
1202 * @ctxt: an HTML parser context
1203 * @newtag: The new tag name or NULL
1204 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001205 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriates closes if possible/needed.
1209 * If newtag is NULL this mean we are at the end of the resource
1210 * and we should check
1211 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001212static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001216 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001220 }
1221 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001222 htmlAutoCloseOnEnd(ctxt);
1223 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001231 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001232 }
Owen Taylor3473f882001-02-23 17:55:21 +00001233}
1234
1235/**
1236 * htmlAutoCloseTag:
1237 * @doc: the HTML document
1238 * @name: The tag name
1239 * @elem: the HTML element
1240 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001241 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of it's children would autoclose the
1244 * given tag.
1245 *
1246 * Returns 1 if autoclose, 0 otherwise
1247 */
1248int
1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250 htmlNodePtr child;
1251
1252 if (elem == NULL) return(1);
1253 if (xmlStrEqual(name, elem->name)) return(0);
1254 if (htmlCheckAutoClose(elem->name, name)) return(1);
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, name, child)) return(1);
1258 child = child->next;
1259 }
1260 return(0);
1261}
1262
1263/**
1264 * htmlIsAutoClosed:
1265 * @doc: the HTML document
1266 * @elem: the HTML element
1267 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001268 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of it's child
1271 *
1272 * Returns 1 if autoclosed, 0 otherwise
1273 */
1274int
1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276 htmlNodePtr child;
1277
1278 if (elem == NULL) return(1);
1279 child = elem->children;
1280 while (child != NULL) {
1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282 child = child->next;
1283 }
1284 return(0);
1285}
1286
1287/**
1288 * htmlCheckImplied:
1289 * @ctxt: an HTML parser context
1290 * @newtag: The new tag name
1291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001293 * called when a new tag has been detected and generates the
1294 * appropriates implicit tags if missing
1295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296static void
Owen Taylor3473f882001-02-23 17:55:21 +00001297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298 if (!htmlOmittedDefaultValue)
1299 return;
1300 if (xmlStrEqual(newtag, BAD_CAST"html"))
1301 return;
1302 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001303 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306 }
1307 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308 return;
1309 if ((ctxt->nameNr <= 1) &&
1310 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316 /*
1317 * dropped OBJECT ... i you put it first BODY will be
1318 * assumed !
1319 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326 int i;
1327 for (i = 0;i < ctxt->nameNr;i++) {
1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329 return;
1330 }
1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332 return;
1333 }
1334 }
1335
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001336 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339 }
1340}
1341
1342/**
1343 * htmlCheckParagraph
1344 * @ctxt: an HTML parser context
1345 *
1346 * Check whether a p element need to be implied before inserting
1347 * characters in the current element.
1348 *
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350 * in case of error.
1351 */
1352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001353static int
Owen Taylor3473f882001-02-23 17:55:21 +00001354htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355 const xmlChar *tag;
1356 int i;
1357
1358 if (ctxt == NULL)
1359 return(-1);
1360 tag = ctxt->name;
1361 if (tag == NULL) {
1362 htmlAutoClose(ctxt, BAD_CAST"p");
1363 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367 return(1);
1368 }
1369 if (!htmlOmittedDefaultValue)
1370 return(0);
1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001373 htmlAutoClose(ctxt, BAD_CAST"p");
1374 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001375 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378 return(1);
1379 }
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlIsScriptAttribute:
1386 * @name: an attribute name
1387 *
1388 * Check if an attribute is of content type Script
1389 *
1390 * Returns 1 is the attribute is a script 0 otherwise
1391 */
1392int
1393htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001394 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001395
1396 if (name == NULL)
1397 return(0);
1398 /*
1399 * all script attributes start with 'on'
1400 */
1401 if ((name[0] != 'o') || (name[1] != 'n'))
1402 return(0);
1403 for (i = 0;
1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405 i++) {
1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407 return(1);
1408 }
1409 return(0);
1410}
1411
1412/************************************************************************
1413 * *
1414 * The list of HTML predefined entities *
1415 * *
1416 ************************************************************************/
1417
1418
/*
 * html40EntitiesTable: one htmlEntityDesc per predefined HTML entity, given
 * as { value (code point), name, descriptive string }.
 *
 * The entries are listed in ascending order of value. htmlEntityValueLookup()
 * stops scanning at the first entry whose value is greater than the one it
 * searches for, so any new entry must be inserted at its sorted position.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001419static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001420/*
1421 * the 4 absolute ones, plus apostrophe.
1422 */
1423{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1424{ 38, "amp", "ampersand, U+0026 ISOnum" },
1425{ 39, "apos", "single quote" },
1426{ 60, "lt", "less-than sign, U+003C ISOnum" },
1427{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1428
1429/*
1430 * A bunch still in the 128-255 range
1431 * Replacing them depend really on the charset used.
1432 */
1433{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1434{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1435{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1436{ 163, "pound","pound sign, U+00A3 ISOnum" },
1437{ 164, "curren","currency sign, U+00A4 ISOnum" },
1438{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1439{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1440{ 167, "sect", "section sign, U+00A7 ISOnum" },
1441{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1442{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1443{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1444{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1445{ 172, "not", "not sign, U+00AC ISOnum" },
1446{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1447{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1448{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1449{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1450{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1451{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1452{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1453{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1454{ 181, "micro","micro sign, U+00B5 ISOnum" },
1455{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1456{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1457{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1458{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1459{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1460{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1461{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1462{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1463{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1464{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1465{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1466{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1467{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1468{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1469{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1470{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1471{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1472{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1473{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1474{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1475{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1476{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1477{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1478{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1479{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1480{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1481{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1482{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1483{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1484{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1485{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1486{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1487{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1488{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1489{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1490{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1491{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1492{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1493{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1494{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1495{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1496{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1497{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1498{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1499{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1500{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1501{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1502{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1503{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1504{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1505{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1506{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1507{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1508{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1509{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1510{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1511{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1512{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1513{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1514{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1515{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1516{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1517{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1518{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1519{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1520{ 247, "divide","division sign, U+00F7 ISOnum" },
1521{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1522{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1523{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1524{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1525{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1526{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1527{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1528{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1529
1530{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1531{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1532{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1533{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1534{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1535
1536/*
1537 * Anything below should really be kept as entities references
1538 */
1539{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1540
1541{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1542{ 732, "tilde","small tilde, U+02DC ISOdia" },
1543
1544{ 913, "Alpha","greek capital letter alpha, U+0391" },
1545{ 914, "Beta", "greek capital letter beta, U+0392" },
1546{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1547{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1548{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1549{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1550{ 919, "Eta", "greek capital letter eta, U+0397" },
1551{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1552{ 921, "Iota", "greek capital letter iota, U+0399" },
1553{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001554{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001555{ 924, "Mu", "greek capital letter mu, U+039C" },
1556{ 925, "Nu", "greek capital letter nu, U+039D" },
1557{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1558{ 927, "Omicron","greek capital letter omicron, U+039F" },
1559{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1560{ 929, "Rho", "greek capital letter rho, U+03A1" },
1561{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1562{ 932, "Tau", "greek capital letter tau, U+03A4" },
1563{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1564{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1565{ 935, "Chi", "greek capital letter chi, U+03A7" },
1566{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1567{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1568
1569{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1570{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1571{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1572{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1573{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1574{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1575{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1576{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1577{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1578{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1579{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1580{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1581{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1582{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1583{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1584{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1585{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1586{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1587{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1588{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1589{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1590{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1591{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1592{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1593{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1594{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1595{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1596{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1597
1598{ 8194, "ensp", "en space, U+2002 ISOpub" },
1599{ 8195, "emsp", "em space, U+2003 ISOpub" },
1600{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1601{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1602{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1603{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1604{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1605{ 8211, "ndash","en dash, U+2013 ISOpub" },
1606{ 8212, "mdash","em dash, U+2014 ISOpub" },
1607{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1608{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1609{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1610{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1611{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1612{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1613{ 8224, "dagger","dagger, U+2020 ISOpub" },
1614{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1615
1616{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1617{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1618
1619{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1620
1621{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1622{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1623
1624{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1625{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1626
1627{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1628{ 8260, "frasl","fraction slash, U+2044 NEW" },
1629
1630{ 8364, "euro", "euro sign, U+20AC NEW" },
1631
1632{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1633{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1634{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1635{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1636{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1637{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1638{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1639{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1640{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1641{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1642{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1643{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1644{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1645{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1646{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1647{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1648
1649{ 8704, "forall","for all, U+2200 ISOtech" },
1650{ 8706, "part", "partial differential, U+2202 ISOtech" },
1651{ 8707, "exist","there exists, U+2203 ISOtech" },
1652{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1653{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1654{ 8712, "isin", "element of, U+2208 ISOtech" },
1655{ 8713, "notin","not an element of, U+2209 ISOtech" },
1656{ 8715, "ni", "contains as member, U+220B ISOtech" },
1657{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001658{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001659{ 8722, "minus","minus sign, U+2212 ISOtech" },
1660{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1661{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1662{ 8733, "prop", "proportional to, U+221D ISOtech" },
1663{ 8734, "infin","infinity, U+221E ISOtech" },
1664{ 8736, "ang", "angle, U+2220 ISOamso" },
1665{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1666{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1667{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1668{ 8746, "cup", "union = cup, U+222A ISOtech" },
1669{ 8747, "int", "integral, U+222B ISOtech" },
1670{ 8756, "there4","therefore, U+2234 ISOtech" },
1671{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1672{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1673{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1674{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1675{ 8801, "equiv","identical to, U+2261 ISOtech" },
1676{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1677{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1678{ 8834, "sub", "subset of, U+2282 ISOtech" },
1679{ 8835, "sup", "superset of, U+2283 ISOtech" },
1680{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1681{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1682{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1683{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1684{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1685{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1686{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1687{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1688{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1689{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1690{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1691{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1692{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1693{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1694
1695{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1696{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1697{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1698{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1699
1700};
1701
1702/************************************************************************
1703 * *
1704 * Commodity functions to handle entities *
1705 * *
1706 ************************************************************************/
1707
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable holding the buffer
 *
 * Doubles the capacity of @buffer. The macro expects two names to be in
 * scope at the expansion site: an integer variable called <buffer>_size
 * holding the current capacity, and `ctxt`, which is passed to
 * htmlErrMemory().
 *
 * If the reallocation fails, the error is reported, the old buffer is
 * freed and the ENCLOSING function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grownBuffer;						\
    buffer##_size *= 2;							\
    grownBuffer = (xmlChar *) xmlRealloc(buffer,			\
                                 buffer##_size * sizeof(xmlChar));	\
    if (grownBuffer != NULL) {						\
        buffer = grownBuffer;						\
    } else {								\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
}
1722
1723/**
1724 * htmlEntityLookup:
1725 * @name: the entity name
1726 *
1727 * Lookup the given entity in EntitiesTable
1728 *
1729 * TODO: the linear scan is really ugly, an hash table is really needed.
1730 *
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001733const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001734htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001735 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001736
1737 for (i = 0;i < (sizeof(html40EntitiesTable)/
1738 sizeof(html40EntitiesTable[0]));i++) {
1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001740 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 }
1743 return(NULL);
1744}
1745
1746/**
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1749 *
1750 * Lookup the given entity in EntitiesTable
1751 *
1752 * TODO: the linear scan is really ugly, an hash table is really needed.
1753 *
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001756const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001757htmlEntityValueLookup(unsigned int value) {
1758 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001759
1760 for (i = 0;i < (sizeof(html40EntitiesTable)/
1761 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001762 if (html40EntitiesTable[i].value >= value) {
1763 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001764 break;
William M. Brack78637da2003-07-31 14:47:38 +00001765 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001766 }
Owen Taylor3473f882001-02-23 17:55:21 +00001767 }
1768 return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001783 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1794 unsigned int c, d;
1795 int trailing;
1796
Daniel Veillardce682bc2004-11-05 17:22:25 +00001797 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 if (in == NULL) {
1799 /*
1800 * initialization nothing to do
1801 */
1802 *outlen = 0;
1803 *inlen = 0;
1804 return(0);
1805 }
1806 inend = in + (*inlen);
1807 outend = out + (*outlen);
1808 while (in < inend) {
1809 d = *in++;
1810 if (d < 0x80) { c= d; trailing= 0; }
1811 else if (d < 0xC0) {
1812 /* trailing byte in leading position */
1813 *outlen = out - outstart;
1814 *inlen = processed - instart;
1815 return(-2);
1816 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1817 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1818 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1819 else {
1820 /* no chance for this in Ascii */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 }
1825
1826 if (inend - in < trailing) {
1827 break;
1828 }
1829
1830 for ( ; trailing; trailing--) {
1831 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1832 break;
1833 c <<= 6;
1834 c |= d & 0x3F;
1835 }
1836
1837 /* assertion: c is a single UTF-4 value */
1838 if (c < 0x80) {
1839 if (out + 1 >= outend)
1840 break;
1841 *out++ = c;
1842 } else {
1843 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001844 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001845
1846 /*
1847 * Try to lookup a predefined HTML entity for it
1848 */
1849
1850 ent = htmlEntityValueLookup(c);
1851 if (ent == NULL) {
1852 /* no chance for this in Ascii */
1853 *outlen = out - outstart;
1854 *inlen = processed - instart;
1855 return(-2);
1856 }
1857 len = strlen(ent->name);
1858 if (out + 2 + len >= outend)
1859 break;
1860 *out++ = '&';
1861 memcpy(out, ent->name, len);
1862 out += len;
1863 *out++ = ';';
1864 }
1865 processed = in;
1866 }
1867 *outlen = out - outstart;
1868 *inlen = processed - instart;
1869 return(0);
1870}
1871
1872/**
1873 * htmlEncodeEntities:
1874 * @out: a pointer to an array of bytes to store the result
1875 * @outlen: the length of @out
1876 * @in: a pointer to an array of UTF-8 chars
1877 * @inlen: the length of @in
1878 * @quoteChar: the quote character to escape (' or ") or zero.
1879 *
1880 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1881 * plus HTML entities block of chars out.
1882 *
1883 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1884 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001885 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001886 * The value of @outlen after return is the number of octets consumed.
1887 */
1888int
1889htmlEncodeEntities(unsigned char* out, int *outlen,
1890 const unsigned char* in, int *inlen, int quoteChar) {
1891 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001892 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001893 const unsigned char* outstart = out;
1894 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001895 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001896 unsigned int c, d;
1897 int trailing;
1898
Daniel Veillardce682bc2004-11-05 17:22:25 +00001899 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1900 return(-1);
1901 outend = out + (*outlen);
1902 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001903 while (in < inend) {
1904 d = *in++;
1905 if (d < 0x80) { c= d; trailing= 0; }
1906 else if (d < 0xC0) {
1907 /* trailing byte in leading position */
1908 *outlen = out - outstart;
1909 *inlen = processed - instart;
1910 return(-2);
1911 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1912 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1913 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1914 else {
1915 /* no chance for this in Ascii */
1916 *outlen = out - outstart;
1917 *inlen = processed - instart;
1918 return(-2);
1919 }
1920
1921 if (inend - in < trailing)
1922 break;
1923
1924 while (trailing--) {
1925 if (((d= *in++) & 0xC0) != 0x80) {
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(-2);
1929 }
1930 c <<= 6;
1931 c |= d & 0x3F;
1932 }
1933
1934 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001935 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1936 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001937 if (out >= outend)
1938 break;
1939 *out++ = c;
1940 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001941 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001942 const char *cp;
1943 char nbuf[16];
1944 int len;
1945
1946 /*
1947 * Try to lookup a predefined HTML entity for it
1948 */
1949 ent = htmlEntityValueLookup(c);
1950 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001951 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001952 cp = nbuf;
1953 }
1954 else
1955 cp = ent->name;
1956 len = strlen(cp);
1957 if (out + 2 + len > outend)
1958 break;
1959 *out++ = '&';
1960 memcpy(out, cp, len);
1961 out += len;
1962 *out++ = ';';
1963 }
1964 processed = in;
1965 }
1966 *outlen = out - outstart;
1967 *inlen = processed - instart;
1968 return(0);
1969}
1970
Owen Taylor3473f882001-02-23 17:55:21 +00001971/************************************************************************
1972 * *
1973 * Commodity functions to handle streams *
1974 * *
1975 ************************************************************************/
1976
1977/**
Owen Taylor3473f882001-02-23 17:55:21 +00001978 * htmlNewInputStream:
1979 * @ctxt: an HTML parser context
1980 *
1981 * Create a new input stream structure
1982 * Returns the new input stream or NULL
1983 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001985htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1986 htmlParserInputPtr input;
1987
1988 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1989 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001990 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001991 return(NULL);
1992 }
1993 memset(input, 0, sizeof(htmlParserInput));
1994 input->filename = NULL;
1995 input->directory = NULL;
1996 input->base = NULL;
1997 input->cur = NULL;
1998 input->buf = NULL;
1999 input->line = 1;
2000 input->col = 1;
2001 input->buf = NULL;
2002 input->free = NULL;
2003 input->version = NULL;
2004 input->consumed = 0;
2005 input->length = 0;
2006 return(input);
2007}
2008
2009
2010/************************************************************************
2011 * *
2012 * Commodity functions, cleanup needed ? *
2013 * *
2014 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that was avoided so as
 *       not to risk any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002029
2030/**
2031 * areBlanks:
2032 * @ctxt: an HTML parser context
2033 * @str: a xmlChar *
2034 * @len: the size of @str
2035 *
2036 * Is this a sequence of blank chars that one can ignore ?
2037 *
2038 * Returns 1 if ignorable 0 otherwise.
2039 */
2040
2041static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002042 unsigned int i;
2043 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002044 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002045 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002046
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002047 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002048 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002049
2050 if (CUR == 0) return(1);
2051 if (CUR != '<') return(0);
2052 if (ctxt->name == NULL)
2053 return(1);
2054 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2055 return(1);
2056 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2057 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002058
2059 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2060 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2061 dtd = xmlGetIntSubset(ctxt->myDoc);
2062 if (dtd != NULL && dtd->ExternalID != NULL) {
2063 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2064 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2065 return(1);
2066 }
2067 }
2068
Owen Taylor3473f882001-02-23 17:55:21 +00002069 if (ctxt->node == NULL) return(0);
2070 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002071 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2072 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002073 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002074 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2075 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002076 /* keep ws in constructs like ...<b> </b>...
2077 for all tags "b" allowing PCDATA */
2078 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2079 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2080 return(0);
2081 }
2082 }
Owen Taylor3473f882001-02-23 17:55:21 +00002083 } else if (xmlNodeIsText(lastChild)) {
2084 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002085 } else {
2086 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2087 for all tags "p" allowing PCDATA */
2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2090 return(0);
2091 }
2092 }
Owen Taylor3473f882001-02-23 17:55:21 +00002093 }
2094 return(1);
2095}
2096
2097/**
Owen Taylor3473f882001-02-23 17:55:21 +00002098 * htmlNewDocNoDtD:
2099 * @URI: URI for the dtd, or NULL
2100 * @ExternalID: the external ID of the DTD, or NULL
2101 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002102 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2103 * are NULL
2104 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002105 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002106 */
2107htmlDocPtr
2108htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2109 xmlDocPtr cur;
2110
2111 /*
2112 * Allocate a new document and fill the fields.
2113 */
2114 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2115 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002116 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002117 return(NULL);
2118 }
2119 memset(cur, 0, sizeof(xmlDoc));
2120
2121 cur->type = XML_HTML_DOCUMENT_NODE;
2122 cur->version = NULL;
2123 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002124 cur->doc = cur;
2125 cur->name = NULL;
2126 cur->children = NULL;
2127 cur->extSubset = NULL;
2128 cur->oldNs = NULL;
2129 cur->encoding = NULL;
2130 cur->standalone = 1;
2131 cur->compression = 0;
2132 cur->ids = NULL;
2133 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002134 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002135 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002136 if ((ExternalID != NULL) ||
2137 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002138 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002139 return(cur);
2140}
2141
2142/**
2143 * htmlNewDoc:
2144 * @URI: URI for the dtd, or NULL
2145 * @ExternalID: the external ID of the DTD, or NULL
2146 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002147 * Creates a new HTML document
2148 *
Owen Taylor3473f882001-02-23 17:55:21 +00002149 * Returns a new document
2150 */
2151htmlDocPtr
2152htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2153 if ((URI == NULL) && (ExternalID == NULL))
2154 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002155 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2156 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002157
2158 return(htmlNewDocNoDtD(URI, ExternalID));
2159}
2160
2161
2162/************************************************************************
2163 * *
2164 * The parser itself *
2165 * Relates to http://www.w3.org/TR/html40 *
2166 * *
2167 ************************************************************************/
2168
2169/************************************************************************
2170 * *
2171 * The parser itself *
2172 * *
2173 ************************************************************************/
2174
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002175static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002176
Owen Taylor3473f882001-02-23 17:55:21 +00002177/**
2178 * htmlParseHTMLName:
2179 * @ctxt: an HTML parser context
2180 *
2181 * parse an HTML tag or attribute name, note that we convert it to lowercase
2182 * since HTML names are not case-sensitive.
2183 *
2184 * Returns the Tag Name parsed or NULL
2185 */
2186
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002187static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002188htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002189 int i = 0;
2190 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2191
William M. Brackd1757ab2004-10-02 22:07:48 +00002192 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002193 (CUR != ':')) return(NULL);
2194
2195 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002196 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002197 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2198 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2199 else loc[i] = CUR;
2200 i++;
2201
2202 NEXT;
2203 }
2204
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002205 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002206}
2207
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002208
2209/**
2210 * htmlParseHTMLName_nonInvasive:
2211 * @ctxt: an HTML parser context
2212 *
2213 * parse an HTML tag or attribute name, note that we convert it to lowercase
2214 * since HTML names are not case-sensitive, this doesn't consume the data
2215 * from the stream, it's a look-ahead
2216 *
2217 * Returns the Tag Name parsed or NULL
2218 */
2219
2220static const xmlChar *
2221htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2222 int i = 0;
2223 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2224
2225 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2226 (NXT(1) != ':')) return(NULL);
2227
2228 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2229 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2230 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2231 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2232 else loc[i] = NXT(1+i);
2233 i++;
2234 }
2235
2236 return(xmlDictLookup(ctxt->dict, loc, i));
2237}
2238
2239
Owen Taylor3473f882001-02-23 17:55:21 +00002240/**
2241 * htmlParseName:
2242 * @ctxt: an HTML parser context
2243 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002244 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002245 *
2246 * Returns the Name parsed or NULL
2247 */
2248
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002249static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002250htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002251 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002252 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002253 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002254
2255 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002256
2257 /*
2258 * Accelerator for simple ASCII names
2259 */
2260 in = ctxt->input->cur;
2261 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2262 ((*in >= 0x41) && (*in <= 0x5A)) ||
2263 (*in == '_') || (*in == ':')) {
2264 in++;
2265 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2266 ((*in >= 0x41) && (*in <= 0x5A)) ||
2267 ((*in >= 0x30) && (*in <= 0x39)) ||
2268 (*in == '_') || (*in == '-') ||
2269 (*in == ':') || (*in == '.'))
2270 in++;
2271 if ((*in > 0) && (*in < 0x80)) {
2272 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002273 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002274 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002275 ctxt->nbChars += count;
2276 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002277 return(ret);
2278 }
2279 }
2280 return(htmlParseNameComplex(ctxt));
2281}
2282
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002283static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002284htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002285 int len = 0, l;
2286 int c;
2287 int count = 0;
2288
2289 /*
2290 * Handler for more complex cases
2291 */
2292 GROW;
2293 c = CUR_CHAR(l);
2294 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2295 (!IS_LETTER(c) && (c != '_') &&
2296 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002297 return(NULL);
2298 }
2299
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002300 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2301 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2302 (c == '.') || (c == '-') ||
2303 (c == '_') || (c == ':') ||
2304 (IS_COMBINING(c)) ||
2305 (IS_EXTENDER(c)))) {
2306 if (count++ > 100) {
2307 count = 0;
2308 GROW;
2309 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002310 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002311 NEXTL(l);
2312 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002313 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002314 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002315}
2316
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002317
Owen Taylor3473f882001-02-23 17:55:21 +00002318/**
2319 * htmlParseHTMLAttribute:
2320 * @ctxt: an HTML parser context
2321 * @stop: a char stop value
2322 *
2323 * parse an HTML attribute value till the stop (quote), if
2324 * stop is 0 then it stops at the first space
2325 *
2326 * Returns the attribute parsed or NULL
2327 */
2328
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002329static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002330htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2331 xmlChar *buffer = NULL;
2332 int buffer_size = 0;
2333 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002334 const xmlChar *name = NULL;
2335 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002336 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002337
2338 /*
2339 * allocate a translation buffer.
2340 */
2341 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002342 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002343 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002344 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002345 return(NULL);
2346 }
2347 out = buffer;
2348
2349 /*
2350 * Ok loop until we reach one of the ending chars
2351 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002352 while ((CUR != 0) && (CUR != stop)) {
2353 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002354 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002355 if (CUR == '&') {
2356 if (NXT(1) == '#') {
2357 unsigned int c;
2358 int bits;
2359
2360 c = htmlParseCharRef(ctxt);
2361 if (c < 0x80)
2362 { *out++ = c; bits= -6; }
2363 else if (c < 0x800)
2364 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2365 else if (c < 0x10000)
2366 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2367 else
2368 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2369
2370 for ( ; bits >= 0; bits-= 6) {
2371 *out++ = ((c >> bits) & 0x3F) | 0x80;
2372 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002373
2374 if (out - buffer > buffer_size - 100) {
2375 int indx = out - buffer;
2376
2377 growBuffer(buffer);
2378 out = &buffer[indx];
2379 }
Owen Taylor3473f882001-02-23 17:55:21 +00002380 } else {
2381 ent = htmlParseEntityRef(ctxt, &name);
2382 if (name == NULL) {
2383 *out++ = '&';
2384 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002385 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002386
2387 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002388 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002389 }
2390 } else if (ent == NULL) {
2391 *out++ = '&';
2392 cur = name;
2393 while (*cur != 0) {
2394 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002395 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002396
2397 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002398 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002399 }
2400 *out++ = *cur++;
2401 }
Owen Taylor3473f882001-02-23 17:55:21 +00002402 } else {
2403 unsigned int c;
2404 int bits;
2405
2406 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002407 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002408
2409 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002410 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002411 }
Daniel Veillard48519092006-10-17 15:56:35 +00002412 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002413 if (c < 0x80)
2414 { *out++ = c; bits= -6; }
2415 else if (c < 0x800)
2416 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2417 else if (c < 0x10000)
2418 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2419 else
2420 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2421
2422 for ( ; bits >= 0; bits-= 6) {
2423 *out++ = ((c >> bits) & 0x3F) | 0x80;
2424 }
Owen Taylor3473f882001-02-23 17:55:21 +00002425 }
2426 }
2427 } else {
2428 unsigned int c;
2429 int bits, l;
2430
2431 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002432 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002433
2434 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002435 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002436 }
2437 c = CUR_CHAR(l);
2438 if (c < 0x80)
2439 { *out++ = c; bits= -6; }
2440 else if (c < 0x800)
2441 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2442 else if (c < 0x10000)
2443 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2444 else
2445 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2446
2447 for ( ; bits >= 0; bits-= 6) {
2448 *out++ = ((c >> bits) & 0x3F) | 0x80;
2449 }
2450 NEXT;
2451 }
2452 }
2453 *out++ = 0;
2454 return(buffer);
2455}
2456
2457/**
Owen Taylor3473f882001-02-23 17:55:21 +00002458 * htmlParseEntityRef:
2459 * @ctxt: an HTML parser context
2460 * @str: location to store the entity name
2461 *
2462 * parse an HTML ENTITY references
2463 *
2464 * [68] EntityRef ::= '&' Name ';'
2465 *
2466 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2467 * if non-NULL *str will have to be freed by the caller.
2468 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002469const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002470htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2471 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002472 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002473
2474 if (str != NULL) *str = NULL;
2475 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002476
2477 if (CUR == '&') {
2478 NEXT;
2479 name = htmlParseName(ctxt);
2480 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002481 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2482 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002483 } else {
2484 GROW;
2485 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002486 if (str != NULL)
2487 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002488
2489 /*
2490 * Lookup the entity in the table.
2491 */
2492 ent = htmlEntityLookup(name);
2493 if (ent != NULL) /* OK that's ugly !!! */
2494 NEXT;
2495 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002496 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2497 "htmlParseEntityRef: expecting ';'\n",
2498 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002499 if (str != NULL)
2500 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002501 }
2502 }
2503 }
2504 return(ent);
2505}
2506
2507/**
2508 * htmlParseAttValue:
2509 * @ctxt: an HTML parser context
2510 *
2511 * parse a value for an attribute
2512 * Note: the parser won't do substitution of entities here, this
2513 * will be handled later in xmlStringGetNodeList, unless it was
2514 * asked for ctxt->replaceEntities != 0
2515 *
2516 * Returns the AttValue parsed or NULL.
2517 */
2518
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002519static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002520htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2521 xmlChar *ret = NULL;
2522
2523 if (CUR == '"') {
2524 NEXT;
2525 ret = htmlParseHTMLAttribute(ctxt, '"');
2526 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002527 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2528 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002529 } else
2530 NEXT;
2531 } else if (CUR == '\'') {
2532 NEXT;
2533 ret = htmlParseHTMLAttribute(ctxt, '\'');
2534 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002535 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2536 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002537 } else
2538 NEXT;
2539 } else {
2540 /*
2541 * That's an HTMLism, the attribute value may not be quoted
2542 */
2543 ret = htmlParseHTMLAttribute(ctxt, 0);
2544 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002545 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2546 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002547 }
2548 }
2549 return(ret);
2550}
2551
2552/**
2553 * htmlParseSystemLiteral:
2554 * @ctxt: an HTML parser context
2555 *
2556 * parse an HTML Literal
2557 *
2558 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2559 *
2560 * Returns the SystemLiteral parsed or NULL
2561 */
2562
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002563static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002564htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2565 const xmlChar *q;
2566 xmlChar *ret = NULL;
2567
2568 if (CUR == '"') {
2569 NEXT;
2570 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002571 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002572 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002573 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002574 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2575 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002576 } else {
2577 ret = xmlStrndup(q, CUR_PTR - q);
2578 NEXT;
2579 }
2580 } else if (CUR == '\'') {
2581 NEXT;
2582 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002583 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002584 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002585 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002586 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2587 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002588 } else {
2589 ret = xmlStrndup(q, CUR_PTR - q);
2590 NEXT;
2591 }
2592 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002593 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2594 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002595 }
2596
2597 return(ret);
2598}
2599
2600/**
2601 * htmlParsePubidLiteral:
2602 * @ctxt: an HTML parser context
2603 *
2604 * parse an HTML public literal
2605 *
2606 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2607 *
2608 * Returns the PubidLiteral parsed or NULL.
2609 */
2610
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002611static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002612htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2613 const xmlChar *q;
2614 xmlChar *ret = NULL;
2615 /*
2616 * Name ::= (Letter | '_') (NameChar)*
2617 */
2618 if (CUR == '"') {
2619 NEXT;
2620 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002621 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002622 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002623 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2624 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002625 } else {
2626 ret = xmlStrndup(q, CUR_PTR - q);
2627 NEXT;
2628 }
2629 } else if (CUR == '\'') {
2630 NEXT;
2631 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002632 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002633 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002634 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002635 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2636 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002637 } else {
2638 ret = xmlStrndup(q, CUR_PTR - q);
2639 NEXT;
2640 }
2641 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002642 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2643 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002644 }
2645
2646 return(ret);
2647}
2648
2649/**
2650 * htmlParseScript:
2651 * @ctxt: an HTML parser context
2652 *
2653 * parse the content of an HTML SCRIPT or STYLE element
2654 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2655 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2656 * http://www.w3.org/TR/html4/types.html#type-script
2657 * http://www.w3.org/TR/html4/types.html#h-6.15
2658 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2659 *
2660 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2661 * element and the value of intrinsic event attributes. User agents must
2662 * not evaluate script data as HTML markup but instead must pass it on as
2663 * data to a script engine.
2664 * NOTES:
2665 * - The content is passed like CDATA
2666 * - the attributes for style and scripting "onXXX" are also described
2667 * as CDATA but SGML allows entities references in attributes so their
2668 * processing is identical as other attributes
2669 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002670static void
Owen Taylor3473f882001-02-23 17:55:21 +00002671htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002672 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002673 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002674 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002675
2676 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002677 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002678 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002679 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2680 (NXT(3) == '-')) {
2681 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2682 if (ctxt->sax->cdataBlock!= NULL) {
2683 /*
2684 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2685 */
2686 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002687 } else if (ctxt->sax->characters != NULL) {
2688 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002689 }
2690 }
2691 nbchar = 0;
2692 htmlParseComment(ctxt);
Daniel Veillard358fef42005-07-13 16:37:38 +00002693 cur = CUR_CHAR(l);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002694 continue;
2695 } else if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002696 /*
2697 * One should break here, the specification is clear:
2698 * Authors should therefore escape "</" within the content.
2699 * Escape mechanisms are specific to each scripting or
2700 * style sheet language.
2701 *
2702 * In recovery mode, only break if end tag match the
2703 * current tag, effectively ignoring all tags inside the
2704 * script/style block and treating the entire block as
2705 * CDATA.
2706 */
2707 if (ctxt->recovery) {
2708 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2709 xmlStrlen(ctxt->name)) == 0)
2710 {
2711 break; /* while */
2712 } else {
2713 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002714 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002715 ctxt->name, NULL);
2716 }
2717 } else {
2718 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2719 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2720 {
2721 break; /* while */
2722 }
2723 }
Owen Taylor3473f882001-02-23 17:55:21 +00002724 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002725 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002726 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2727 if (ctxt->sax->cdataBlock!= NULL) {
2728 /*
2729 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2730 */
2731 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002732 } else if (ctxt->sax->characters != NULL) {
2733 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002734 }
2735 nbchar = 0;
2736 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002737 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002738 NEXTL(l);
2739 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002740 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002741
Daniel Veillard68716a72006-10-16 09:32:17 +00002742 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002743 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2744 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002745 NEXT;
2746 }
2747
2748 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2749 if (ctxt->sax->cdataBlock!= NULL) {
2750 /*
2751 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2752 */
2753 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002754 } else if (ctxt->sax->characters != NULL) {
2755 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002756 }
2757 }
2758}
2759
2760
2761/**
2762 * htmlParseCharData:
2763 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002764 *
2765 * parse a CharData section.
2766 * if we are within a CDATA section ']]>' marks an end of section.
2767 *
2768 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2769 */
2770
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002771static void
2772htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002773 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2774 int nbchar = 0;
2775 int cur, l;
2776
2777 SHRINK;
2778 cur = CUR_CHAR(l);
2779 while (((cur != '<') || (ctxt->token == '<')) &&
2780 ((cur != '&') || (ctxt->token == '&')) &&
2781 (IS_CHAR(cur))) {
2782 COPY_BUF(l,buf,nbchar,cur);
2783 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2784 /*
2785 * Ok the segment is to be consumed as chars.
2786 */
2787 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2788 if (areBlanks(ctxt, buf, nbchar)) {
2789 if (ctxt->sax->ignorableWhitespace != NULL)
2790 ctxt->sax->ignorableWhitespace(ctxt->userData,
2791 buf, nbchar);
2792 } else {
2793 htmlCheckParagraph(ctxt);
2794 if (ctxt->sax->characters != NULL)
2795 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2796 }
2797 }
2798 nbchar = 0;
2799 }
2800 NEXTL(l);
2801 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002802 if (cur == 0) {
2803 SHRINK;
2804 GROW;
2805 cur = CUR_CHAR(l);
2806 }
Owen Taylor3473f882001-02-23 17:55:21 +00002807 }
2808 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002809 buf[nbchar] = 0;
2810
Owen Taylor3473f882001-02-23 17:55:21 +00002811 /*
2812 * Ok the segment is to be consumed as chars.
2813 */
2814 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2815 if (areBlanks(ctxt, buf, nbchar)) {
2816 if (ctxt->sax->ignorableWhitespace != NULL)
2817 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2818 } else {
2819 htmlCheckParagraph(ctxt);
2820 if (ctxt->sax->characters != NULL)
2821 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2822 }
2823 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002824 } else {
2825 /*
2826 * Loop detection
2827 */
2828 if (cur == 0)
2829 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002830 }
2831}
2832
2833/**
2834 * htmlParseExternalID:
2835 * @ctxt: an HTML parser context
2836 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002837 *
2838 * Parse an External ID or a Public ID
2839 *
Owen Taylor3473f882001-02-23 17:55:21 +00002840 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2841 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2842 *
2843 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2844 *
2845 * Returns the function returns SystemLiteral and in the second
2846 * case publicID receives PubidLiteral, is strict is off
2847 * it is possible to return NULL and have publicID set.
2848 */
2849
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002850static xmlChar *
2851htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002852 xmlChar *URI = NULL;
2853
2854 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2855 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2856 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2857 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002858 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002859 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2860 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002861 }
2862 SKIP_BLANKS;
2863 URI = htmlParseSystemLiteral(ctxt);
2864 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002865 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2866 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002867 }
2868 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2869 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2870 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2871 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002872 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002873 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2874 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002875 }
2876 SKIP_BLANKS;
2877 *publicID = htmlParsePubidLiteral(ctxt);
2878 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002879 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2880 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2881 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002882 }
2883 SKIP_BLANKS;
2884 if ((CUR == '"') || (CUR == '\'')) {
2885 URI = htmlParseSystemLiteral(ctxt);
2886 }
2887 }
2888 return(URI);
2889}
2890
2891/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002892 * xmlParsePI:
2893 * @ctxt: an XML parser context
2894 *
2895 * parse an XML Processing Instruction.
2896 *
2897 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2898 */
2899static void
2900htmlParsePI(htmlParserCtxtPtr ctxt) {
2901 xmlChar *buf = NULL;
2902 int len = 0;
2903 int size = HTML_PARSER_BUFFER_SIZE;
2904 int cur, l;
2905 const xmlChar *target;
2906 xmlParserInputState state;
2907 int count = 0;
2908
2909 if ((RAW == '<') && (NXT(1) == '?')) {
2910 state = ctxt->instate;
2911 ctxt->instate = XML_PARSER_PI;
2912 /*
2913 * this is a Processing Instruction.
2914 */
2915 SKIP(2);
2916 SHRINK;
2917
2918 /*
2919 * Parse the target name and check for special support like
2920 * namespace.
2921 */
2922 target = htmlParseName(ctxt);
2923 if (target != NULL) {
2924 if (RAW == '>') {
2925 SKIP(1);
2926
2927 /*
2928 * SAX: PI detected.
2929 */
2930 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2931 (ctxt->sax->processingInstruction != NULL))
2932 ctxt->sax->processingInstruction(ctxt->userData,
2933 target, NULL);
2934 ctxt->instate = state;
2935 return;
2936 }
2937 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2938 if (buf == NULL) {
2939 htmlErrMemory(ctxt, NULL);
2940 ctxt->instate = state;
2941 return;
2942 }
2943 cur = CUR;
2944 if (!IS_BLANK(cur)) {
2945 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2946 "ParsePI: PI %s space expected\n", target, NULL);
2947 }
2948 SKIP_BLANKS;
2949 cur = CUR_CHAR(l);
2950 while (IS_CHAR(cur) && (cur != '>')) {
2951 if (len + 5 >= size) {
2952 xmlChar *tmp;
2953
2954 size *= 2;
2955 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2956 if (tmp == NULL) {
2957 htmlErrMemory(ctxt, NULL);
2958 xmlFree(buf);
2959 ctxt->instate = state;
2960 return;
2961 }
2962 buf = tmp;
2963 }
2964 count++;
2965 if (count > 50) {
2966 GROW;
2967 count = 0;
2968 }
2969 COPY_BUF(l,buf,len,cur);
2970 NEXTL(l);
2971 cur = CUR_CHAR(l);
2972 if (cur == 0) {
2973 SHRINK;
2974 GROW;
2975 cur = CUR_CHAR(l);
2976 }
2977 }
2978 buf[len] = 0;
2979 if (cur != '>') {
2980 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2981 "ParsePI: PI %s never end ...\n", target, NULL);
2982 } else {
2983 SKIP(1);
2984
2985 /*
2986 * SAX: PI detected.
2987 */
2988 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2989 (ctxt->sax->processingInstruction != NULL))
2990 ctxt->sax->processingInstruction(ctxt->userData,
2991 target, buf);
2992 }
2993 xmlFree(buf);
2994 } else {
2995 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2996 "PI is not started correctly", NULL, NULL);
2997 }
2998 ctxt->instate = state;
2999 }
3000}
3001
3002/**
Owen Taylor3473f882001-02-23 17:55:21 +00003003 * htmlParseComment:
3004 * @ctxt: an HTML parser context
3005 *
3006 * Parse an XML (SGML) comment <!-- .... -->
3007 *
3008 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3009 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003010static void
Owen Taylor3473f882001-02-23 17:55:21 +00003011htmlParseComment(htmlParserCtxtPtr ctxt) {
3012 xmlChar *buf = NULL;
3013 int len;
3014 int size = HTML_PARSER_BUFFER_SIZE;
3015 int q, ql;
3016 int r, rl;
3017 int cur, l;
3018 xmlParserInputState state;
3019
3020 /*
3021 * Check that there is a comment right here.
3022 */
3023 if ((RAW != '<') || (NXT(1) != '!') ||
3024 (NXT(2) != '-') || (NXT(3) != '-')) return;
3025
3026 state = ctxt->instate;
3027 ctxt->instate = XML_PARSER_COMMENT;
3028 SHRINK;
3029 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003030 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003031 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003032 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003033 ctxt->instate = state;
3034 return;
3035 }
3036 q = CUR_CHAR(ql);
3037 NEXTL(ql);
3038 r = CUR_CHAR(rl);
3039 NEXTL(rl);
3040 cur = CUR_CHAR(l);
3041 len = 0;
3042 while (IS_CHAR(cur) &&
3043 ((cur != '>') ||
3044 (r != '-') || (q != '-'))) {
3045 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003046 xmlChar *tmp;
3047
Owen Taylor3473f882001-02-23 17:55:21 +00003048 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003049 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3050 if (tmp == NULL) {
3051 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003052 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003053 ctxt->instate = state;
3054 return;
3055 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003056 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003057 }
3058 COPY_BUF(ql,buf,len,q);
3059 q = r;
3060 ql = rl;
3061 r = cur;
3062 rl = l;
3063 NEXTL(l);
3064 cur = CUR_CHAR(l);
3065 if (cur == 0) {
3066 SHRINK;
3067 GROW;
3068 cur = CUR_CHAR(l);
3069 }
3070 }
3071 buf[len] = 0;
3072 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003073 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3074 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003075 xmlFree(buf);
3076 } else {
3077 NEXT;
3078 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3079 (!ctxt->disableSAX))
3080 ctxt->sax->comment(ctxt->userData, buf);
3081 xmlFree(buf);
3082 }
3083 ctxt->instate = state;
3084}
3085
3086/**
3087 * htmlParseCharRef:
3088 * @ctxt: an HTML parser context
3089 *
3090 * parse Reference declarations
3091 *
3092 * [66] CharRef ::= '&#' [0-9]+ ';' |
3093 * '&#x' [0-9a-fA-F]+ ';'
3094 *
3095 * Returns the value parsed (as an int)
3096 */
3097int
3098htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3099 int val = 0;
3100
Daniel Veillarda03e3652004-11-02 18:45:30 +00003101 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3102 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3103 "htmlParseCharRef: context error\n",
3104 NULL, NULL);
3105 return(0);
3106 }
Owen Taylor3473f882001-02-23 17:55:21 +00003107 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003108 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003109 SKIP(3);
3110 while (CUR != ';') {
3111 if ((CUR >= '0') && (CUR <= '9'))
3112 val = val * 16 + (CUR - '0');
3113 else if ((CUR >= 'a') && (CUR <= 'f'))
3114 val = val * 16 + (CUR - 'a') + 10;
3115 else if ((CUR >= 'A') && (CUR <= 'F'))
3116 val = val * 16 + (CUR - 'A') + 10;
3117 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003118 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3119 "htmlParseCharRef: invalid hexadecimal value\n",
3120 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003121 return(0);
3122 }
3123 NEXT;
3124 }
3125 if (CUR == ';')
3126 NEXT;
3127 } else if ((CUR == '&') && (NXT(1) == '#')) {
3128 SKIP(2);
3129 while (CUR != ';') {
3130 if ((CUR >= '0') && (CUR <= '9'))
3131 val = val * 10 + (CUR - '0');
3132 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003133 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3134 "htmlParseCharRef: invalid decimal value\n",
3135 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003136 return(0);
3137 }
3138 NEXT;
3139 }
3140 if (CUR == ';')
3141 NEXT;
3142 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003143 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3144 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003145 }
3146 /*
3147 * Check the value IS_CHAR ...
3148 */
3149 if (IS_CHAR(val)) {
3150 return(val);
3151 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003152 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3153 "htmlParseCharRef: invalid xmlChar value %d\n",
3154 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003155 }
3156 return(0);
3157}
3158
3159
3160/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003161 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003162 * @ctxt: an HTML parser context
3163 *
3164 * parse a DOCTYPE declaration
3165 *
3166 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3167 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3168 */
3169
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003170static void
Owen Taylor3473f882001-02-23 17:55:21 +00003171htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003172 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003173 xmlChar *ExternalID = NULL;
3174 xmlChar *URI = NULL;
3175
3176 /*
3177 * We know that '<!DOCTYPE' has been detected.
3178 */
3179 SKIP(9);
3180
3181 SKIP_BLANKS;
3182
3183 /*
3184 * Parse the DOCTYPE name.
3185 */
3186 name = htmlParseName(ctxt);
3187 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003188 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3189 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3190 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003191 }
3192 /*
3193 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3194 */
3195
3196 SKIP_BLANKS;
3197
3198 /*
3199 * Check for SystemID and ExternalID
3200 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003201 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003202 SKIP_BLANKS;
3203
3204 /*
3205 * We should be at the end of the DOCTYPE declaration.
3206 */
3207 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003208 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3209 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003210 /* We shouldn't try to resynchronize ... */
3211 }
3212 NEXT;
3213
3214 /*
3215 * Create or update the document accordingly to the DOCTYPE
3216 */
3217 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3218 (!ctxt->disableSAX))
3219 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3220
3221 /*
3222 * Cleanup, since we don't use all those identifiers
3223 */
3224 if (URI != NULL) xmlFree(URI);
3225 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003226}
3227
3228/**
3229 * htmlParseAttribute:
3230 * @ctxt: an HTML parser context
3231 * @value: a xmlChar ** used to store the value of the attribute
3232 *
3233 * parse an attribute
3234 *
3235 * [41] Attribute ::= Name Eq AttValue
3236 *
3237 * [25] Eq ::= S? '=' S?
3238 *
3239 * With namespace:
3240 *
3241 * [NS 11] Attribute ::= QName Eq AttValue
3242 *
3243 * Also the case QName == xmlns:??? is handled independently as a namespace
3244 * definition.
3245 *
3246 * Returns the attribute name, and the value in *value.
3247 */
3248
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003249static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003250htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003251 const xmlChar *name;
3252 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003253
3254 *value = NULL;
3255 name = htmlParseHTMLName(ctxt);
3256 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003257 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3258 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003259 return(NULL);
3260 }
3261
3262 /*
3263 * read the value
3264 */
3265 SKIP_BLANKS;
3266 if (CUR == '=') {
3267 NEXT;
3268 SKIP_BLANKS;
3269 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003270 } else if (htmlIsBooleanAttr(name)) {
3271 /*
3272 * assume a minimized attribute
3273 */
3274 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003275 }
3276
3277 *value = val;
3278 return(name);
3279}
3280
3281/**
3282 * htmlCheckEncoding:
3283 * @ctxt: an HTML parser context
3284 * @attvalue: the attribute value
3285 *
3286 * Checks an http-equiv attribute from a Meta tag to detect
3287 * the encoding
3288 * If a new encoding is detected the parser is switched to decode
3289 * it and pass UTF8
3290 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003291static void
Owen Taylor3473f882001-02-23 17:55:21 +00003292htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3293 const xmlChar *encoding;
3294
3295 if ((ctxt == NULL) || (attvalue == NULL))
3296 return;
3297
3298 /* do not change encoding */
3299 if (ctxt->input->encoding != NULL)
3300 return;
3301
3302 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3303 if (encoding != NULL) {
3304 encoding += 8;
3305 } else {
3306 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3307 if (encoding != NULL)
3308 encoding += 9;
3309 }
3310 if (encoding != NULL) {
3311 xmlCharEncoding enc;
3312 xmlCharEncodingHandlerPtr handler;
3313
3314 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3315
3316 if (ctxt->input->encoding != NULL)
3317 xmlFree((xmlChar *) ctxt->input->encoding);
3318 ctxt->input->encoding = xmlStrdup(encoding);
3319
3320 enc = xmlParseCharEncoding((const char *) encoding);
3321 /*
3322 * registered set of known encodings
3323 */
3324 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003325 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3326 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3327 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3328 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3329 (ctxt->input->buf != NULL) &&
3330 (ctxt->input->buf->encoder == NULL)) {
3331 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3332 "htmlCheckEncoding: wrong encoding meta\n",
3333 NULL, NULL);
3334 } else {
3335 xmlSwitchEncoding(ctxt, enc);
3336 }
Owen Taylor3473f882001-02-23 17:55:21 +00003337 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3338 } else {
3339 /*
3340 * fallback for unknown encodings
3341 */
3342 handler = xmlFindCharEncodingHandler((const char *) encoding);
3343 if (handler != NULL) {
3344 xmlSwitchToEncoding(ctxt, handler);
3345 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3346 } else {
3347 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3348 }
3349 }
3350
3351 if ((ctxt->input->buf != NULL) &&
3352 (ctxt->input->buf->encoder != NULL) &&
3353 (ctxt->input->buf->raw != NULL) &&
3354 (ctxt->input->buf->buffer != NULL)) {
3355 int nbchars;
3356 int processed;
3357
3358 /*
3359 * convert as much as possible to the parser reading buffer.
3360 */
3361 processed = ctxt->input->cur - ctxt->input->base;
3362 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3363 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3364 ctxt->input->buf->buffer,
3365 ctxt->input->buf->raw);
3366 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003367 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3368 "htmlCheckEncoding: encoder error\n",
3369 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003370 }
3371 ctxt->input->base =
3372 ctxt->input->cur = ctxt->input->buf->buffer->content;
3373 }
3374 }
3375}
3376
3377/**
3378 * htmlCheckMeta:
3379 * @ctxt: an HTML parser context
3380 * @atts: the attributes values
3381 *
3382 * Checks an attributes from a Meta tag
3383 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003384static void
Owen Taylor3473f882001-02-23 17:55:21 +00003385htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3386 int i;
3387 const xmlChar *att, *value;
3388 int http = 0;
3389 const xmlChar *content = NULL;
3390
3391 if ((ctxt == NULL) || (atts == NULL))
3392 return;
3393
3394 i = 0;
3395 att = atts[i++];
3396 while (att != NULL) {
3397 value = atts[i++];
3398 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3399 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3400 http = 1;
3401 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3402 content = value;
3403 att = atts[i++];
3404 }
3405 if ((http) && (content != NULL))
3406 htmlCheckEncoding(ctxt, content);
3407
3408}
3409
3410/**
3411 * htmlParseStartTag:
3412 * @ctxt: an HTML parser context
3413 *
3414 * parse a start of tag either for rule element or
3415 * EmptyElement. In both case we don't parse the tag closing chars.
3416 *
3417 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3418 *
3419 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3420 *
3421 * With namespace:
3422 *
3423 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3424 *
3425 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3426 *
Daniel Veillard597f1c12005-07-03 23:00:18 +00003427 * Returns 0 in case of success and -1 in case of error.
Owen Taylor3473f882001-02-23 17:55:21 +00003428 */
3429
Daniel Veillard597f1c12005-07-03 23:00:18 +00003430static int
Owen Taylor3473f882001-02-23 17:55:21 +00003431htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003432 const xmlChar *name;
3433 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003434 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003435 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003436 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003437 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003438 int meta = 0;
3439 int i;
3440
Daniel Veillarda03e3652004-11-02 18:45:30 +00003441 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3442 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3443 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003444 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003445 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003446 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003447 NEXT;
3448
Daniel Veillard30e76072006-03-09 14:13:55 +00003449 atts = ctxt->atts;
3450 maxatts = ctxt->maxatts;
3451
Owen Taylor3473f882001-02-23 17:55:21 +00003452 GROW;
3453 name = htmlParseHTMLName(ctxt);
3454 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003455 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3456 "htmlParseStartTag: invalid element name\n",
3457 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003458 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003459 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003460 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003461 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003462 }
3463 if (xmlStrEqual(name, BAD_CAST"meta"))
3464 meta = 1;
3465
3466 /*
3467 * Check for auto-closure of HTML elements.
3468 */
3469 htmlAutoClose(ctxt, name);
3470
3471 /*
3472 * Check for implied HTML elements.
3473 */
3474 htmlCheckImplied(ctxt, name);
3475
3476 /*
3477 * Avoid html at any level > 0, head at any level != 1
3478 * or any attempt to recurse body
3479 */
3480 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003481 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3482 "htmlParseStartTag: misplaced <html> tag\n",
3483 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003484 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003485 }
3486 if ((ctxt->nameNr != 1) &&
3487 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003488 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3489 "htmlParseStartTag: misplaced <head> tag\n",
3490 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003491 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003492 }
3493 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003494 int indx;
3495 for (indx = 0;indx < ctxt->nameNr;indx++) {
3496 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003497 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3498 "htmlParseStartTag: misplaced <body> tag\n",
3499 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003500 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3501 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003502 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003503 }
3504 }
3505 }
3506
3507 /*
3508 * Now parse the attributes, it ends up with the ending
3509 *
3510 * (S Attribute)* S?
3511 */
3512 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003513 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003514 (CUR != '>') &&
3515 ((CUR != '/') || (NXT(1) != '>'))) {
3516 long cons = ctxt->nbChars;
3517
3518 GROW;
3519 attname = htmlParseAttribute(ctxt, &attvalue);
3520 if (attname != NULL) {
3521
3522 /*
3523 * Well formedness requires at most one declaration of an attribute
3524 */
3525 for (i = 0; i < nbatts;i += 2) {
3526 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003527 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3528 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003529 if (attvalue != NULL)
3530 xmlFree(attvalue);
3531 goto failed;
3532 }
3533 }
3534
3535 /*
3536 * Add the pair to atts
3537 */
3538 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003539 maxatts = 22; /* allow for 10 attrs by default */
3540 atts = (const xmlChar **)
3541 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003542 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003543 htmlErrMemory(ctxt, NULL);
3544 if (attvalue != NULL)
3545 xmlFree(attvalue);
3546 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003547 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003548 ctxt->atts = atts;
3549 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003550 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003551 const xmlChar **n;
3552
Owen Taylor3473f882001-02-23 17:55:21 +00003553 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003554 n = (const xmlChar **) xmlRealloc((void *) atts,
3555 maxatts * sizeof(const xmlChar *));
3556 if (n == NULL) {
3557 htmlErrMemory(ctxt, NULL);
3558 if (attvalue != NULL)
3559 xmlFree(attvalue);
3560 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003561 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003562 atts = n;
3563 ctxt->atts = atts;
3564 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003565 }
3566 atts[nbatts++] = attname;
3567 atts[nbatts++] = attvalue;
3568 atts[nbatts] = NULL;
3569 atts[nbatts + 1] = NULL;
3570 }
3571 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003572 if (attvalue != NULL)
3573 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003574 /* Dump the bogus attribute string up to the next blank or
3575 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003576 while ((IS_CHAR_CH(CUR)) &&
3577 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003578 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003579 NEXT;
3580 }
3581
3582failed:
3583 SKIP_BLANKS;
3584 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003585 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3586 "htmlParseStartTag: problem parsing attributes\n",
3587 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003588 break;
3589 }
3590 }
3591
3592 /*
3593 * Handle specific association to the META tag
3594 */
3595 if (meta)
3596 htmlCheckMeta(ctxt, atts);
3597
3598 /*
3599 * SAX: Start of Element !
3600 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003601 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003602 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3603 if (nbatts != 0)
3604 ctxt->sax->startElement(ctxt->userData, name, atts);
3605 else
3606 ctxt->sax->startElement(ctxt->userData, name, NULL);
3607 }
Owen Taylor3473f882001-02-23 17:55:21 +00003608
3609 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003610 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003611 if (atts[i] != NULL)
3612 xmlFree((xmlChar *) atts[i]);
3613 }
Owen Taylor3473f882001-02-23 17:55:21 +00003614 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003615
3616 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003617}
3618
3619/**
3620 * htmlParseEndTag:
3621 * @ctxt: an HTML parser context
3622 *
3623 * parse an end of tag
3624 *
3625 * [42] ETag ::= '</' Name S? '>'
3626 *
3627 * With namespace
3628 *
3629 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003630 *
3631 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003632 */
3633
Daniel Veillardf420ac52001-07-04 16:04:09 +00003634static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003635htmlParseEndTag(htmlParserCtxtPtr ctxt)
3636{
3637 const xmlChar *name;
3638 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003639 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003640
3641 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003642 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3643 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003644 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003645 }
3646 SKIP(2);
3647
3648 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003649 if (name == NULL)
3650 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003651
3652 /*
3653 * We should definitely be at the ending "S? '>'" part
3654 */
3655 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003656 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003657 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3658 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003659 if (ctxt->recovery) {
3660 /*
3661 * We're not at the ending > !!
3662 * Error, unless in recover mode where we search forwards
3663 * until we find a >
3664 */
3665 while (CUR != '\0' && CUR != '>') NEXT;
3666 NEXT;
3667 }
Owen Taylor3473f882001-02-23 17:55:21 +00003668 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003669 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003670
3671 /*
3672 * If the name read is not one of the element in the parsing stack
3673 * then return, it's just an error.
3674 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003675 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3676 if (xmlStrEqual(name, ctxt->nameTab[i]))
3677 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003678 }
3679 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003680 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3681 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003682 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003683 }
3684
3685
3686 /*
3687 * Check for auto-closure of HTML elements.
3688 */
3689
3690 htmlAutoCloseOnClose(ctxt, name);
3691
3692 /*
3693 * Well formedness constraints, opening and closing must match.
3694 * With the exception that the autoclose may have popped stuff out
3695 * of the stack.
3696 */
3697 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003698 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003699 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3700 "Opening and ending tag mismatch: %s and %s\n",
3701 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003702 }
3703 }
3704
3705 /*
3706 * SAX: End of Tag
3707 */
3708 oldname = ctxt->name;
3709 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003710 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3711 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003712 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003713 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003714 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003715 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003716 }
3717
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003718 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003719}
3720
3721
3722/**
3723 * htmlParseReference:
3724 * @ctxt: an HTML parser context
3725 *
3726 * parse and handle entity references in content,
3727 * this will end-up in a call to character() since this is either a
3728 * CharRef, or a predefined entity.
3729 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003730static void
Owen Taylor3473f882001-02-23 17:55:21 +00003731htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003732 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003733 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003734 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003735 if (CUR != '&') return;
3736
3737 if (NXT(1) == '#') {
3738 unsigned int c;
3739 int bits, i = 0;
3740
3741 c = htmlParseCharRef(ctxt);
3742 if (c == 0)
3743 return;
3744
3745 if (c < 0x80) { out[i++]= c; bits= -6; }
3746 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3747 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3748 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3749
3750 for ( ; bits >= 0; bits-= 6) {
3751 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3752 }
3753 out[i] = 0;
3754
3755 htmlCheckParagraph(ctxt);
3756 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3757 ctxt->sax->characters(ctxt->userData, out, i);
3758 } else {
3759 ent = htmlParseEntityRef(ctxt, &name);
3760 if (name == NULL) {
3761 htmlCheckParagraph(ctxt);
3762 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3763 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3764 return;
3765 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003766 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003767 htmlCheckParagraph(ctxt);
3768 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3769 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3770 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3771 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3772 }
3773 } else {
3774 unsigned int c;
3775 int bits, i = 0;
3776
3777 c = ent->value;
3778 if (c < 0x80)
3779 { out[i++]= c; bits= -6; }
3780 else if (c < 0x800)
3781 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3782 else if (c < 0x10000)
3783 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3784 else
3785 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3786
3787 for ( ; bits >= 0; bits-= 6) {
3788 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3789 }
3790 out[i] = 0;
3791
3792 htmlCheckParagraph(ctxt);
3793 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3794 ctxt->sax->characters(ctxt->userData, out, i);
3795 }
Owen Taylor3473f882001-02-23 17:55:21 +00003796 }
3797}
3798
3799/**
3800 * htmlParseContent:
3801 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003802 *
3803 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003804 */
3805
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003806static void
Owen Taylor3473f882001-02-23 17:55:21 +00003807htmlParseContent(htmlParserCtxtPtr ctxt) {
3808 xmlChar *currentNode;
3809 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003810 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003811
3812 currentNode = xmlStrdup(ctxt->name);
3813 depth = ctxt->nameNr;
3814 while (1) {
3815 long cons = ctxt->nbChars;
3816
3817 GROW;
3818 /*
3819 * Our tag or one of it's parent or children is ending.
3820 */
3821 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003822 if (htmlParseEndTag(ctxt) &&
3823 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3824 if (currentNode != NULL)
3825 xmlFree(currentNode);
3826 return;
3827 }
3828 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003829 }
3830
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003831 else if ((CUR == '<') &&
3832 ((IS_ASCII_LETTER(NXT(1))) ||
3833 (NXT(1) == '_') || (NXT(1) == ':'))) {
3834 name = htmlParseHTMLName_nonInvasive(ctxt);
3835 if (name == NULL) {
3836 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3837 "htmlParseStartTag: invalid element name\n",
3838 NULL, NULL);
3839 /* Dump the bogus tag like browsers do */
3840 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3841 NEXT;
3842
3843 if (currentNode != NULL)
3844 xmlFree(currentNode);
3845 return;
3846 }
3847
3848 if (ctxt->name != NULL) {
3849 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3850 htmlAutoClose(ctxt, name);
3851 continue;
3852 }
3853 }
3854 }
3855
Owen Taylor3473f882001-02-23 17:55:21 +00003856 /*
3857 * Has this node been popped out during parsing of
3858 * the next element
3859 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003860 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3861 (!xmlStrEqual(currentNode, ctxt->name)))
3862 {
Owen Taylor3473f882001-02-23 17:55:21 +00003863 if (currentNode != NULL) xmlFree(currentNode);
3864 return;
3865 }
3866
Daniel Veillardf9533d12001-03-03 10:04:57 +00003867 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3868 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003869 /*
3870 * Handle SCRIPT/STYLE separately
3871 */
3872 htmlParseScript(ctxt);
3873 } else {
3874 /*
3875 * Sometimes DOCTYPE arrives in the middle of the document
3876 */
3877 if ((CUR == '<') && (NXT(1) == '!') &&
3878 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3879 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3880 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3881 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003882 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3883 "Misplaced DOCTYPE declaration\n",
3884 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003885 htmlParseDocTypeDecl(ctxt);
3886 }
3887
3888 /*
3889 * First case : a comment
3890 */
3891 if ((CUR == '<') && (NXT(1) == '!') &&
3892 (NXT(2) == '-') && (NXT(3) == '-')) {
3893 htmlParseComment(ctxt);
3894 }
3895
3896 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003897 * Second case : a Processing Instruction.
3898 */
3899 else if ((CUR == '<') && (NXT(1) == '?')) {
3900 htmlParsePI(ctxt);
3901 }
3902
3903 /*
3904 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003905 */
3906 else if (CUR == '<') {
3907 htmlParseElement(ctxt);
3908 }
3909
3910 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003911 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003912 * parsing returns it's Name, create the node
3913 */
3914 else if (CUR == '&') {
3915 htmlParseReference(ctxt);
3916 }
3917
3918 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003919 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003920 */
3921 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003922 htmlAutoCloseOnEnd(ctxt);
3923 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003924 }
3925
3926 /*
3927 * Last case, text. Note that References are handled directly.
3928 */
3929 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003930 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003931 }
3932
3933 if (cons == ctxt->nbChars) {
3934 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003935 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3936 "detected an error in element content\n",
3937 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003938 }
3939 break;
3940 }
3941 }
3942 GROW;
3943 }
3944 if (currentNode != NULL) xmlFree(currentNode);
3945}
3946
3947/**
Daniel Veillard499cc922006-01-18 17:22:35 +00003948 * htmlParseContent:
3949 * @ctxt: an HTML parser context
3950 *
3951 * Parse a content: comment, sub-element, reference or text.
3952 */
3953
3954void
3955__htmlParseContent(void *ctxt) {
3956 if (ctxt != NULL)
3957 htmlParseContent((htmlParserCtxtPtr) ctxt);
3958}
3959
3960/**
Owen Taylor3473f882001-02-23 17:55:21 +00003961 * htmlParseElement:
3962 * @ctxt: an HTML parser context
3963 *
3964 * parse an HTML element, this is highly recursive
3965 *
3966 * [39] element ::= EmptyElemTag | STag content ETag
3967 *
3968 * [41] Attribute ::= Name Eq AttValue
3969 */
3970
3971void
3972htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003973 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003974 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003975 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003976 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003977 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003978 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003979 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003980
Daniel Veillarda03e3652004-11-02 18:45:30 +00003981 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3982 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00003983 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003984 return;
3985 }
Owen Taylor3473f882001-02-23 17:55:21 +00003986 /* Capture start position */
3987 if (ctxt->record_info) {
3988 node_info.begin_pos = ctxt->input->consumed +
3989 (CUR_PTR - ctxt->input->base);
3990 node_info.begin_line = ctxt->input->line;
3991 }
3992
Daniel Veillard597f1c12005-07-03 23:00:18 +00003993 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003994 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003995 if (failed || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003996 if (CUR == '>')
3997 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003998 return;
3999 }
Owen Taylor3473f882001-02-23 17:55:21 +00004000
4001 /*
4002 * Lookup the info for that element.
4003 */
4004 info = htmlTagLookup(name);
4005 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004006 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4007 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004008 }
4009
4010 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004011 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004012 */
4013 if ((CUR == '/') && (NXT(1) == '>')) {
4014 SKIP(2);
4015 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4016 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004017 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004018 return;
4019 }
4020
4021 if (CUR == '>') {
4022 NEXT;
4023 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004024 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4025 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004026
4027 /*
4028 * end of parsing of this node.
4029 */
4030 if (xmlStrEqual(name, ctxt->name)) {
4031 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004032 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004033 }
4034
4035 /*
4036 * Capture end position and add node
4037 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004038 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004039 node_info.end_pos = ctxt->input->consumed +
4040 (CUR_PTR - ctxt->input->base);
4041 node_info.end_line = ctxt->input->line;
4042 node_info.node = ctxt->node;
4043 xmlParserAddNodeInfo(ctxt, &node_info);
4044 }
4045 return;
4046 }
4047
4048 /*
4049 * Check for an Empty Element from DTD definition
4050 */
4051 if ((info != NULL) && (info->empty)) {
4052 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4053 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004054 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004055 return;
4056 }
4057
4058 /*
4059 * Parse the content of the element:
4060 */
4061 currentNode = xmlStrdup(ctxt->name);
4062 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004063 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004064 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004065 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004066 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004067 if (ctxt->nameNr < depth) break;
4068 }
4069
Owen Taylor3473f882001-02-23 17:55:21 +00004070 /*
4071 * Capture end position and add node
4072 */
4073 if ( currentNode != NULL && ctxt->record_info ) {
4074 node_info.end_pos = ctxt->input->consumed +
4075 (CUR_PTR - ctxt->input->base);
4076 node_info.end_line = ctxt->input->line;
4077 node_info.node = ctxt->node;
4078 xmlParserAddNodeInfo(ctxt, &node_info);
4079 }
William M. Brack76e95df2003-10-18 16:20:14 +00004080 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004081 htmlAutoCloseOnEnd(ctxt);
4082 }
4083
Owen Taylor3473f882001-02-23 17:55:21 +00004084 if (currentNode != NULL)
4085 xmlFree(currentNode);
4086}
4087
4088/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004089 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004090 * @ctxt: an HTML parser context
4091 *
4092 * parse an HTML document (and build a tree if using the standard SAX
4093 * interface).
4094 *
4095 * Returns 0, -1 in case of error. the parser context is augmented
4096 * as a result of the parsing.
4097 */
4098
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004099int
Owen Taylor3473f882001-02-23 17:55:21 +00004100htmlParseDocument(htmlParserCtxtPtr ctxt) {
4101 xmlDtdPtr dtd;
4102
Daniel Veillardd0463562001-10-13 09:15:48 +00004103 xmlInitParser();
4104
Owen Taylor3473f882001-02-23 17:55:21 +00004105 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004106
Daniel Veillarda03e3652004-11-02 18:45:30 +00004107 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4108 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4109 "htmlParseDocument: context error\n", NULL, NULL);
4110 return(XML_ERR_INTERNAL_ERROR);
4111 }
4112 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004113 GROW;
4114 /*
4115 * SAX: beginning of the document processing.
4116 */
4117 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4118 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4119
4120 /*
4121 * Wipe out everything which is before the first '<'
4122 */
4123 SKIP_BLANKS;
4124 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004125 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4126 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004127 }
4128
4129 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4130 ctxt->sax->startDocument(ctxt->userData);
4131
4132
4133 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004134 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004135 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004136 while (((CUR == '<') && (NXT(1) == '!') &&
4137 (NXT(2) == '-') && (NXT(3) == '-')) ||
4138 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004139 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004140 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004141 SKIP_BLANKS;
4142 }
4143
4144
4145 /*
4146 * Then possibly doc type declaration(s) and more Misc
4147 * (doctypedecl Misc*)?
4148 */
4149 if ((CUR == '<') && (NXT(1) == '!') &&
4150 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4151 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4152 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4153 (UPP(8) == 'E')) {
4154 htmlParseDocTypeDecl(ctxt);
4155 }
4156 SKIP_BLANKS;
4157
4158 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004159 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004160 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004161 while (((CUR == '<') && (NXT(1) == '!') &&
4162 (NXT(2) == '-') && (NXT(3) == '-')) ||
4163 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004164 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004165 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004166 SKIP_BLANKS;
4167 }
4168
4169 /*
4170 * Time to start parsing the tree itself
4171 */
4172 htmlParseContent(ctxt);
4173
4174 /*
4175 * autoclose
4176 */
4177 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004178 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004179
4180
4181 /*
4182 * SAX: end of the document processing.
4183 */
4184 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4185 ctxt->sax->endDocument(ctxt->userData);
4186
4187 if (ctxt->myDoc != NULL) {
4188 dtd = xmlGetIntSubset(ctxt->myDoc);
4189 if (dtd == NULL)
4190 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004191 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004192 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4193 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4194 }
4195 if (! ctxt->wellFormed) return(-1);
4196 return(0);
4197}
4198
4199
4200/************************************************************************
4201 * *
4202 * Parser contexts handling *
4203 * *
4204 ************************************************************************/
4205
4206/**
William M. Brackedb65a72004-02-06 07:36:04 +00004207 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004208 * @ctxt: an HTML parser context
4209 *
4210 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004211 *
4212 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004213 */
4214
Daniel Veillardf403d292003-10-05 13:51:35 +00004215static int
Owen Taylor3473f882001-02-23 17:55:21 +00004216htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4217{
4218 htmlSAXHandler *sax;
4219
Daniel Veillardf403d292003-10-05 13:51:35 +00004220 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004221 memset(ctxt, 0, sizeof(htmlParserCtxt));
4222
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004223 ctxt->dict = xmlDictCreate();
4224 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004225 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4226 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004227 }
Owen Taylor3473f882001-02-23 17:55:21 +00004228 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4229 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004230 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4231 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004232 }
4233 else
4234 memset(sax, 0, sizeof(htmlSAXHandler));
4235
4236 /* Allocate the Input stack */
4237 ctxt->inputTab = (htmlParserInputPtr *)
4238 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4239 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004240 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004241 ctxt->inputNr = 0;
4242 ctxt->inputMax = 0;
4243 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004244 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004245 }
4246 ctxt->inputNr = 0;
4247 ctxt->inputMax = 5;
4248 ctxt->input = NULL;
4249 ctxt->version = NULL;
4250 ctxt->encoding = NULL;
4251 ctxt->standalone = -1;
4252 ctxt->instate = XML_PARSER_START;
4253
4254 /* Allocate the Node stack */
4255 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4256 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004257 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004258 ctxt->nodeNr = 0;
4259 ctxt->nodeMax = 0;
4260 ctxt->node = NULL;
4261 ctxt->inputNr = 0;
4262 ctxt->inputMax = 0;
4263 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004264 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004265 }
4266 ctxt->nodeNr = 0;
4267 ctxt->nodeMax = 10;
4268 ctxt->node = NULL;
4269
4270 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004271 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004272 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004273 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004274 ctxt->nameNr = 0;
4275 ctxt->nameMax = 10;
4276 ctxt->name = NULL;
4277 ctxt->nodeNr = 0;
4278 ctxt->nodeMax = 0;
4279 ctxt->node = NULL;
4280 ctxt->inputNr = 0;
4281 ctxt->inputMax = 0;
4282 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004283 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004284 }
4285 ctxt->nameNr = 0;
4286 ctxt->nameMax = 10;
4287 ctxt->name = NULL;
4288
Daniel Veillard092643b2003-09-25 14:29:29 +00004289 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004290 else {
4291 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004292 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004293 }
4294 ctxt->userData = ctxt;
4295 ctxt->myDoc = NULL;
4296 ctxt->wellFormed = 1;
4297 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004298 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004299 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004300 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004301 ctxt->vctxt.userData = ctxt;
4302 ctxt->vctxt.error = xmlParserValidityError;
4303 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004304 ctxt->record_info = 0;
4305 ctxt->validate = 0;
4306 ctxt->nbChars = 0;
4307 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004308 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004309 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004310 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004311}
4312
4313/**
4314 * htmlFreeParserCtxt:
4315 * @ctxt: an HTML parser context
4316 *
4317 * Free all the memory used by a parser context. However the parsed
4318 * document in ctxt->myDoc is not freed.
4319 */
4320
4321void
4322htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4323{
4324 xmlFreeParserCtxt(ctxt);
4325}
4326
4327/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004328 * htmlNewParserCtxt:
4329 *
4330 * Allocate and initialize a new parser context.
4331 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004332 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004333 */
4334
Daniel Veillard34c647c2006-09-21 06:53:59 +00004335htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004336htmlNewParserCtxt(void)
4337{
4338 xmlParserCtxtPtr ctxt;
4339
4340 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4341 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004342 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004343 return(NULL);
4344 }
4345 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004346 if (htmlInitParserCtxt(ctxt) < 0) {
4347 htmlFreeParserCtxt(ctxt);
4348 return(NULL);
4349 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004350 return(ctxt);
4351}
4352
4353/**
4354 * htmlCreateMemoryParserCtxt:
4355 * @buffer: a pointer to a char array
4356 * @size: the size of the array
4357 *
4358 * Create a parser context for an HTML in-memory document.
4359 *
4360 * Returns the new parser context or NULL
4361 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004362htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004363htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4364 xmlParserCtxtPtr ctxt;
4365 xmlParserInputPtr input;
4366 xmlParserInputBufferPtr buf;
4367
4368 if (buffer == NULL)
4369 return(NULL);
4370 if (size <= 0)
4371 return(NULL);
4372
4373 ctxt = htmlNewParserCtxt();
4374 if (ctxt == NULL)
4375 return(NULL);
4376
4377 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4378 if (buf == NULL) return(NULL);
4379
4380 input = xmlNewInputStream(ctxt);
4381 if (input == NULL) {
4382 xmlFreeParserCtxt(ctxt);
4383 return(NULL);
4384 }
4385
4386 input->filename = NULL;
4387 input->buf = buf;
4388 input->base = input->buf->buffer->content;
4389 input->cur = input->buf->buffer->content;
4390 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4391
4392 inputPush(ctxt, input);
4393 return(ctxt);
4394}
4395
4396/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004397 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004398 * @cur: a pointer to an array of xmlChar
4399 * @encoding: a free form C string describing the HTML document encoding, or NULL
4400 *
4401 * Create a parser context for an HTML document.
4402 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004403 * TODO: check the need to add encoding handling there
4404 *
Owen Taylor3473f882001-02-23 17:55:21 +00004405 * Returns the new parser context or NULL
4406 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004407static htmlParserCtxtPtr
Daniel Veillard8a82ae12006-10-17 20:04:10 +00004408htmlCreateDocParserCtxt(const xmlChar *cur,
4409 const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004410 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004411 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004412
Daniel Veillard1d995272002-07-22 16:43:32 +00004413 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004414 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004415 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004416 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4417
4418 if (encoding != NULL) {
4419 xmlCharEncoding enc;
4420 xmlCharEncodingHandlerPtr handler;
4421
4422 if (ctxt->input->encoding != NULL)
4423 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004424 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004425
4426 enc = xmlParseCharEncoding(encoding);
4427 /*
4428 * registered set of known encodings
4429 */
4430 if (enc != XML_CHAR_ENCODING_ERROR) {
4431 xmlSwitchEncoding(ctxt, enc);
4432 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004433 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4434 "Unsupported encoding %s\n",
4435 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004436 }
4437 } else {
4438 /*
4439 * fallback for unknown encodings
4440 */
4441 handler = xmlFindCharEncodingHandler((const char *) encoding);
4442 if (handler != NULL) {
4443 xmlSwitchToEncoding(ctxt, handler);
4444 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004445 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4446 "Unsupported encoding %s\n",
4447 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004448 }
4449 }
4450 }
4451 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004452}
4453
Daniel Veillard73b013f2003-09-30 12:36:01 +00004454#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004455/************************************************************************
4456 * *
4457 * Progressive parsing interfaces *
4458 * *
4459 ************************************************************************/
4460
4461/**
4462 * htmlParseLookupSequence:
4463 * @ctxt: an HTML parser context
4464 * @first: the first char to lookup
4465 * @next: the next char to lookup or zero
4466 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004467 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004468 *
4469 * Try to find if a sequence (first, next, third) or just (first next) or
4470 * (first) is available in the input stream.
4471 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4472 * to avoid rescanning sequences of bytes, it DOES change the state of the
4473 * parser, do not use liberally.
4474 * This is basically similar to xmlParseLookupSequence()
4475 *
4476 * Returns the index to the current parsing point if the full sequence
4477 * is available, -1 otherwise.
4478 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004479static int
Owen Taylor3473f882001-02-23 17:55:21 +00004480htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004481 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004482 int base, len;
4483 htmlParserInputPtr in;
4484 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004485 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004486
4487 in = ctxt->input;
4488 if (in == NULL) return(-1);
4489 base = in->cur - in->base;
4490 if (base < 0) return(-1);
4491 if (ctxt->checkIndex > base)
4492 base = ctxt->checkIndex;
4493 if (in->buf == NULL) {
4494 buf = in->base;
4495 len = in->length;
4496 } else {
4497 buf = in->buf->buffer->content;
4498 len = in->buf->buffer->use;
4499 }
4500 /* take into account the sequence length */
4501 if (third) len -= 2;
4502 else if (next) len --;
4503 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004504 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004505 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4506 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4507 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004508 /* do not increment past <! - some people use <!--> */
4509 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004510 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004511 }
4512 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004513 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004514 return(-1);
4515 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4516 (buf[base + 2] == '>')) {
4517 incomment = 0;
4518 base += 2;
4519 }
4520 continue;
4521 }
Owen Taylor3473f882001-02-23 17:55:21 +00004522 if (buf[base] == first) {
4523 if (third != 0) {
4524 if ((buf[base + 1] != next) ||
4525 (buf[base + 2] != third)) continue;
4526 } else if (next != 0) {
4527 if (buf[base + 1] != next) continue;
4528 }
4529 ctxt->checkIndex = 0;
4530#ifdef DEBUG_PUSH
4531 if (next == 0)
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: lookup '%c' found at %d\n",
4534 first, base);
4535 else if (third == 0)
4536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: lookup '%c%c' found at %d\n",
4538 first, next, base);
4539 else
4540 xmlGenericError(xmlGenericErrorContext,
4541 "HPP: lookup '%c%c%c' found at %d\n",
4542 first, next, third, base);
4543#endif
4544 return(base - (in->cur - in->base));
4545 }
4546 }
4547 ctxt->checkIndex = base;
4548#ifdef DEBUG_PUSH
4549 if (next == 0)
4550 xmlGenericError(xmlGenericErrorContext,
4551 "HPP: lookup '%c' failed\n", first);
4552 else if (third == 0)
4553 xmlGenericError(xmlGenericErrorContext,
4554 "HPP: lookup '%c%c' failed\n", first, next);
4555 else
4556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4558#endif
4559 return(-1);
4560}
4561
4562/**
4563 * htmlParseTryOrFinish:
4564 * @ctxt: an HTML parser context
4565 * @terminate: last chunk indicator
4566 *
4567 * Try to progress on parsing
4568 *
4569 * Returns zero if no parsing was possible
4570 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004571static int
Owen Taylor3473f882001-02-23 17:55:21 +00004572htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4573 int ret = 0;
4574 htmlParserInputPtr in;
4575 int avail = 0;
4576 xmlChar cur, next;
4577
4578#ifdef DEBUG_PUSH
4579 switch (ctxt->instate) {
4580 case XML_PARSER_EOF:
4581 xmlGenericError(xmlGenericErrorContext,
4582 "HPP: try EOF\n"); break;
4583 case XML_PARSER_START:
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: try START\n"); break;
4586 case XML_PARSER_MISC:
4587 xmlGenericError(xmlGenericErrorContext,
4588 "HPP: try MISC\n");break;
4589 case XML_PARSER_COMMENT:
4590 xmlGenericError(xmlGenericErrorContext,
4591 "HPP: try COMMENT\n");break;
4592 case XML_PARSER_PROLOG:
4593 xmlGenericError(xmlGenericErrorContext,
4594 "HPP: try PROLOG\n");break;
4595 case XML_PARSER_START_TAG:
4596 xmlGenericError(xmlGenericErrorContext,
4597 "HPP: try START_TAG\n");break;
4598 case XML_PARSER_CONTENT:
4599 xmlGenericError(xmlGenericErrorContext,
4600 "HPP: try CONTENT\n");break;
4601 case XML_PARSER_CDATA_SECTION:
4602 xmlGenericError(xmlGenericErrorContext,
4603 "HPP: try CDATA_SECTION\n");break;
4604 case XML_PARSER_END_TAG:
4605 xmlGenericError(xmlGenericErrorContext,
4606 "HPP: try END_TAG\n");break;
4607 case XML_PARSER_ENTITY_DECL:
4608 xmlGenericError(xmlGenericErrorContext,
4609 "HPP: try ENTITY_DECL\n");break;
4610 case XML_PARSER_ENTITY_VALUE:
4611 xmlGenericError(xmlGenericErrorContext,
4612 "HPP: try ENTITY_VALUE\n");break;
4613 case XML_PARSER_ATTRIBUTE_VALUE:
4614 xmlGenericError(xmlGenericErrorContext,
4615 "HPP: try ATTRIBUTE_VALUE\n");break;
4616 case XML_PARSER_DTD:
4617 xmlGenericError(xmlGenericErrorContext,
4618 "HPP: try DTD\n");break;
4619 case XML_PARSER_EPILOG:
4620 xmlGenericError(xmlGenericErrorContext,
4621 "HPP: try EPILOG\n");break;
4622 case XML_PARSER_PI:
4623 xmlGenericError(xmlGenericErrorContext,
4624 "HPP: try PI\n");break;
4625 case XML_PARSER_SYSTEM_LITERAL:
4626 xmlGenericError(xmlGenericErrorContext,
4627 "HPP: try SYSTEM_LITERAL\n");break;
4628 }
4629#endif
4630
4631 while (1) {
4632
4633 in = ctxt->input;
4634 if (in == NULL) break;
4635 if (in->buf == NULL)
4636 avail = in->length - (in->cur - in->base);
4637 else
4638 avail = in->buf->buffer->use - (in->cur - in->base);
4639 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004640 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004641 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4642 /*
4643 * SAX: end of the document processing.
4644 */
4645 ctxt->instate = XML_PARSER_EOF;
4646 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4647 ctxt->sax->endDocument(ctxt->userData);
4648 }
4649 }
4650 if (avail < 1)
4651 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004652 cur = in->cur[0];
4653 if (cur == 0) {
4654 SKIP(1);
4655 continue;
4656 }
4657
Owen Taylor3473f882001-02-23 17:55:21 +00004658 switch (ctxt->instate) {
4659 case XML_PARSER_EOF:
4660 /*
4661 * Document parsing is done !
4662 */
4663 goto done;
4664 case XML_PARSER_START:
4665 /*
4666 * Very first chars read from the document flow.
4667 */
4668 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004669 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004670 SKIP_BLANKS;
4671 if (in->buf == NULL)
4672 avail = in->length - (in->cur - in->base);
4673 else
4674 avail = in->buf->buffer->use - (in->cur - in->base);
4675 }
4676 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4677 ctxt->sax->setDocumentLocator(ctxt->userData,
4678 &xmlDefaultSAXLocator);
4679 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4680 (!ctxt->disableSAX))
4681 ctxt->sax->startDocument(ctxt->userData);
4682
4683 cur = in->cur[0];
4684 next = in->cur[1];
4685 if ((cur == '<') && (next == '!') &&
4686 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4687 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4688 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4689 (UPP(8) == 'E')) {
4690 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004691 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004692 goto done;
4693#ifdef DEBUG_PUSH
4694 xmlGenericError(xmlGenericErrorContext,
4695 "HPP: Parsing internal subset\n");
4696#endif
4697 htmlParseDocTypeDecl(ctxt);
4698 ctxt->instate = XML_PARSER_PROLOG;
4699#ifdef DEBUG_PUSH
4700 xmlGenericError(xmlGenericErrorContext,
4701 "HPP: entering PROLOG\n");
4702#endif
4703 } else {
4704 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004705#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004706 xmlGenericError(xmlGenericErrorContext,
4707 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004708#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004709 }
Owen Taylor3473f882001-02-23 17:55:21 +00004710 break;
4711 case XML_PARSER_MISC:
4712 SKIP_BLANKS;
4713 if (in->buf == NULL)
4714 avail = in->length - (in->cur - in->base);
4715 else
4716 avail = in->buf->buffer->use - (in->cur - in->base);
4717 if (avail < 2)
4718 goto done;
4719 cur = in->cur[0];
4720 next = in->cur[1];
4721 if ((cur == '<') && (next == '!') &&
4722 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4723 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004724 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004725 goto done;
4726#ifdef DEBUG_PUSH
4727 xmlGenericError(xmlGenericErrorContext,
4728 "HPP: Parsing Comment\n");
4729#endif
4730 htmlParseComment(ctxt);
4731 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004732 } else if ((cur == '<') && (next == '?')) {
4733 if ((!terminate) &&
4734 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4735 goto done;
4736#ifdef DEBUG_PUSH
4737 xmlGenericError(xmlGenericErrorContext,
4738 "HPP: Parsing PI\n");
4739#endif
4740 htmlParsePI(ctxt);
4741 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004742 } else if ((cur == '<') && (next == '!') &&
4743 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4744 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4745 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4746 (UPP(8) == 'E')) {
4747 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004748 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004749 goto done;
4750#ifdef DEBUG_PUSH
4751 xmlGenericError(xmlGenericErrorContext,
4752 "HPP: Parsing internal subset\n");
4753#endif
4754 htmlParseDocTypeDecl(ctxt);
4755 ctxt->instate = XML_PARSER_PROLOG;
4756#ifdef DEBUG_PUSH
4757 xmlGenericError(xmlGenericErrorContext,
4758 "HPP: entering PROLOG\n");
4759#endif
4760 } else if ((cur == '<') && (next == '!') &&
4761 (avail < 9)) {
4762 goto done;
4763 } else {
4764 ctxt->instate = XML_PARSER_START_TAG;
4765#ifdef DEBUG_PUSH
4766 xmlGenericError(xmlGenericErrorContext,
4767 "HPP: entering START_TAG\n");
4768#endif
4769 }
4770 break;
4771 case XML_PARSER_PROLOG:
4772 SKIP_BLANKS;
4773 if (in->buf == NULL)
4774 avail = in->length - (in->cur - in->base);
4775 else
4776 avail = in->buf->buffer->use - (in->cur - in->base);
4777 if (avail < 2)
4778 goto done;
4779 cur = in->cur[0];
4780 next = in->cur[1];
4781 if ((cur == '<') && (next == '!') &&
4782 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4783 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004784 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004785 goto done;
4786#ifdef DEBUG_PUSH
4787 xmlGenericError(xmlGenericErrorContext,
4788 "HPP: Parsing Comment\n");
4789#endif
4790 htmlParseComment(ctxt);
4791 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004792 } else if ((cur == '<') && (next == '?')) {
4793 if ((!terminate) &&
4794 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4795 goto done;
4796#ifdef DEBUG_PUSH
4797 xmlGenericError(xmlGenericErrorContext,
4798 "HPP: Parsing PI\n");
4799#endif
4800 htmlParsePI(ctxt);
4801 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004802 } else if ((cur == '<') && (next == '!') &&
4803 (avail < 4)) {
4804 goto done;
4805 } else {
4806 ctxt->instate = XML_PARSER_START_TAG;
4807#ifdef DEBUG_PUSH
4808 xmlGenericError(xmlGenericErrorContext,
4809 "HPP: entering START_TAG\n");
4810#endif
4811 }
4812 break;
4813 case XML_PARSER_EPILOG:
4814 if (in->buf == NULL)
4815 avail = in->length - (in->cur - in->base);
4816 else
4817 avail = in->buf->buffer->use - (in->cur - in->base);
4818 if (avail < 1)
4819 goto done;
4820 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004821 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004822 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004823 goto done;
4824 }
4825 if (avail < 2)
4826 goto done;
4827 next = in->cur[1];
4828 if ((cur == '<') && (next == '!') &&
4829 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4830 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004831 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004832 goto done;
4833#ifdef DEBUG_PUSH
4834 xmlGenericError(xmlGenericErrorContext,
4835 "HPP: Parsing Comment\n");
4836#endif
4837 htmlParseComment(ctxt);
4838 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004839 } else if ((cur == '<') && (next == '?')) {
4840 if ((!terminate) &&
4841 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4842 goto done;
4843#ifdef DEBUG_PUSH
4844 xmlGenericError(xmlGenericErrorContext,
4845 "HPP: Parsing PI\n");
4846#endif
4847 htmlParsePI(ctxt);
4848 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004849 } else if ((cur == '<') && (next == '!') &&
4850 (avail < 4)) {
4851 goto done;
4852 } else {
4853 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004854 ctxt->wellFormed = 0;
4855 ctxt->instate = XML_PARSER_EOF;
4856#ifdef DEBUG_PUSH
4857 xmlGenericError(xmlGenericErrorContext,
4858 "HPP: entering EOF\n");
4859#endif
4860 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4861 ctxt->sax->endDocument(ctxt->userData);
4862 goto done;
4863 }
4864 break;
4865 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004866 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004867 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00004868 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004869
4870 if (avail < 2)
4871 goto done;
4872 cur = in->cur[0];
4873 if (cur != '<') {
4874 ctxt->instate = XML_PARSER_CONTENT;
4875#ifdef DEBUG_PUSH
4876 xmlGenericError(xmlGenericErrorContext,
4877 "HPP: entering CONTENT\n");
4878#endif
4879 break;
4880 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004881 if (in->cur[1] == '/') {
4882 ctxt->instate = XML_PARSER_END_TAG;
4883 ctxt->checkIndex = 0;
4884#ifdef DEBUG_PUSH
4885 xmlGenericError(xmlGenericErrorContext,
4886 "HPP: entering END_TAG\n");
4887#endif
4888 break;
4889 }
Owen Taylor3473f882001-02-23 17:55:21 +00004890 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004891 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004892 goto done;
4893
Daniel Veillard597f1c12005-07-03 23:00:18 +00004894 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004895 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004896 if (failed ||
Owen Taylor3473f882001-02-23 17:55:21 +00004897 (name == NULL)) {
4898 if (CUR == '>')
4899 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004900 break;
4901 }
Owen Taylor3473f882001-02-23 17:55:21 +00004902
4903 /*
4904 * Lookup the info for that element.
4905 */
4906 info = htmlTagLookup(name);
4907 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004908 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4909 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004910 }
4911
4912 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004913 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004914 */
4915 if ((CUR == '/') && (NXT(1) == '>')) {
4916 SKIP(2);
4917 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4918 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004919 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004920 ctxt->instate = XML_PARSER_CONTENT;
4921#ifdef DEBUG_PUSH
4922 xmlGenericError(xmlGenericErrorContext,
4923 "HPP: entering CONTENT\n");
4924#endif
4925 break;
4926 }
4927
4928 if (CUR == '>') {
4929 NEXT;
4930 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004931 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4932 "Couldn't find end of Start Tag %s\n",
4933 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004934
4935 /*
4936 * end of parsing of this node.
4937 */
4938 if (xmlStrEqual(name, ctxt->name)) {
4939 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004940 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004941 }
4942
4943 ctxt->instate = XML_PARSER_CONTENT;
4944#ifdef DEBUG_PUSH
4945 xmlGenericError(xmlGenericErrorContext,
4946 "HPP: entering CONTENT\n");
4947#endif
4948 break;
4949 }
4950
4951 /*
4952 * Check for an Empty Element from DTD definition
4953 */
4954 if ((info != NULL) && (info->empty)) {
4955 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4956 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004957 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004958 }
4959 ctxt->instate = XML_PARSER_CONTENT;
4960#ifdef DEBUG_PUSH
4961 xmlGenericError(xmlGenericErrorContext,
4962 "HPP: entering CONTENT\n");
4963#endif
4964 break;
4965 }
4966 case XML_PARSER_CONTENT: {
4967 long cons;
4968 /*
4969 * Handle preparsed entities and charRef
4970 */
4971 if (ctxt->token != 0) {
4972 xmlChar chr[2] = { 0 , 0 } ;
4973
4974 chr[0] = (xmlChar) ctxt->token;
4975 htmlCheckParagraph(ctxt);
4976 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4977 ctxt->sax->characters(ctxt->userData, chr, 1);
4978 ctxt->token = 0;
4979 ctxt->checkIndex = 0;
4980 }
4981 if ((avail == 1) && (terminate)) {
4982 cur = in->cur[0];
4983 if ((cur != '<') && (cur != '&')) {
4984 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004985 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004986 if (ctxt->sax->ignorableWhitespace != NULL)
4987 ctxt->sax->ignorableWhitespace(
4988 ctxt->userData, &cur, 1);
4989 } else {
4990 htmlCheckParagraph(ctxt);
4991 if (ctxt->sax->characters != NULL)
4992 ctxt->sax->characters(
4993 ctxt->userData, &cur, 1);
4994 }
4995 }
4996 ctxt->token = 0;
4997 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004998 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004999 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005000 }
Owen Taylor3473f882001-02-23 17:55:21 +00005001 }
5002 if (avail < 2)
5003 goto done;
5004 cur = in->cur[0];
5005 next = in->cur[1];
5006 cons = ctxt->nbChars;
5007 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5008 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5009 /*
5010 * Handle SCRIPT/STYLE separately
5011 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005012 if (!terminate) {
5013 int idx;
5014 xmlChar val;
5015
5016 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5017 if (idx < 0)
5018 goto done;
5019 val = in->cur[idx + 2];
5020 if (val == 0) /* bad cut of input */
5021 goto done;
5022 }
Owen Taylor3473f882001-02-23 17:55:21 +00005023 htmlParseScript(ctxt);
5024 if ((cur == '<') && (next == '/')) {
5025 ctxt->instate = XML_PARSER_END_TAG;
5026 ctxt->checkIndex = 0;
5027#ifdef DEBUG_PUSH
5028 xmlGenericError(xmlGenericErrorContext,
5029 "HPP: entering END_TAG\n");
5030#endif
5031 break;
5032 }
5033 } else {
5034 /*
5035 * Sometimes DOCTYPE arrives in the middle of the document
5036 */
5037 if ((cur == '<') && (next == '!') &&
5038 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5039 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5040 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5041 (UPP(8) == 'E')) {
5042 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005043 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005044 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005045 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5046 "Misplaced DOCTYPE declaration\n",
5047 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005048 htmlParseDocTypeDecl(ctxt);
5049 } else if ((cur == '<') && (next == '!') &&
5050 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5051 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005052 (htmlParseLookupSequence(
5053 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005054 goto done;
5055#ifdef DEBUG_PUSH
5056 xmlGenericError(xmlGenericErrorContext,
5057 "HPP: Parsing Comment\n");
5058#endif
5059 htmlParseComment(ctxt);
5060 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005061 } else if ((cur == '<') && (next == '?')) {
5062 if ((!terminate) &&
5063 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5064 goto done;
5065#ifdef DEBUG_PUSH
5066 xmlGenericError(xmlGenericErrorContext,
5067 "HPP: Parsing PI\n");
5068#endif
5069 htmlParsePI(ctxt);
5070 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005071 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5072 goto done;
5073 } else if ((cur == '<') && (next == '/')) {
5074 ctxt->instate = XML_PARSER_END_TAG;
5075 ctxt->checkIndex = 0;
5076#ifdef DEBUG_PUSH
5077 xmlGenericError(xmlGenericErrorContext,
5078 "HPP: entering END_TAG\n");
5079#endif
5080 break;
5081 } else if (cur == '<') {
5082 ctxt->instate = XML_PARSER_START_TAG;
5083 ctxt->checkIndex = 0;
5084#ifdef DEBUG_PUSH
5085 xmlGenericError(xmlGenericErrorContext,
5086 "HPP: entering START_TAG\n");
5087#endif
5088 break;
5089 } else if (cur == '&') {
5090 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005091 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005092 goto done;
5093#ifdef DEBUG_PUSH
5094 xmlGenericError(xmlGenericErrorContext,
5095 "HPP: Parsing Reference\n");
5096#endif
5097 /* TODO: check generation of subtrees if noent !!! */
5098 htmlParseReference(ctxt);
5099 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005100 /*
5101 * check that the text sequence is complete
5102 * before handing out the data to the parser
5103 * to avoid problems with erroneous end of
5104 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005105 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005106 if ((!terminate) &&
5107 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5108 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005109 ctxt->checkIndex = 0;
5110#ifdef DEBUG_PUSH
5111 xmlGenericError(xmlGenericErrorContext,
5112 "HPP: Parsing char data\n");
5113#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005114 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005115 }
5116 }
5117 if (cons == ctxt->nbChars) {
5118 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005119 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5120 "detected an error in element content\n",
5121 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005122 }
5123 NEXT;
5124 break;
5125 }
5126
5127 break;
5128 }
5129 case XML_PARSER_END_TAG:
5130 if (avail < 2)
5131 goto done;
5132 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005133 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005134 goto done;
5135 htmlParseEndTag(ctxt);
5136 if (ctxt->nameNr == 0) {
5137 ctxt->instate = XML_PARSER_EPILOG;
5138 } else {
5139 ctxt->instate = XML_PARSER_CONTENT;
5140 }
5141 ctxt->checkIndex = 0;
5142#ifdef DEBUG_PUSH
5143 xmlGenericError(xmlGenericErrorContext,
5144 "HPP: entering CONTENT\n");
5145#endif
5146 break;
5147 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005148 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5149 "HPP: internal error, state == CDATA\n",
5150 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005151 ctxt->instate = XML_PARSER_CONTENT;
5152 ctxt->checkIndex = 0;
5153#ifdef DEBUG_PUSH
5154 xmlGenericError(xmlGenericErrorContext,
5155 "HPP: entering CONTENT\n");
5156#endif
5157 break;
5158 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005159 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5160 "HPP: internal error, state == DTD\n",
5161 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005162 ctxt->instate = XML_PARSER_CONTENT;
5163 ctxt->checkIndex = 0;
5164#ifdef DEBUG_PUSH
5165 xmlGenericError(xmlGenericErrorContext,
5166 "HPP: entering CONTENT\n");
5167#endif
5168 break;
5169 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005170 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5171 "HPP: internal error, state == COMMENT\n",
5172 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005173 ctxt->instate = XML_PARSER_CONTENT;
5174 ctxt->checkIndex = 0;
5175#ifdef DEBUG_PUSH
5176 xmlGenericError(xmlGenericErrorContext,
5177 "HPP: entering CONTENT\n");
5178#endif
5179 break;
5180 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005181 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5182 "HPP: internal error, state == PI\n",
5183 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005184 ctxt->instate = XML_PARSER_CONTENT;
5185 ctxt->checkIndex = 0;
5186#ifdef DEBUG_PUSH
5187 xmlGenericError(xmlGenericErrorContext,
5188 "HPP: entering CONTENT\n");
5189#endif
5190 break;
5191 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005192 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5193 "HPP: internal error, state == ENTITY_DECL\n",
5194 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005195 ctxt->instate = XML_PARSER_CONTENT;
5196 ctxt->checkIndex = 0;
5197#ifdef DEBUG_PUSH
5198 xmlGenericError(xmlGenericErrorContext,
5199 "HPP: entering CONTENT\n");
5200#endif
5201 break;
5202 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005203 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5204 "HPP: internal error, state == ENTITY_VALUE\n",
5205 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005206 ctxt->instate = XML_PARSER_CONTENT;
5207 ctxt->checkIndex = 0;
5208#ifdef DEBUG_PUSH
5209 xmlGenericError(xmlGenericErrorContext,
5210 "HPP: entering DTD\n");
5211#endif
5212 break;
5213 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005214 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5215 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5216 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005217 ctxt->instate = XML_PARSER_START_TAG;
5218 ctxt->checkIndex = 0;
5219#ifdef DEBUG_PUSH
5220 xmlGenericError(xmlGenericErrorContext,
5221 "HPP: entering START_TAG\n");
5222#endif
5223 break;
5224 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005225 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5226 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5227 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005228 ctxt->instate = XML_PARSER_CONTENT;
5229 ctxt->checkIndex = 0;
5230#ifdef DEBUG_PUSH
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: entering CONTENT\n");
5233#endif
5234 break;
5235 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005236 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5237 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5238 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005239 ctxt->instate = XML_PARSER_CONTENT;
5240 ctxt->checkIndex = 0;
5241#ifdef DEBUG_PUSH
5242 xmlGenericError(xmlGenericErrorContext,
5243 "HPP: entering CONTENT\n");
5244#endif
5245 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005246 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005247 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5248 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5249 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005250 ctxt->instate = XML_PARSER_CONTENT;
5251 ctxt->checkIndex = 0;
5252#ifdef DEBUG_PUSH
5253 xmlGenericError(xmlGenericErrorContext,
5254 "HPP: entering CONTENT\n");
5255#endif
5256 break;
5257
Owen Taylor3473f882001-02-23 17:55:21 +00005258 }
5259 }
5260done:
5261 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005262 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005263 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5264 /*
5265 * SAX: end of the document processing.
5266 */
5267 ctxt->instate = XML_PARSER_EOF;
5268 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5269 ctxt->sax->endDocument(ctxt->userData);
5270 }
5271 }
5272 if ((ctxt->myDoc != NULL) &&
5273 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5274 (ctxt->instate == XML_PARSER_EPILOG))) {
5275 xmlDtdPtr dtd;
5276 dtd = xmlGetIntSubset(ctxt->myDoc);
5277 if (dtd == NULL)
5278 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005279 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005280 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5281 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5282 }
5283#ifdef DEBUG_PUSH
5284 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5285#endif
5286 return(ret);
5287}
5288
5289/**
Owen Taylor3473f882001-02-23 17:55:21 +00005290 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005291 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005292 * @chunk: an char array
5293 * @size: the size in byte of the chunk
5294 * @terminate: last chunk indicator
5295 *
5296 * Parse a Chunk of memory
5297 *
5298 * Returns zero if no error, the xmlParserErrors otherwise.
5299 */
5300int
5301htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5302 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005303 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5304 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5305 "htmlParseChunk: context error\n", NULL, NULL);
5306 return(XML_ERR_INTERNAL_ERROR);
5307 }
Owen Taylor3473f882001-02-23 17:55:21 +00005308 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5309 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5310 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5311 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005312 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005313
Daniel Veillardd2755a82005-08-07 23:42:39 +00005314 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5315 if (res < 0) {
5316 ctxt->errNo = XML_PARSER_EOF;
5317 ctxt->disableSAX = 1;
5318 return (XML_PARSER_EOF);
5319 }
Owen Taylor3473f882001-02-23 17:55:21 +00005320 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5321 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005322 ctxt->input->end =
5323 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005324#ifdef DEBUG_PUSH
5325 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5326#endif
5327
Daniel Veillard14f752c2003-08-09 11:44:50 +00005328#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005329 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5330 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005331#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005332 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005333 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5334 xmlParserInputBufferPtr in = ctxt->input->buf;
5335 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5336 (in->raw != NULL)) {
5337 int nbchars;
5338
5339 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5340 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005341 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5342 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005343 return(XML_ERR_INVALID_ENCODING);
5344 }
5345 }
5346 }
Owen Taylor3473f882001-02-23 17:55:21 +00005347 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005348 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005349 if (terminate) {
5350 if ((ctxt->instate != XML_PARSER_EOF) &&
5351 (ctxt->instate != XML_PARSER_EPILOG) &&
5352 (ctxt->instate != XML_PARSER_MISC)) {
5353 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005354 ctxt->wellFormed = 0;
5355 }
5356 if (ctxt->instate != XML_PARSER_EOF) {
5357 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5358 ctxt->sax->endDocument(ctxt->userData);
5359 }
5360 ctxt->instate = XML_PARSER_EOF;
5361 }
5362 return((xmlParserErrors) ctxt->errNo);
5363}
5364
5365/************************************************************************
5366 * *
5367 * User entry points *
5368 * *
5369 ************************************************************************/
5370
5371/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005372 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005373 * @sax: a SAX handler
5374 * @user_data: The user data returned on SAX callbacks
5375 * @chunk: a pointer to an array of chars
5376 * @size: number of chars in the array
5377 * @filename: an optional file name or URI
5378 * @enc: an optional encoding
5379 *
5380 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005381 * The value of @filename is used for fetching external entities
5382 * and error/warning reports.
5383 *
5384 * Returns the new parser context or NULL
5385 */
5386htmlParserCtxtPtr
5387htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5388 const char *chunk, int size, const char *filename,
5389 xmlCharEncoding enc) {
5390 htmlParserCtxtPtr ctxt;
5391 htmlParserInputPtr inputStream;
5392 xmlParserInputBufferPtr buf;
5393
Daniel Veillardd0463562001-10-13 09:15:48 +00005394 xmlInitParser();
5395
Owen Taylor3473f882001-02-23 17:55:21 +00005396 buf = xmlAllocParserInputBuffer(enc);
5397 if (buf == NULL) return(NULL);
5398
Daniel Veillardf403d292003-10-05 13:51:35 +00005399 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005400 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005401 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005402 return(NULL);
5403 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005404 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5405 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005406 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005407 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005408 xmlFree(ctxt->sax);
5409 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5410 if (ctxt->sax == NULL) {
5411 xmlFree(buf);
5412 xmlFree(ctxt);
5413 return(NULL);
5414 }
5415 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5416 if (user_data != NULL)
5417 ctxt->userData = user_data;
5418 }
5419 if (filename == NULL) {
5420 ctxt->directory = NULL;
5421 } else {
5422 ctxt->directory = xmlParserGetDirectory(filename);
5423 }
5424
5425 inputStream = htmlNewInputStream(ctxt);
5426 if (inputStream == NULL) {
5427 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005428 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005429 return(NULL);
5430 }
5431
5432 if (filename == NULL)
5433 inputStream->filename = NULL;
5434 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005435 inputStream->filename = (char *)
5436 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005437 inputStream->buf = buf;
5438 inputStream->base = inputStream->buf->buffer->content;
5439 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005440 inputStream->end =
5441 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005442
5443 inputPush(ctxt, inputStream);
5444
5445 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5446 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005447 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5448 int cur = ctxt->input->cur - ctxt->input->base;
5449
Owen Taylor3473f882001-02-23 17:55:21 +00005450 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005451
5452 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5453 ctxt->input->cur = ctxt->input->base + cur;
5454 ctxt->input->end =
5455 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005456#ifdef DEBUG_PUSH
5457 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5458#endif
5459 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005460 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005461
5462 return(ctxt);
5463}
William M. Brack21e4ef22005-01-02 09:53:13 +00005464#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005465
5466/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005467 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005468 * @cur: a pointer to an array of xmlChar
5469 * @encoding: a free form C string describing the HTML document encoding, or NULL
5470 * @sax: the SAX handler block
5471 * @userData: if using SAX, this pointer will be provided on callbacks.
5472 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005473 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5474 * to handle parse events. If sax is NULL, fallback to the default DOM
5475 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005476 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005477 * Returns the resulting document tree unless SAX is NULL or the document is
5478 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005479 */
5480
5481htmlDocPtr
5482htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5483 htmlDocPtr ret;
5484 htmlParserCtxtPtr ctxt;
5485
Daniel Veillardd0463562001-10-13 09:15:48 +00005486 xmlInitParser();
5487
Owen Taylor3473f882001-02-23 17:55:21 +00005488 if (cur == NULL) return(NULL);
5489
5490
5491 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5492 if (ctxt == NULL) return(NULL);
5493 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005494 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005495 ctxt->sax = sax;
5496 ctxt->userData = userData;
5497 }
5498
5499 htmlParseDocument(ctxt);
5500 ret = ctxt->myDoc;
5501 if (sax != NULL) {
5502 ctxt->sax = NULL;
5503 ctxt->userData = NULL;
5504 }
5505 htmlFreeParserCtxt(ctxt);
5506
5507 return(ret);
5508}
5509
5510/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005511 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005512 * @cur: a pointer to an array of xmlChar
5513 * @encoding: a free form C string describing the HTML document encoding, or NULL
5514 *
5515 * parse an HTML in-memory document and build a tree.
5516 *
5517 * Returns the resulting document tree
5518 */
5519
5520htmlDocPtr
5521htmlParseDoc(xmlChar *cur, const char *encoding) {
5522 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5523}
5524
5525
5526/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005527 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005528 * @filename: the filename
5529 * @encoding: a free form C string describing the HTML document encoding, or NULL
5530 *
5531 * Create a parser context for a file content.
5532 * Automatic support for ZLIB/Compress compressed document is provided
5533 * by default if found at compile-time.
5534 *
5535 * Returns the new parser context or NULL
5536 */
5537htmlParserCtxtPtr
5538htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5539{
5540 htmlParserCtxtPtr ctxt;
5541 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005542 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005543 /* htmlCharEncoding enc; */
5544 xmlChar *content, *content_line = (xmlChar *) "charset=";
5545
Daniel Veillarda03e3652004-11-02 18:45:30 +00005546 if (filename == NULL)
5547 return(NULL);
5548
Daniel Veillardf403d292003-10-05 13:51:35 +00005549 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005550 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005551 return(NULL);
5552 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005553 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5554 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005555#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005556 if (xmlDefaultSAXHandler.error != NULL) {
5557 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5558 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005559#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005560 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005561 return(NULL);
5562 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005563
5564 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5565 xmlFree(canonicFilename);
5566 if (inputStream == NULL) {
5567 xmlFreeParserCtxt(ctxt);
5568 return(NULL);
5569 }
Owen Taylor3473f882001-02-23 17:55:21 +00005570
5571 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005572
Owen Taylor3473f882001-02-23 17:55:21 +00005573 /* set encoding */
5574 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005575 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005576 if (content) {
5577 strcpy ((char *)content, (char *)content_line);
5578 strcat ((char *)content, (char *)encoding);
5579 htmlCheckEncoding (ctxt, content);
5580 xmlFree (content);
5581 }
5582 }
5583
5584 return(ctxt);
5585}
5586
5587/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005588 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005589 * @filename: the filename
5590 * @encoding: a free form C string describing the HTML document encoding, or NULL
5591 * @sax: the SAX handler block
5592 * @userData: if using SAX, this pointer will be provided on callbacks.
5593 *
5594 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5595 * compressed document is provided by default if found at compile-time.
5596 * It use the given SAX function block to handle the parsing callback.
5597 * If sax is NULL, fallback to the default DOM tree building routines.
5598 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005599 * Returns the resulting document tree unless SAX is NULL or the document is
5600 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005601 */
5602
5603htmlDocPtr
5604htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5605 void *userData) {
5606 htmlDocPtr ret;
5607 htmlParserCtxtPtr ctxt;
5608 htmlSAXHandlerPtr oldsax = NULL;
5609
Daniel Veillardd0463562001-10-13 09:15:48 +00005610 xmlInitParser();
5611
Owen Taylor3473f882001-02-23 17:55:21 +00005612 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5613 if (ctxt == NULL) return(NULL);
5614 if (sax != NULL) {
5615 oldsax = ctxt->sax;
5616 ctxt->sax = sax;
5617 ctxt->userData = userData;
5618 }
5619
5620 htmlParseDocument(ctxt);
5621
5622 ret = ctxt->myDoc;
5623 if (sax != NULL) {
5624 ctxt->sax = oldsax;
5625 ctxt->userData = NULL;
5626 }
5627 htmlFreeParserCtxt(ctxt);
5628
5629 return(ret);
5630}
5631
5632/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005633 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005634 * @filename: the filename
5635 * @encoding: a free form C string describing the HTML document encoding, or NULL
5636 *
5637 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5638 * compressed document is provided by default if found at compile-time.
5639 *
5640 * Returns the resulting document tree
5641 */
5642
5643htmlDocPtr
5644htmlParseFile(const char *filename, const char *encoding) {
5645 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5646}
5647
5648/**
5649 * htmlHandleOmittedElem:
5650 * @val: int 0 or 1
5651 *
5652 * Set and return the previous value for handling HTML omitted tags.
5653 *
5654 * Returns the last value for 0 for no handling, 1 for auto insertion.
5655 */
5656
5657int
5658htmlHandleOmittedElem(int val) {
5659 int old = htmlOmittedDefaultValue;
5660
5661 htmlOmittedDefaultValue = val;
5662 return(old);
5663}
5664
Daniel Veillard930dfb62003-02-05 10:17:38 +00005665/**
5666 * htmlElementAllowedHere:
5667 * @parent: HTML parent element
5668 * @elt: HTML element
5669 *
5670 * Checks whether an HTML element may be a direct child of a parent element.
5671 * Note - doesn't check for deprecated elements
5672 *
5673 * Returns 1 if allowed; 0 otherwise.
5674 */
5675int
5676htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5677 const char** p ;
5678
5679 if ( ! elt || ! parent || ! parent->subelts )
5680 return 0 ;
5681
5682 for ( p = parent->subelts; *p; ++p )
5683 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5684 return 1 ;
5685
5686 return 0 ;
5687}
5688/**
5689 * htmlElementStatusHere:
5690 * @parent: HTML parent element
5691 * @elt: HTML element
5692 *
5693 * Checks whether an HTML element may be a direct child of a parent element.
5694 * and if so whether it is valid or deprecated.
5695 *
5696 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5697 */
5698htmlStatus
5699htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5700 if ( ! parent || ! elt )
5701 return HTML_INVALID ;
5702 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5703 return HTML_INVALID ;
5704
5705 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5706}
5707/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005708 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005709 * @elt: HTML element
5710 * @attr: HTML attribute
5711 * @legacy: whether to allow deprecated attributes
5712 *
5713 * Checks whether an attribute is valid for an element
5714 * Has full knowledge of Required and Deprecated attributes
5715 *
5716 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5717 */
5718htmlStatus
5719htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5720 const char** p ;
5721
5722 if ( !elt || ! attr )
5723 return HTML_INVALID ;
5724
5725 if ( elt->attrs_req )
5726 for ( p = elt->attrs_req; *p; ++p)
5727 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5728 return HTML_REQUIRED ;
5729
5730 if ( elt->attrs_opt )
5731 for ( p = elt->attrs_opt; *p; ++p)
5732 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5733 return HTML_VALID ;
5734
5735 if ( legacy && elt->attrs_depr )
5736 for ( p = elt->attrs_depr; *p; ++p)
5737 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5738 return HTML_DEPRECATED ;
5739
5740 return HTML_INVALID ;
5741}
5742/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005743 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005744 * @node: an htmlNodePtr in a tree
5745 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005746 * for Element nodes)
5747 *
5748 * Checks whether the tree node is valid. Experimental (the author
5749 * only uses the HTML enhancements in a SAX parser)
5750 *
5751 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5752 * legacy allowed) or htmlElementStatusHere (otherwise).
5753 * for Attribute nodes, a return from htmlAttrAllowed
5754 * for other nodes, HTML_NA (no checks performed)
5755 */
5756htmlStatus
5757htmlNodeStatus(const htmlNodePtr node, int legacy) {
5758 if ( ! node )
5759 return HTML_INVALID ;
5760
5761 switch ( node->type ) {
5762 case XML_ELEMENT_NODE:
5763 return legacy
5764 ? ( htmlElementAllowedHere (
5765 htmlTagLookup(node->parent->name) , node->name
5766 ) ? HTML_VALID : HTML_INVALID )
5767 : htmlElementStatusHere(
5768 htmlTagLookup(node->parent->name) ,
5769 htmlTagLookup(node->name) )
5770 ;
5771 case XML_ATTRIBUTE_NODE:
5772 return htmlAttrAllowed(
5773 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5774 default: return HTML_NA ;
5775 }
5776}
Daniel Veillard9475a352003-09-26 12:47:50 +00005777/************************************************************************
5778 * *
5779 * New set (2.6.0) of simpler and more flexible APIs *
5780 * *
5781 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible where the macro
 * is used. @str is evaluated more than once, so it must not have side
 * effects.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: it must be followed by a semicolon and is safe inside
 * an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5793
5794/**
5795 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005796 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005797 *
5798 * Reset a parser context
5799 */
5800void
5801htmlCtxtReset(htmlParserCtxtPtr ctxt)
5802{
5803 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005804 xmlDictPtr dict;
5805
5806 if (ctxt == NULL)
5807 return;
5808
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005809 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005810 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005811
5812 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5813 xmlFreeInputStream(input);
5814 }
5815 ctxt->inputNr = 0;
5816 ctxt->input = NULL;
5817
5818 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005819 if (ctxt->spaceTab != NULL) {
5820 ctxt->spaceTab[0] = -1;
5821 ctxt->space = &ctxt->spaceTab[0];
5822 } else {
5823 ctxt->space = NULL;
5824 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005825
5826
5827 ctxt->nodeNr = 0;
5828 ctxt->node = NULL;
5829
5830 ctxt->nameNr = 0;
5831 ctxt->name = NULL;
5832
5833 DICT_FREE(ctxt->version);
5834 ctxt->version = NULL;
5835 DICT_FREE(ctxt->encoding);
5836 ctxt->encoding = NULL;
5837 DICT_FREE(ctxt->directory);
5838 ctxt->directory = NULL;
5839 DICT_FREE(ctxt->extSubURI);
5840 ctxt->extSubURI = NULL;
5841 DICT_FREE(ctxt->extSubSystem);
5842 ctxt->extSubSystem = NULL;
5843 if (ctxt->myDoc != NULL)
5844 xmlFreeDoc(ctxt->myDoc);
5845 ctxt->myDoc = NULL;
5846
5847 ctxt->standalone = -1;
5848 ctxt->hasExternalSubset = 0;
5849 ctxt->hasPErefs = 0;
5850 ctxt->html = 1;
5851 ctxt->external = 0;
5852 ctxt->instate = XML_PARSER_START;
5853 ctxt->token = 0;
5854
5855 ctxt->wellFormed = 1;
5856 ctxt->nsWellFormed = 1;
5857 ctxt->valid = 1;
5858 ctxt->vctxt.userData = ctxt;
5859 ctxt->vctxt.error = xmlParserValidityError;
5860 ctxt->vctxt.warning = xmlParserValidityWarning;
5861 ctxt->record_info = 0;
5862 ctxt->nbChars = 0;
5863 ctxt->checkIndex = 0;
5864 ctxt->inSubset = 0;
5865 ctxt->errNo = XML_ERR_OK;
5866 ctxt->depth = 0;
5867 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5868 ctxt->catalogs = NULL;
5869 xmlInitNodeInfoSeq(&ctxt->node_seq);
5870
5871 if (ctxt->attsDefault != NULL) {
5872 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5873 ctxt->attsDefault = NULL;
5874 }
5875 if (ctxt->attsSpecial != NULL) {
5876 xmlHashFree(ctxt->attsSpecial, NULL);
5877 ctxt->attsSpecial = NULL;
5878 }
5879}
5880
5881/**
5882 * htmlCtxtUseOptions:
5883 * @ctxt: an HTML parser context
5884 * @options: a combination of htmlParserOption(s)
5885 *
5886 * Applies the options to the parser context
5887 *
5888 * Returns 0 in case of success, the set of unknown or unimplemented options
5889 * in case of error.
5890 */
5891int
5892htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5893{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005894 if (ctxt == NULL)
5895 return(-1);
5896
Daniel Veillard9475a352003-09-26 12:47:50 +00005897 if (options & HTML_PARSE_NOWARNING) {
5898 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005899 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005900 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005901 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005902 }
5903 if (options & HTML_PARSE_NOERROR) {
5904 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005905 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005906 ctxt->sax->fatalError = NULL;
5907 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005908 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005909 }
5910 if (options & HTML_PARSE_PEDANTIC) {
5911 ctxt->pedantic = 1;
5912 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005913 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005914 } else
5915 ctxt->pedantic = 0;
5916 if (options & XML_PARSE_NOBLANKS) {
5917 ctxt->keepBlanks = 0;
5918 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5919 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005920 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005921 } else
5922 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005923 if (options & HTML_PARSE_RECOVER) {
5924 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00005925 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005926 } else
5927 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00005928 if (options & HTML_PARSE_COMPACT) {
5929 ctxt->options |= HTML_PARSE_COMPACT;
5930 options -= HTML_PARSE_COMPACT;
5931 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005932 ctxt->dictNames = 0;
5933 return (options);
5934}
5935
5936/**
5937 * htmlDoRead:
5938 * @ctxt: an HTML parser context
5939 * @URL: the base URL to use for the document
5940 * @encoding: the document encoding, or NULL
5941 * @options: a combination of htmlParserOption(s)
5942 * @reuse: keep the context for reuse
5943 *
5944 * Common front-end for the htmlRead functions
5945 *
5946 * Returns the resulting document tree or NULL
5947 */
5948static htmlDocPtr
5949htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5950 int options, int reuse)
5951{
5952 htmlDocPtr ret;
5953
5954 htmlCtxtUseOptions(ctxt, options);
5955 ctxt->html = 1;
5956 if (encoding != NULL) {
5957 xmlCharEncodingHandlerPtr hdlr;
5958
5959 hdlr = xmlFindCharEncodingHandler(encoding);
5960 if (hdlr != NULL)
5961 xmlSwitchToEncoding(ctxt, hdlr);
5962 }
5963 if ((URL != NULL) && (ctxt->input != NULL) &&
5964 (ctxt->input->filename == NULL))
5965 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5966 htmlParseDocument(ctxt);
5967 ret = ctxt->myDoc;
5968 ctxt->myDoc = NULL;
5969 if (!reuse) {
5970 if ((ctxt->dictNames) &&
5971 (ret != NULL) &&
5972 (ret->dict == ctxt->dict))
5973 ctxt->dict = NULL;
5974 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005975 }
5976 return (ret);
5977}
5978
5979/**
5980 * htmlReadDoc:
5981 * @cur: a pointer to a zero terminated string
5982 * @URL: the base URL to use for the document
5983 * @encoding: the document encoding, or NULL
5984 * @options: a combination of htmlParserOption(s)
5985 *
5986 * parse an XML in-memory document and build a tree.
5987 *
5988 * Returns the resulting document tree
5989 */
5990htmlDocPtr
5991htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5992{
5993 htmlParserCtxtPtr ctxt;
5994
5995 if (cur == NULL)
5996 return (NULL);
5997
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005998 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00005999 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006000 if (ctxt == NULL)
6001 return (NULL);
6002 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6003}
6004
6005/**
6006 * htmlReadFile:
6007 * @filename: a file or URL
6008 * @encoding: the document encoding, or NULL
6009 * @options: a combination of htmlParserOption(s)
6010 *
6011 * parse an XML file from the filesystem or the network.
6012 *
6013 * Returns the resulting document tree
6014 */
6015htmlDocPtr
6016htmlReadFile(const char *filename, const char *encoding, int options)
6017{
6018 htmlParserCtxtPtr ctxt;
6019
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006020 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006021 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6022 if (ctxt == NULL)
6023 return (NULL);
6024 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6025}
6026
6027/**
6028 * htmlReadMemory:
6029 * @buffer: a pointer to a char array
6030 * @size: the size of the array
6031 * @URL: the base URL to use for the document
6032 * @encoding: the document encoding, or NULL
6033 * @options: a combination of htmlParserOption(s)
6034 *
6035 * parse an XML in-memory document and build a tree.
6036 *
6037 * Returns the resulting document tree
6038 */
6039htmlDocPtr
6040htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6041{
6042 htmlParserCtxtPtr ctxt;
6043
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006044 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006045 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6046 if (ctxt == NULL)
6047 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006048 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006049 if (ctxt->sax != NULL)
6050 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006051 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6052}
6053
6054/**
6055 * htmlReadFd:
6056 * @fd: an open file descriptor
6057 * @URL: the base URL to use for the document
6058 * @encoding: the document encoding, or NULL
6059 * @options: a combination of htmlParserOption(s)
6060 *
6061 * parse an XML from a file descriptor and build a tree.
6062 *
6063 * Returns the resulting document tree
6064 */
6065htmlDocPtr
6066htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6067{
6068 htmlParserCtxtPtr ctxt;
6069 xmlParserInputBufferPtr input;
6070 xmlParserInputPtr stream;
6071
6072 if (fd < 0)
6073 return (NULL);
6074
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006075 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006076 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6077 if (input == NULL)
6078 return (NULL);
6079 ctxt = xmlNewParserCtxt();
6080 if (ctxt == NULL) {
6081 xmlFreeParserInputBuffer(input);
6082 return (NULL);
6083 }
6084 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6085 if (stream == NULL) {
6086 xmlFreeParserInputBuffer(input);
6087 xmlFreeParserCtxt(ctxt);
6088 return (NULL);
6089 }
6090 inputPush(ctxt, stream);
6091 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6092}
6093
6094/**
6095 * htmlReadIO:
6096 * @ioread: an I/O read function
6097 * @ioclose: an I/O close function
6098 * @ioctx: an I/O handler
6099 * @URL: the base URL to use for the document
6100 * @encoding: the document encoding, or NULL
6101 * @options: a combination of htmlParserOption(s)
6102 *
6103 * parse an HTML document from I/O functions and source and build a tree.
6104 *
6105 * Returns the resulting document tree
6106 */
6107htmlDocPtr
6108htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6109 void *ioctx, const char *URL, const char *encoding, int options)
6110{
6111 htmlParserCtxtPtr ctxt;
6112 xmlParserInputBufferPtr input;
6113 xmlParserInputPtr stream;
6114
6115 if (ioread == NULL)
6116 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006117 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006118
6119 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6120 XML_CHAR_ENCODING_NONE);
6121 if (input == NULL)
6122 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006123 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006124 if (ctxt == NULL) {
6125 xmlFreeParserInputBuffer(input);
6126 return (NULL);
6127 }
6128 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6129 if (stream == NULL) {
6130 xmlFreeParserInputBuffer(input);
6131 xmlFreeParserCtxt(ctxt);
6132 return (NULL);
6133 }
6134 inputPush(ctxt, stream);
6135 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6136}
6137
6138/**
6139 * htmlCtxtReadDoc:
6140 * @ctxt: an HTML parser context
6141 * @cur: a pointer to a zero terminated string
6142 * @URL: the base URL to use for the document
6143 * @encoding: the document encoding, or NULL
6144 * @options: a combination of htmlParserOption(s)
6145 *
6146 * parse an XML in-memory document and build a tree.
6147 * This reuses the existing @ctxt parser context
6148 *
6149 * Returns the resulting document tree
6150 */
6151htmlDocPtr
6152htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6153 const char *URL, const char *encoding, int options)
6154{
6155 xmlParserInputPtr stream;
6156
6157 if (cur == NULL)
6158 return (NULL);
6159 if (ctxt == NULL)
6160 return (NULL);
6161
6162 htmlCtxtReset(ctxt);
6163
6164 stream = xmlNewStringInputStream(ctxt, cur);
6165 if (stream == NULL) {
6166 return (NULL);
6167 }
6168 inputPush(ctxt, stream);
6169 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6170}
6171
6172/**
6173 * htmlCtxtReadFile:
6174 * @ctxt: an HTML parser context
6175 * @filename: a file or URL
6176 * @encoding: the document encoding, or NULL
6177 * @options: a combination of htmlParserOption(s)
6178 *
6179 * parse an XML file from the filesystem or the network.
6180 * This reuses the existing @ctxt parser context
6181 *
6182 * Returns the resulting document tree
6183 */
6184htmlDocPtr
6185htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6186 const char *encoding, int options)
6187{
6188 xmlParserInputPtr stream;
6189
6190 if (filename == NULL)
6191 return (NULL);
6192 if (ctxt == NULL)
6193 return (NULL);
6194
6195 htmlCtxtReset(ctxt);
6196
Daniel Veillard29614c72004-11-26 10:47:26 +00006197 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006198 if (stream == NULL) {
6199 return (NULL);
6200 }
6201 inputPush(ctxt, stream);
6202 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6203}
6204
6205/**
6206 * htmlCtxtReadMemory:
6207 * @ctxt: an HTML parser context
6208 * @buffer: a pointer to a char array
6209 * @size: the size of the array
6210 * @URL: the base URL to use for the document
6211 * @encoding: the document encoding, or NULL
6212 * @options: a combination of htmlParserOption(s)
6213 *
6214 * parse an XML in-memory document and build a tree.
6215 * This reuses the existing @ctxt parser context
6216 *
6217 * Returns the resulting document tree
6218 */
6219htmlDocPtr
6220htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6221 const char *URL, const char *encoding, int options)
6222{
6223 xmlParserInputBufferPtr input;
6224 xmlParserInputPtr stream;
6225
6226 if (ctxt == NULL)
6227 return (NULL);
6228 if (buffer == NULL)
6229 return (NULL);
6230
6231 htmlCtxtReset(ctxt);
6232
6233 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6234 if (input == NULL) {
6235 return(NULL);
6236 }
6237
6238 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6239 if (stream == NULL) {
6240 xmlFreeParserInputBuffer(input);
6241 return(NULL);
6242 }
6243
6244 inputPush(ctxt, stream);
6245 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6246}
6247
6248/**
6249 * htmlCtxtReadFd:
6250 * @ctxt: an HTML parser context
6251 * @fd: an open file descriptor
6252 * @URL: the base URL to use for the document
6253 * @encoding: the document encoding, or NULL
6254 * @options: a combination of htmlParserOption(s)
6255 *
6256 * parse an XML from a file descriptor and build a tree.
6257 * This reuses the existing @ctxt parser context
6258 *
6259 * Returns the resulting document tree
6260 */
6261htmlDocPtr
6262htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6263 const char *URL, const char *encoding, int options)
6264{
6265 xmlParserInputBufferPtr input;
6266 xmlParserInputPtr stream;
6267
6268 if (fd < 0)
6269 return (NULL);
6270 if (ctxt == NULL)
6271 return (NULL);
6272
6273 htmlCtxtReset(ctxt);
6274
6275
6276 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6277 if (input == NULL)
6278 return (NULL);
6279 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6280 if (stream == NULL) {
6281 xmlFreeParserInputBuffer(input);
6282 return (NULL);
6283 }
6284 inputPush(ctxt, stream);
6285 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6286}
6287
6288/**
6289 * htmlCtxtReadIO:
6290 * @ctxt: an HTML parser context
6291 * @ioread: an I/O read function
6292 * @ioclose: an I/O close function
6293 * @ioctx: an I/O handler
6294 * @URL: the base URL to use for the document
6295 * @encoding: the document encoding, or NULL
6296 * @options: a combination of htmlParserOption(s)
6297 *
6298 * parse an HTML document from I/O functions and source and build a tree.
6299 * This reuses the existing @ctxt parser context
6300 *
6301 * Returns the resulting document tree
6302 */
6303htmlDocPtr
6304htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6305 xmlInputCloseCallback ioclose, void *ioctx,
6306 const char *URL,
6307 const char *encoding, int options)
6308{
6309 xmlParserInputBufferPtr input;
6310 xmlParserInputPtr stream;
6311
6312 if (ioread == NULL)
6313 return (NULL);
6314 if (ctxt == NULL)
6315 return (NULL);
6316
6317 htmlCtxtReset(ctxt);
6318
6319 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6320 XML_CHAR_ENCODING_NONE);
6321 if (input == NULL)
6322 return (NULL);
6323 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6324 if (stream == NULL) {
6325 xmlFreeParserInputBuffer(input);
6326 return (NULL);
6327 }
6328 inputPush(ctxt, stream);
6329 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6330}
6331
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006332#define bottom_HTMLparser
6333#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006334#endif /* LIBXML_HTML_ENABLED */