blob: 29aa89a8040577ada11f4cb94f56988c842bcd1e [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size limits for the parser.  Their users are outside this part of the
 * file; presumably a name-length cap and two working-buffer sizes --
 * TODO confirm against the call sites.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to enable debugging code (see the #ifdef users). */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/* File-local flag, initialised to 1.  NOTE(review): consumers are not
 * visible here; confirm its meaning before changing the default. */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations for functions defined later in this file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
208/*
209 * Macros for accessing the content. Those should be used only by the parser,
210 * and not exported.
211 *
212 * Dirty macros, i.e. one need to make assumption on the context to use them
213 *
214 * CUR_PTR return the current pointer to the xmlChar to be parsed.
215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217 * in UNICODE mode. This should be used internally by the parser
218 * only to compare to ASCII values otherwise it would break when
219 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223 * it should be used only to compare on ASCII based substring.
224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000225 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000226 *
227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
228 *
229 * CURRENT Returns the current char value, with the full decoding of
230 * UTF-8 if we are using this mode. It returns an int.
231 * NEXT Skip to the next character, this does the proper decoding
232 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000233 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
235 */
236
/*
 * All the macros below expect a parser context variable named `ctxt` to be
 * in scope at the point of use.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance by `val` bytes, keeping the character count and the column in
 * step.  The line counter is not touched, and `val` is evaluated three
 * times, so pass a side-effect free expression.
 */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

/* The byte `val` positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current input position. */
#define CUR_PTR ctxt->input->cur

/*
 * Shrink the input once more than 2 * INPUT_CHUNK bytes lie behind the
 * current position and fewer than 2 * INPUT_CHUNK remain ahead of it.
 * NOTE: expands to a bare `if`; do not use it as the body of an `if` that
 * has an `else`.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

/*
 * Ask for more input when not in progressive mode and fewer than
 * INPUT_CHUNK bytes remain.  Same bare-`if` caveat as SHRINK.
 */
#define GROW if ((ctxt->progressive == 0) && \
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a pending token is set on the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Consume one character that is `l` bytes long: update line/column from
 * the byte being left, clear the pending token, advance and count one
 * character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)
276
277/************
278 \
279 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
280 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
281 ************/
282
/* Decode the character at the current position; its byte length goes to l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Same, reading from the string s instead of the parser input. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append the character v, encoded on l bytes, to buffer b at index i and
 * advance i.  Arguments are parenthesized so that expressions such as
 * `n & 3` are compared as a whole.  NOTE: this expands to a bare if/else;
 * do not use it as the unbraced body of another `if`.
 */
#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v))
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000295 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000301 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000302 */
303
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000304static int
Owen Taylor3473f882001-02-23 17:55:21 +0000305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000378 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000379 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000401 {
402 char buffer[150];
403
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Owen Taylor3473f882001-02-23 17:55:21 +0000405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000410 }
411
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
413 *len = 1;
414 return((int) *ctxt->input->cur);
415}
416
417/**
Owen Taylor3473f882001-02-23 17:55:21 +0000418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000426static int
Owen Taylor3473f882001-02-23 17:55:21 +0000427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428 int res = 0;
429
William M. Brack76e95df2003-10-18 16:20:14 +0000430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433 xmlPopInput(ctxt);
434 } else {
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
438 ctxt->input->cur++;
439 ctxt->nbChars++;
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442 }
443 res++;
444 }
445 return(res);
446}
447
448
449
450/************************************************************************
451 * *
452 * The list of HTML elements and their properties *
453 * *
454 ************************************************************************/
455
456/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
459 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000460 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000461 * Depr: this element is deprecated
462 * DTD: 1 means that this element is valid only in the Loose DTD
463 * 2 means that this element is valid only in the Frameset DTD
464 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000465 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000466 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000467 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000468
469/* Definitions and a couple of vars for HTML Elements */
470
/*
 * Element name lists and their sizes.  Each list macro expands to a comma
 * separated sequence of string literals, so lists must always be joined
 * with commas: two adjacent literals would silently be concatenated into
 * one bogus name.  The NB_* sums are parenthesized so they can be used
 * inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
/* PCDATA expands to nothing and therefore does not appear in the list. */
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
502
503
504/* ... and for HTML Attributes */
505
/*
 * Attribute name lists and their sizes.  The list macros expand to comma
 * separated string literals; the NB_* counts are parenthesized so they
 * can be used inside larger expressions.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute lists shared by many elements. */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000523
524
525/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000526static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000529static const char* const target_attr[] = { "target", NULL } ;
530static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* const alt_attr[] = { "alt", NULL } ;
532static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* const href_attrs[] = { "href", NULL } ;
534static const char* const clear_attrs[] = { "clear", NULL } ;
535static const char* const inline_p[] = { INLINE, "p", NULL } ;
536
537static const char* const flow_param[] = { FLOW, "param", NULL } ;
538static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000539 "archive", "alt", "name", "height", "width", "align",
540 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000541static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000542 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000543static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000544 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000545static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
546static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
547static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
548static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000549 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000550static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000551 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
552
553
Daniel Veillard065abe82006-07-03 08:55:04 +0000554static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
555static const char* const col_elt[] = { "col", NULL } ;
556static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
557static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
558static const char* const dl_contents[] = { "dt", "dd", NULL } ;
559static const char* const compact_attr[] = { "compact", NULL } ;
560static const char* const label_attr[] = { "label", NULL } ;
561static const char* const fieldset_contents[] = { FLOW, "legend" } ;
562static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
563static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
564static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
565static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
566static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
567static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
568static const char* const head_attrs[] = { I18N, "profile", NULL } ;
569static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
570static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
571static const char* const version_attr[] = { "version", NULL } ;
572static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
573static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
574static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000575static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000576static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
577static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
578static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
579static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
580static const char* const align_attr[] = { "align", NULL } ;
581static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
582static const char* const map_contents[] = { BLOCK, "area", NULL } ;
583static const char* const name_attr[] = { "name", NULL } ;
584static const char* const action_attr[] = { "action", NULL } ;
585static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
586static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
587static const char* const content_attr[] = { "content", NULL } ;
588static const char* const type_attr[] = { "type", NULL } ;
589static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
590static const char* const object_contents[] = { FLOW, "param", NULL } ;
591static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
592static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
593static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
594static const char* const option_elt[] = { "option", NULL } ;
595static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
596static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
597static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
598static const char* const width_attr[] = { "width", NULL } ;
599static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
600static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
601static const char* const language_attr[] = { "language", NULL } ;
602static const char* const select_content[] = { "optgroup", "option", NULL } ;
603static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
604static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
605static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
606static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
607static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
608static const char* const tr_elt[] = { "tr", NULL } ;
609static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
610static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
611static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
612static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
613static const char* const tr_contents[] = { "th", "td", NULL } ;
614static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
615static const char* const li_elt[] = { "li", NULL } ;
616static const char* const ul_depr[] = { "type", "compact", NULL} ;
617static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000618
/*
 * Cast applied to the `const char* const` name tables above when they are
 * stored in html40ElementTable; it drops the const qualifier on the
 * array elements.
 */
#define DECL (const char**)
620
Daniel Veillard22090732001-07-16 00:06:07 +0000621static const htmlElemDesc
622html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000623{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
624 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
625},
626{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
627 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
628},
629{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
630 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
631},
632{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
633 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
634},
635{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
636 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
637},
638{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
639 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
640},
641{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
642 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
643},
644{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
645 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
646},
647{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
648 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
649},
650{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
651 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
652},
653{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
654 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
655},
656{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
657 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
658},
659{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
660 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
661},
662{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
663 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
664},
665{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
666 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
667},
668{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
669 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
670},
671{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
672 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
673},
674{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
675 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
676},
677{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
678 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
679},
680{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
681 EMPTY , NULL , DECL col_attrs , NULL, NULL
682},
683{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
684 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
685},
686{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
687 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
688},
689{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
690 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
691},
692{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
693 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
694},
695{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
696 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
697},
698{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
699 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
700},
701{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000702 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000703},
704{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
705 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
706},
707{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
708 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
709},
Daniel Veillard491e58e2007-05-02 16:15:18 +0000710{ "embed", 0, 1, 2, 0, 1, 1, 1, "generic embedded object ",
711 EMPTY, NULL, DECL embed_attrs, NULL, NULL
712},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000713{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
714 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
715},
716{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
717 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
718},
719{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
720 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
721},
722{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
723 EMPTY, NULL, NULL, DECL frame_attrs, NULL
724},
725{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
726 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
727},
728{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
729 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
730},
731{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
732 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
733},
734{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
735 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
736},
737{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
738 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
739},
740{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
741 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
742},
743{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
744 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
745},
746{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
747 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
748},
749{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
750 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
751},
752{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
753 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
754},
755{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
756 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
757},
758{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
759 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
760},
761{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000762 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000763},
764{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
765 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
766},
767{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
768 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
769},
770{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
771 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
772},
773{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
774 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
775},
776{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
777 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
778},
779{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
780 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
781},
782{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
783 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
784},
785{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
786 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
787},
788{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000789 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000790},
791{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
792 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
793},
794{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
795 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
796},
797{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
798 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
799},
800{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
801 DECL html_flow, "div", DECL html_attrs, NULL, NULL
802},
803{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
804 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
805},
806{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
807 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
808},
809{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000810 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000811},
812{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
813 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
814},
815{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
816 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
817},
818{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000819 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000820},
821{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
822 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
823},
824{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
825 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
826},
827{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
828 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
829},
830{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
831 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
832},
833{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
834 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
835},
836{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
837 DECL select_content, NULL, DECL select_attrs, NULL, NULL
838},
839{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
840 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
841},
842{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
843 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
844},
845{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
846 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
847},
848{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
850},
851{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
852 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
853},
854{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
855 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
856},
857{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
858 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
859},
860{ "table", 0, 0, 0, 0, 0, 0, 0, "",
861 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
862},
863{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
864 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
865},
866{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
867 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
868},
869{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
870 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
871},
872{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
873 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
874},
875{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
876 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
877},
878{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
879 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
880},
881{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
882 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
883},
884{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
885 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
886},
887{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
888 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
889},
890{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
891 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
892},
893{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
894 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
895},
896{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
897 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
898}
Owen Taylor3473f882001-02-23 17:55:21 +0000899};
900
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups. The first string of
 * a group is a start tag name; the remaining strings are the names of the
 * open elements that this start tag closes. An extra NULL ends the array.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head",
        NULL,
    "head",
        "p",
        NULL,
    "title",
        "p",
        NULL,
    "body",
        "head", "style", "link", "title", "p",
        NULL,
    "frameset",
        "head", "style", "link", "title", "p",
        NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li",
        NULL,
    "hr",
        "p", "head",
        NULL,
    "h1",
        "p", "head",
        NULL,
    "h2",
        "p", "head",
        NULL,
    "h3",
        "p", "head",
        NULL,
    "h4",
        "p", "head",
        NULL,
    "h5",
        "p", "head",
        NULL,
    "h6",
        "p", "head",
        NULL,
    "dir",
        "p", "head",
        NULL,
    "address",
        "p", "head", "ul",
        NULL,
    "pre",
        "p", "head", "ul",
        NULL,
    "listing",
        "p", "head",
        NULL,
    "xmp",
        "p", "head",
        NULL,
    "blockquote",
        "p", "head",
        NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head",
        NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd",
        NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt",
        NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp",
        NULL,
    "ol",
        "p", "head", "ul",
        NULL,
    "menu",
        "p", "head", "ul",
        NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        NULL,
    "div",
        "p", "head",
        NULL,
    "noscript",
        "p", "head",
        NULL,
    "center",
        "font", "b", "i", "p", "head",
        NULL,
    "a",
        "a",
        NULL,
    "caption",
        "p",
        NULL,
    "colgroup",
        "caption", "colgroup", "col", "p",
        NULL,
    "col",
        "caption", "col", "p",
        NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a",
        NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p",
        NULL,
    "thead",
        "caption", "col", "colgroup",
        NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p",
        NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p",
        NULL,
    "optgroup",
        "option",
        NULL,
    "option",
        "option",
        NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a",
        NULL,
    NULL
};
961
/*
 * HTML elements that are not supposed to hold character data directly;
 * htmlCheckParagraph() implies a "p" element when the current element
 * is one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
974
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator; htmlIsScriptAttribute() walks it
 * using sizeof. The form reset handler is spelled "onreset" (it was
 * previously misspelled "onrest", so the real attribute was not matched).
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1000
/*
 * Priorities used by the HTML parser when it meets extra end tags in
 * broken pages. An end tag is only allowed to close elements whose
 * priority is lower than or equal to its own.
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Known priorities. The final entry has a NULL name and carries the
 * default priority returned for any element not listed above it.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001028
/* Pointers to the first string of each group of htmlStartClose. */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1031
1032/************************************************************************
1033 * *
1034 * functions to handle HTML specific data *
1035 * *
1036 ************************************************************************/
1037
1038/**
1039 * htmlInitAutoClose:
1040 *
1041 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1042 * This is not reentrant. Call xmlInitParser() once before processing in
1043 * case of use in multithreaded programs.
1044 */
1045void
1046htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001047 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001048
1049 if (htmlStartCloseIndexinitialized) return;
1050
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001051 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1052 indx = 0;
1053 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001054 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001055 while (htmlStartClose[i] != NULL) i++;
1056 i++;
1057 }
1058 htmlStartCloseIndexinitialized = 1;
1059}
1060
1061/**
1062 * htmlTagLookup:
1063 * @tag: The tag name in lowercase
1064 *
1065 * Lookup the HTML tag in the ElementTable
1066 *
1067 * Returns the related htmlElemDescPtr or NULL if not found.
1068 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001069const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001070htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001071 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001072
1073 for (i = 0; i < (sizeof(html40ElementTable) /
1074 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001075 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001076 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001077 }
1078 return(NULL);
1079}
1080
1081/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001082 * htmlGetEndPriority:
1083 * @name: The name of the element to look up the priority for.
1084 *
1085 * Return value: The "endtag" priority.
1086 **/
1087static int
1088htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001089 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001090
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001091 while ((htmlEndPriority[i].name != NULL) &&
1092 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1093 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001094
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001096}
1097
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001098
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001099/**
Owen Taylor3473f882001-02-23 17:55:21 +00001100 * htmlCheckAutoClose:
1101 * @newtag: The new tag name
1102 * @oldtag: The old tag name
1103 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001104 * Checks whether the new tag is one of the registered valid tags for
1105 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001106 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1107 *
1108 * Returns 0 if no, 1 if yes.
1109 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001110static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001111htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1112{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001113 int i, indx;
1114 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001115
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001116 if (htmlStartCloseIndexinitialized == 0)
1117 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001118
1119 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001120 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001121 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001122 if (closed == NULL)
1123 return (0);
1124 if (xmlStrEqual(BAD_CAST * closed, newtag))
1125 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001126 }
1127
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001128 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001129 i++;
1130 while (htmlStartClose[i] != NULL) {
1131 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001132 return (1);
1133 }
1134 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001135 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001136 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001137}
1138
1139/**
1140 * htmlAutoCloseOnClose:
1141 * @ctxt: an HTML parser context
1142 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001143 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001144 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001145 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001146 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001147static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001148htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1149{
1150 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001151 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001152
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001153 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001154
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001155 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001156
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001157 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1158 break;
1159 /*
1160 * A missplaced endtag can only close elements with lower
1161 * or equal priority, so if we find an element with higher
1162 * priority before we find an element with
1163 * matching name, we just ignore this endtag
1164 */
1165 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1166 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001167 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001168 if (i < 0)
1169 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001170
1171 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001172 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001173 if ((info != NULL) && (info->endTag == 3)) {
1174 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1175 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001176 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001177 }
1178 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1179 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001180 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001181 }
1182}
1183
1184/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001185 * htmlAutoCloseOnEnd:
1186 * @ctxt: an HTML parser context
1187 *
1188 * Close all remaining tags at the end of the stream
1189 */
1190static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001191htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1192{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001193 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001194
William M. Brack899e64a2003-09-26 18:03:42 +00001195 if (ctxt->nameNr == 0)
1196 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001197 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1199 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001200 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001201 }
1202}
1203
1204/**
Owen Taylor3473f882001-02-23 17:55:21 +00001205 * htmlAutoClose:
1206 * @ctxt: an HTML parser context
1207 * @newtag: The new tag name or NULL
1208 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001209 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001210 * The list is kept in htmlStartClose array. This function is
1211 * called when a new tag has been detected and generates the
1212 * appropriates closes if possible/needed.
1213 * If newtag is NULL this mean we are at the end of the resource
1214 * and we should check
1215 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001216static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001217htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1218{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001219 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001220 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001221 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1222 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001223 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 htmlAutoCloseOnEnd(ctxt);
1227 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001228 }
1229 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001230 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1231 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1232 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1234 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001235 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001236 }
Owen Taylor3473f882001-02-23 17:55:21 +00001237}
1238
1239/**
1240 * htmlAutoCloseTag:
1241 * @doc: the HTML document
1242 * @name: The tag name
1243 * @elem: the HTML element
1244 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001245 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001246 * The list is kept in htmlStartClose array. This function checks
1247 * if the element or one of it's children would autoclose the
1248 * given tag.
1249 *
1250 * Returns 1 if autoclose, 0 otherwise
1251 */
1252int
1253htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1254 htmlNodePtr child;
1255
1256 if (elem == NULL) return(1);
1257 if (xmlStrEqual(name, elem->name)) return(0);
1258 if (htmlCheckAutoClose(elem->name, name)) return(1);
1259 child = elem->children;
1260 while (child != NULL) {
1261 if (htmlAutoCloseTag(doc, name, child)) return(1);
1262 child = child->next;
1263 }
1264 return(0);
1265}
1266
1267/**
1268 * htmlIsAutoClosed:
1269 * @doc: the HTML document
1270 * @elem: the HTML element
1271 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001272 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001273 * The list is kept in htmlStartClose array. This function checks
1274 * if a tag is autoclosed by one of it's child
1275 *
1276 * Returns 1 if autoclosed, 0 otherwise
1277 */
1278int
1279htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1280 htmlNodePtr child;
1281
1282 if (elem == NULL) return(1);
1283 child = elem->children;
1284 while (child != NULL) {
1285 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1286 child = child->next;
1287 }
1288 return(0);
1289}
1290
1291/**
1292 * htmlCheckImplied:
1293 * @ctxt: an HTML parser context
1294 * @newtag: The new tag name
1295 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001296 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001297 * called when a new tag has been detected and generates the
1298 * appropriates implicit tags if missing
1299 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001300static void
Owen Taylor3473f882001-02-23 17:55:21 +00001301htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1302 if (!htmlOmittedDefaultValue)
1303 return;
1304 if (xmlStrEqual(newtag, BAD_CAST"html"))
1305 return;
1306 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001307 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001308 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1309 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1310 }
1311 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1312 return;
1313 if ((ctxt->nameNr <= 1) &&
1314 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1316 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1317 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1318 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1319 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1320 /*
1321 * dropped OBJECT ... i you put it first BODY will be
1322 * assumed !
1323 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001324 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001325 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1326 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1327 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1328 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1329 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1330 int i;
1331 for (i = 0;i < ctxt->nameNr;i++) {
1332 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1333 return;
1334 }
1335 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1336 return;
1337 }
1338 }
1339
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001340 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001341 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1342 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1343 }
1344}
1345
1346/**
1347 * htmlCheckParagraph
1348 * @ctxt: an HTML parser context
1349 *
1350 * Check whether a p element need to be implied before inserting
1351 * characters in the current element.
1352 *
1353 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1354 * in case of error.
1355 */
1356
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001357static int
Owen Taylor3473f882001-02-23 17:55:21 +00001358htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1359 const xmlChar *tag;
1360 int i;
1361
1362 if (ctxt == NULL)
1363 return(-1);
1364 tag = ctxt->name;
1365 if (tag == NULL) {
1366 htmlAutoClose(ctxt, BAD_CAST"p");
1367 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001368 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001369 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1370 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1371 return(1);
1372 }
1373 if (!htmlOmittedDefaultValue)
1374 return(0);
1375 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1376 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001377 htmlAutoClose(ctxt, BAD_CAST"p");
1378 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001379 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001380 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1381 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1382 return(1);
1383 }
1384 }
1385 return(0);
1386}
1387
1388/**
1389 * htmlIsScriptAttribute:
1390 * @name: an attribute name
1391 *
1392 * Check if an attribute is of content type Script
1393 *
1394 * Returns 1 is the attribute is a script 0 otherwise
1395 */
1396int
1397htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001398 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001399
1400 if (name == NULL)
1401 return(0);
1402 /*
1403 * all script attributes start with 'on'
1404 */
1405 if ((name[0] != 'o') || (name[1] != 'n'))
1406 return(0);
1407 for (i = 0;
1408 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1409 i++) {
1410 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1411 return(1);
1412 }
1413 return(0);
1414}
1415
1416/************************************************************************
1417 * *
1418 * The list of HTML predefined entities *
1419 * *
1420 ************************************************************************/
1421
1422
Daniel Veillard22090732001-07-16 00:06:07 +00001423static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001424/*
1425 * the 4 absolute ones, plus apostrophe.
1426 */
1427{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1428{ 38, "amp", "ampersand, U+0026 ISOnum" },
1429{ 39, "apos", "single quote" },
1430{ 60, "lt", "less-than sign, U+003C ISOnum" },
1431{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1432
1433/*
1434 * A bunch still in the 128-255 range
1435 * Replacing them depend really on the charset used.
1436 */
1437{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1438{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1439{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1440{ 163, "pound","pound sign, U+00A3 ISOnum" },
1441{ 164, "curren","currency sign, U+00A4 ISOnum" },
1442{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1443{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1444{ 167, "sect", "section sign, U+00A7 ISOnum" },
1445{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1446{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1447{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1448{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1449{ 172, "not", "not sign, U+00AC ISOnum" },
1450{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1451{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1452{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1453{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1454{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1455{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1456{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1457{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1458{ 181, "micro","micro sign, U+00B5 ISOnum" },
1459{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1460{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1461{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1462{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1463{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1464{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1465{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1466{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1467{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1468{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1469{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1470{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1471{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1472{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1473{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1474{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1475{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1476{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1477{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1478{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1479{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1480{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1481{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1482{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1483{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1484{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1485{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1486{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1487{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1488{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1489{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1490{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1491{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1492{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1493{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1494{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1495{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1496{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1497{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1498{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1499{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1500{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1501{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1502{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1503{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1504{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1505{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1506{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1507{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1508{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1509{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1510{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1511{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1512{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1513{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1514{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1515{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1516{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1517{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1518{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1519{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1520{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1521{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1522{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1523{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1524{ 247, "divide","division sign, U+00F7 ISOnum" },
1525{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1526{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1527{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1528{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1529{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1530{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1531{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1532{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1533
1534{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1535{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1536{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1537{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1538{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1539
1540/*
1541 * Anything below should really be kept as entities references
1542 */
1543{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1544
1545{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1546{ 732, "tilde","small tilde, U+02DC ISOdia" },
1547
1548{ 913, "Alpha","greek capital letter alpha, U+0391" },
1549{ 914, "Beta", "greek capital letter beta, U+0392" },
1550{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1551{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1552{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1553{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1554{ 919, "Eta", "greek capital letter eta, U+0397" },
1555{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1556{ 921, "Iota", "greek capital letter iota, U+0399" },
1557{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001558{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001559{ 924, "Mu", "greek capital letter mu, U+039C" },
1560{ 925, "Nu", "greek capital letter nu, U+039D" },
1561{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1562{ 927, "Omicron","greek capital letter omicron, U+039F" },
1563{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1564{ 929, "Rho", "greek capital letter rho, U+03A1" },
1565{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1566{ 932, "Tau", "greek capital letter tau, U+03A4" },
1567{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1568{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1569{ 935, "Chi", "greek capital letter chi, U+03A7" },
1570{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1571{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1572
1573{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1574{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1575{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1576{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1577{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1578{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1579{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1580{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1581{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1582{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1583{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1584{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1585{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1586{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1587{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1588{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1589{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1590{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1591{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1592{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1593{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1594{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1595{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1596{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1597{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1598{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1599{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1600{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1601
1602{ 8194, "ensp", "en space, U+2002 ISOpub" },
1603{ 8195, "emsp", "em space, U+2003 ISOpub" },
1604{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1605{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1606{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1607{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1608{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1609{ 8211, "ndash","en dash, U+2013 ISOpub" },
1610{ 8212, "mdash","em dash, U+2014 ISOpub" },
1611{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1612{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1613{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1614{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1615{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1616{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1617{ 8224, "dagger","dagger, U+2020 ISOpub" },
1618{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1619
1620{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1621{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1622
1623{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1624
1625{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1626{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1627
1628{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1629{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1630
1631{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1632{ 8260, "frasl","fraction slash, U+2044 NEW" },
1633
1634{ 8364, "euro", "euro sign, U+20AC NEW" },
1635
1636{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1637{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1638{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1639{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1640{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1641{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1642{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1643{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1644{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1645{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1646{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1647{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1648{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1649{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1650{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1651{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1652
1653{ 8704, "forall","for all, U+2200 ISOtech" },
1654{ 8706, "part", "partial differential, U+2202 ISOtech" },
1655{ 8707, "exist","there exists, U+2203 ISOtech" },
1656{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1657{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1658{ 8712, "isin", "element of, U+2208 ISOtech" },
1659{ 8713, "notin","not an element of, U+2209 ISOtech" },
1660{ 8715, "ni", "contains as member, U+220B ISOtech" },
1661{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001662{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001663{ 8722, "minus","minus sign, U+2212 ISOtech" },
1664{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1665{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1666{ 8733, "prop", "proportional to, U+221D ISOtech" },
1667{ 8734, "infin","infinity, U+221E ISOtech" },
1668{ 8736, "ang", "angle, U+2220 ISOamso" },
1669{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1670{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1671{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1672{ 8746, "cup", "union = cup, U+222A ISOtech" },
1673{ 8747, "int", "integral, U+222B ISOtech" },
1674{ 8756, "there4","therefore, U+2234 ISOtech" },
1675{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1676{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1677{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1678{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1679{ 8801, "equiv","identical to, U+2261 ISOtech" },
1680{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1681{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1682{ 8834, "sub", "subset of, U+2282 ISOtech" },
1683{ 8835, "sup", "superset of, U+2283 ISOtech" },
1684{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1685{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1686{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1687{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1688{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1689{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1690{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1691{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1692{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1693{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1694{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1695{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1696{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1697{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1698
1699{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1700{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1701{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1702{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1703
1704};
1705
1706/************************************************************************
1707 * *
1708 * Commodity functions to handle entities *
1709 * *
1710 ************************************************************************/
1711
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The expansion uses names taken from the calling function: a `ctxt'
 * variable (passed to htmlErrMemory) and a `buffer##_size' variable
 * holding the capacity of @buffer.
 *
 * On allocation failure the error is reported, the old buffer is freed
 * and the CALLING function returns NULL, so the macro can only be used
 * inside functions returning a pointer.
 *
 * The body is wrapped in do { } while (0) so that "growBuffer(buf);"
 * is a single statement, also as the unbraced body of an if/else.
 */
#define growBuffer(buffer) do {                                         \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
} while (0)
1726
1727/**
1728 * htmlEntityLookup:
1729 * @name: the entity name
1730 *
1731 * Lookup the given entity in EntitiesTable
1732 *
1733 * TODO: the linear scan is really ugly, an hash table is really needed.
1734 *
1735 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1736 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001737const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001738htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001739 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001740
1741 for (i = 0;i < (sizeof(html40EntitiesTable)/
1742 sizeof(html40EntitiesTable[0]));i++) {
1743 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001744 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001745 }
1746 }
1747 return(NULL);
1748}
1749
1750/**
1751 * htmlEntityValueLookup:
1752 * @value: the entity's unicode value
1753 *
1754 * Lookup the given entity in EntitiesTable
1755 *
1756 * TODO: the linear scan is really ugly, an hash table is really needed.
1757 *
1758 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1759 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001760const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001761htmlEntityValueLookup(unsigned int value) {
1762 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001763
1764 for (i = 0;i < (sizeof(html40EntitiesTable)/
1765 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001766 if (html40EntitiesTable[i].value >= value) {
1767 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001768 break;
William M. Brack78637da2003-07-31 14:47:38 +00001769 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001770 }
Owen Taylor3473f882001-02-23 17:55:21 +00001771 }
1772 return(NULL);
1773}
1774
1775/**
1776 * UTF8ToHtml:
1777 * @out: a pointer to an array of bytes to store the result
1778 * @outlen: the length of @out
1779 * @in: a pointer to an array of UTF-8 chars
1780 * @inlen: the length of @in
1781 *
1782 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1783 * plus HTML entities block of chars out.
1784 *
1785 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1786 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001787 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001788 * The value of @outlen after return is the number of octets consumed.
1789 */
1790int
1791UTF8ToHtml(unsigned char* out, int *outlen,
1792 const unsigned char* in, int *inlen) {
1793 const unsigned char* processed = in;
1794 const unsigned char* outend;
1795 const unsigned char* outstart = out;
1796 const unsigned char* instart = in;
1797 const unsigned char* inend;
1798 unsigned int c, d;
1799 int trailing;
1800
Daniel Veillardce682bc2004-11-05 17:22:25 +00001801 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001802 if (in == NULL) {
1803 /*
1804 * initialization nothing to do
1805 */
1806 *outlen = 0;
1807 *inlen = 0;
1808 return(0);
1809 }
1810 inend = in + (*inlen);
1811 outend = out + (*outlen);
1812 while (in < inend) {
1813 d = *in++;
1814 if (d < 0x80) { c= d; trailing= 0; }
1815 else if (d < 0xC0) {
1816 /* trailing byte in leading position */
1817 *outlen = out - outstart;
1818 *inlen = processed - instart;
1819 return(-2);
1820 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1821 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1822 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1823 else {
1824 /* no chance for this in Ascii */
1825 *outlen = out - outstart;
1826 *inlen = processed - instart;
1827 return(-2);
1828 }
1829
1830 if (inend - in < trailing) {
1831 break;
1832 }
1833
1834 for ( ; trailing; trailing--) {
1835 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1836 break;
1837 c <<= 6;
1838 c |= d & 0x3F;
1839 }
1840
1841 /* assertion: c is a single UTF-4 value */
1842 if (c < 0x80) {
1843 if (out + 1 >= outend)
1844 break;
1845 *out++ = c;
1846 } else {
1847 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001848 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001849 const char *cp;
1850 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001851
1852 /*
1853 * Try to lookup a predefined HTML entity for it
1854 */
1855
1856 ent = htmlEntityValueLookup(c);
1857 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001858 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1859 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001860 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001861 else
1862 cp = ent->name;
1863 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001864 if (out + 2 + len >= outend)
1865 break;
1866 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001867 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001868 out += len;
1869 *out++ = ';';
1870 }
1871 processed = in;
1872 }
1873 *outlen = out - outstart;
1874 *inlen = processed - instart;
1875 return(0);
1876}
1877
1878/**
1879 * htmlEncodeEntities:
1880 * @out: a pointer to an array of bytes to store the result
1881 * @outlen: the length of @out
1882 * @in: a pointer to an array of UTF-8 chars
1883 * @inlen: the length of @in
1884 * @quoteChar: the quote character to escape (' or ") or zero.
1885 *
1886 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1887 * plus HTML entities block of chars out.
1888 *
1889 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1890 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001891 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001892 * The value of @outlen after return is the number of octets consumed.
1893 */
1894int
1895htmlEncodeEntities(unsigned char* out, int *outlen,
1896 const unsigned char* in, int *inlen, int quoteChar) {
1897 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001898 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001899 const unsigned char* outstart = out;
1900 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001901 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001902 unsigned int c, d;
1903 int trailing;
1904
Daniel Veillardce682bc2004-11-05 17:22:25 +00001905 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1906 return(-1);
1907 outend = out + (*outlen);
1908 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001909 while (in < inend) {
1910 d = *in++;
1911 if (d < 0x80) { c= d; trailing= 0; }
1912 else if (d < 0xC0) {
1913 /* trailing byte in leading position */
1914 *outlen = out - outstart;
1915 *inlen = processed - instart;
1916 return(-2);
1917 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1918 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1919 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1920 else {
1921 /* no chance for this in Ascii */
1922 *outlen = out - outstart;
1923 *inlen = processed - instart;
1924 return(-2);
1925 }
1926
1927 if (inend - in < trailing)
1928 break;
1929
1930 while (trailing--) {
1931 if (((d= *in++) & 0xC0) != 0x80) {
1932 *outlen = out - outstart;
1933 *inlen = processed - instart;
1934 return(-2);
1935 }
1936 c <<= 6;
1937 c |= d & 0x3F;
1938 }
1939
1940 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001941 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1942 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001943 if (out >= outend)
1944 break;
1945 *out++ = c;
1946 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001947 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001948 const char *cp;
1949 char nbuf[16];
1950 int len;
1951
1952 /*
1953 * Try to lookup a predefined HTML entity for it
1954 */
1955 ent = htmlEntityValueLookup(c);
1956 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001957 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001958 cp = nbuf;
1959 }
1960 else
1961 cp = ent->name;
1962 len = strlen(cp);
1963 if (out + 2 + len > outend)
1964 break;
1965 *out++ = '&';
1966 memcpy(out, cp, len);
1967 out += len;
1968 *out++ = ';';
1969 }
1970 processed = in;
1971 }
1972 *outlen = out - outstart;
1973 *inlen = processed - instart;
1974 return(0);
1975}
1976
Owen Taylor3473f882001-02-23 17:55:21 +00001977/************************************************************************
1978 * *
1979 * Commodity functions to handle streams *
1980 * *
1981 ************************************************************************/
1982
1983/**
Owen Taylor3473f882001-02-23 17:55:21 +00001984 * htmlNewInputStream:
1985 * @ctxt: an HTML parser context
1986 *
1987 * Create a new input stream structure
1988 * Returns the new input stream or NULL
1989 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001990static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001991htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1992 htmlParserInputPtr input;
1993
1994 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1995 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001996 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001997 return(NULL);
1998 }
1999 memset(input, 0, sizeof(htmlParserInput));
2000 input->filename = NULL;
2001 input->directory = NULL;
2002 input->base = NULL;
2003 input->cur = NULL;
2004 input->buf = NULL;
2005 input->line = 1;
2006 input->col = 1;
2007 input->buf = NULL;
2008 input->free = NULL;
2009 input->version = NULL;
2010 input->consumed = 0;
2011 input->length = 0;
2012 return(input);
2013}
2014
2015
2016/************************************************************************
2017 * *
2018 * Commodity functions, cleanup needed ? *
2019 * *
2020 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is read-only, so both the strings and the array of
 * pointers are const.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002035
2036/**
2037 * areBlanks:
2038 * @ctxt: an HTML parser context
2039 * @str: a xmlChar *
2040 * @len: the size of @str
2041 *
2042 * Is this a sequence of blank chars that one can ignore ?
2043 *
2044 * Returns 1 if ignorable 0 otherwise.
2045 */
2046
2047static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002048 unsigned int i;
2049 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002050 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002051 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002052
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002053 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002054 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002055
2056 if (CUR == 0) return(1);
2057 if (CUR != '<') return(0);
2058 if (ctxt->name == NULL)
2059 return(1);
2060 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2061 return(1);
2062 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2063 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002064
2065 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2066 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2067 dtd = xmlGetIntSubset(ctxt->myDoc);
2068 if (dtd != NULL && dtd->ExternalID != NULL) {
2069 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2070 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2071 return(1);
2072 }
2073 }
2074
Owen Taylor3473f882001-02-23 17:55:21 +00002075 if (ctxt->node == NULL) return(0);
2076 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002077 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2078 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002079 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002080 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2081 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002082 /* keep ws in constructs like ...<b> </b>...
2083 for all tags "b" allowing PCDATA */
2084 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2085 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2086 return(0);
2087 }
2088 }
Owen Taylor3473f882001-02-23 17:55:21 +00002089 } else if (xmlNodeIsText(lastChild)) {
2090 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002091 } else {
2092 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2093 for all tags "p" allowing PCDATA */
2094 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2095 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2096 return(0);
2097 }
2098 }
Owen Taylor3473f882001-02-23 17:55:21 +00002099 }
2100 return(1);
2101}
2102
2103/**
Owen Taylor3473f882001-02-23 17:55:21 +00002104 * htmlNewDocNoDtD:
2105 * @URI: URI for the dtd, or NULL
2106 * @ExternalID: the external ID of the DTD, or NULL
2107 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002108 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2109 * are NULL
2110 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002111 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002112 */
2113htmlDocPtr
2114htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2115 xmlDocPtr cur;
2116
2117 /*
2118 * Allocate a new document and fill the fields.
2119 */
2120 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2121 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002122 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002123 return(NULL);
2124 }
2125 memset(cur, 0, sizeof(xmlDoc));
2126
2127 cur->type = XML_HTML_DOCUMENT_NODE;
2128 cur->version = NULL;
2129 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002130 cur->doc = cur;
2131 cur->name = NULL;
2132 cur->children = NULL;
2133 cur->extSubset = NULL;
2134 cur->oldNs = NULL;
2135 cur->encoding = NULL;
2136 cur->standalone = 1;
2137 cur->compression = 0;
2138 cur->ids = NULL;
2139 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002140 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002141 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002142 if ((ExternalID != NULL) ||
2143 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002144 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002145 return(cur);
2146}
2147
2148/**
2149 * htmlNewDoc:
2150 * @URI: URI for the dtd, or NULL
2151 * @ExternalID: the external ID of the DTD, or NULL
2152 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002153 * Creates a new HTML document
2154 *
Owen Taylor3473f882001-02-23 17:55:21 +00002155 * Returns a new document
2156 */
2157htmlDocPtr
2158htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2159 if ((URI == NULL) && (ExternalID == NULL))
2160 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002161 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2162 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002163
2164 return(htmlNewDocNoDtD(URI, ExternalID));
2165}
2166
2167
2168/************************************************************************
2169 * *
2170 * The parser itself *
2171 * Relates to http://www.w3.org/TR/html40 *
2172 * *
2173 ************************************************************************/
2174
2175/************************************************************************
2176 * *
2177 * The parser itself *
2178 * *
2179 ************************************************************************/
2180
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002181static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002182
Owen Taylor3473f882001-02-23 17:55:21 +00002183/**
2184 * htmlParseHTMLName:
2185 * @ctxt: an HTML parser context
2186 *
2187 * parse an HTML tag or attribute name, note that we convert it to lowercase
2188 * since HTML names are not case-sensitive.
2189 *
2190 * Returns the Tag Name parsed or NULL
2191 */
2192
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002193static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002194htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002195 int i = 0;
2196 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2197
William M. Brackd1757ab2004-10-02 22:07:48 +00002198 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002199 (CUR != ':')) return(NULL);
2200
2201 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002202 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002203 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2204 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2205 else loc[i] = CUR;
2206 i++;
2207
2208 NEXT;
2209 }
2210
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002211 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002212}
2213
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002214
2215/**
2216 * htmlParseHTMLName_nonInvasive:
2217 * @ctxt: an HTML parser context
2218 *
2219 * parse an HTML tag or attribute name, note that we convert it to lowercase
2220 * since HTML names are not case-sensitive, this doesn't consume the data
2221 * from the stream, it's a look-ahead
2222 *
2223 * Returns the Tag Name parsed or NULL
2224 */
2225
2226static const xmlChar *
2227htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2228 int i = 0;
2229 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2230
2231 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2232 (NXT(1) != ':')) return(NULL);
2233
2234 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2235 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2236 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2237 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2238 else loc[i] = NXT(1+i);
2239 i++;
2240 }
2241
2242 return(xmlDictLookup(ctxt->dict, loc, i));
2243}
2244
2245
Owen Taylor3473f882001-02-23 17:55:21 +00002246/**
2247 * htmlParseName:
2248 * @ctxt: an HTML parser context
2249 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002250 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002251 *
2252 * Returns the Name parsed or NULL
2253 */
2254
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002255static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002256htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002257 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002258 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002259 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002260
2261 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002262
2263 /*
2264 * Accelerator for simple ASCII names
2265 */
2266 in = ctxt->input->cur;
2267 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2268 ((*in >= 0x41) && (*in <= 0x5A)) ||
2269 (*in == '_') || (*in == ':')) {
2270 in++;
2271 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2272 ((*in >= 0x41) && (*in <= 0x5A)) ||
2273 ((*in >= 0x30) && (*in <= 0x39)) ||
2274 (*in == '_') || (*in == '-') ||
2275 (*in == ':') || (*in == '.'))
2276 in++;
2277 if ((*in > 0) && (*in < 0x80)) {
2278 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002279 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002280 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002281 ctxt->nbChars += count;
2282 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002283 return(ret);
2284 }
2285 }
2286 return(htmlParseNameComplex(ctxt));
2287}
2288
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002289static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002290htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002291 int len = 0, l;
2292 int c;
2293 int count = 0;
2294
2295 /*
2296 * Handler for more complex cases
2297 */
2298 GROW;
2299 c = CUR_CHAR(l);
2300 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2301 (!IS_LETTER(c) && (c != '_') &&
2302 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002303 return(NULL);
2304 }
2305
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002306 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2307 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2308 (c == '.') || (c == '-') ||
2309 (c == '_') || (c == ':') ||
2310 (IS_COMBINING(c)) ||
2311 (IS_EXTENDER(c)))) {
2312 if (count++ > 100) {
2313 count = 0;
2314 GROW;
2315 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002316 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002317 NEXTL(l);
2318 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002319 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002320 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002321}
2322
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002323
Owen Taylor3473f882001-02-23 17:55:21 +00002324/**
2325 * htmlParseHTMLAttribute:
2326 * @ctxt: an HTML parser context
2327 * @stop: a char stop value
2328 *
2329 * parse an HTML attribute value till the stop (quote), if
2330 * stop is 0 then it stops at the first space
2331 *
2332 * Returns the attribute parsed or NULL
2333 */
2334
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002335static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002336htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2337 xmlChar *buffer = NULL;
2338 int buffer_size = 0;
2339 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002340 const xmlChar *name = NULL;
2341 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002342 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002343
2344 /*
2345 * allocate a translation buffer.
2346 */
2347 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002348 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002349 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002350 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002351 return(NULL);
2352 }
2353 out = buffer;
2354
2355 /*
2356 * Ok loop until we reach one of the ending chars
2357 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002358 while ((CUR != 0) && (CUR != stop)) {
2359 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002360 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002361 if (CUR == '&') {
2362 if (NXT(1) == '#') {
2363 unsigned int c;
2364 int bits;
2365
2366 c = htmlParseCharRef(ctxt);
2367 if (c < 0x80)
2368 { *out++ = c; bits= -6; }
2369 else if (c < 0x800)
2370 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2371 else if (c < 0x10000)
2372 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2373 else
2374 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2375
2376 for ( ; bits >= 0; bits-= 6) {
2377 *out++ = ((c >> bits) & 0x3F) | 0x80;
2378 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002379
2380 if (out - buffer > buffer_size - 100) {
2381 int indx = out - buffer;
2382
2383 growBuffer(buffer);
2384 out = &buffer[indx];
2385 }
Owen Taylor3473f882001-02-23 17:55:21 +00002386 } else {
2387 ent = htmlParseEntityRef(ctxt, &name);
2388 if (name == NULL) {
2389 *out++ = '&';
2390 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002391 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002392
2393 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002394 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002395 }
2396 } else if (ent == NULL) {
2397 *out++ = '&';
2398 cur = name;
2399 while (*cur != 0) {
2400 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002401 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002402
2403 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002404 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002405 }
2406 *out++ = *cur++;
2407 }
Owen Taylor3473f882001-02-23 17:55:21 +00002408 } else {
2409 unsigned int c;
2410 int bits;
2411
2412 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002413 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002414
2415 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002416 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002417 }
Daniel Veillard48519092006-10-17 15:56:35 +00002418 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002419 if (c < 0x80)
2420 { *out++ = c; bits= -6; }
2421 else if (c < 0x800)
2422 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2423 else if (c < 0x10000)
2424 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2425 else
2426 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2427
2428 for ( ; bits >= 0; bits-= 6) {
2429 *out++ = ((c >> bits) & 0x3F) | 0x80;
2430 }
Owen Taylor3473f882001-02-23 17:55:21 +00002431 }
2432 }
2433 } else {
2434 unsigned int c;
2435 int bits, l;
2436
2437 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002438 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002439
2440 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002441 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002442 }
2443 c = CUR_CHAR(l);
2444 if (c < 0x80)
2445 { *out++ = c; bits= -6; }
2446 else if (c < 0x800)
2447 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2448 else if (c < 0x10000)
2449 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2450 else
2451 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2452
2453 for ( ; bits >= 0; bits-= 6) {
2454 *out++ = ((c >> bits) & 0x3F) | 0x80;
2455 }
2456 NEXT;
2457 }
2458 }
2459 *out++ = 0;
2460 return(buffer);
2461}
2462
2463/**
Owen Taylor3473f882001-02-23 17:55:21 +00002464 * htmlParseEntityRef:
2465 * @ctxt: an HTML parser context
2466 * @str: location to store the entity name
2467 *
2468 * parse an HTML ENTITY references
2469 *
2470 * [68] EntityRef ::= '&' Name ';'
2471 *
2472 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2473 * if non-NULL *str will have to be freed by the caller.
2474 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002475const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002476htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2477 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002478 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002479
2480 if (str != NULL) *str = NULL;
2481 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002482
2483 if (CUR == '&') {
2484 NEXT;
2485 name = htmlParseName(ctxt);
2486 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002487 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2488 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002489 } else {
2490 GROW;
2491 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002492 if (str != NULL)
2493 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002494
2495 /*
2496 * Lookup the entity in the table.
2497 */
2498 ent = htmlEntityLookup(name);
2499 if (ent != NULL) /* OK that's ugly !!! */
2500 NEXT;
2501 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002502 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2503 "htmlParseEntityRef: expecting ';'\n",
2504 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002505 if (str != NULL)
2506 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002507 }
2508 }
2509 }
2510 return(ent);
2511}
2512
2513/**
2514 * htmlParseAttValue:
2515 * @ctxt: an HTML parser context
2516 *
2517 * parse a value for an attribute
2518 * Note: the parser won't do substitution of entities here, this
2519 * will be handled later in xmlStringGetNodeList, unless it was
2520 * asked for ctxt->replaceEntities != 0
2521 *
2522 * Returns the AttValue parsed or NULL.
2523 */
2524
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002525static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002526htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2527 xmlChar *ret = NULL;
2528
2529 if (CUR == '"') {
2530 NEXT;
2531 ret = htmlParseHTMLAttribute(ctxt, '"');
2532 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002533 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2534 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002535 } else
2536 NEXT;
2537 } else if (CUR == '\'') {
2538 NEXT;
2539 ret = htmlParseHTMLAttribute(ctxt, '\'');
2540 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002541 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2542 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002543 } else
2544 NEXT;
2545 } else {
2546 /*
2547 * That's an HTMLism, the attribute value may not be quoted
2548 */
2549 ret = htmlParseHTMLAttribute(ctxt, 0);
2550 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002551 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2552 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002553 }
2554 }
2555 return(ret);
2556}
2557
2558/**
2559 * htmlParseSystemLiteral:
2560 * @ctxt: an HTML parser context
2561 *
2562 * parse an HTML Literal
2563 *
2564 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2565 *
2566 * Returns the SystemLiteral parsed or NULL
2567 */
2568
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002569static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002570htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2571 const xmlChar *q;
2572 xmlChar *ret = NULL;
2573
2574 if (CUR == '"') {
2575 NEXT;
2576 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002577 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002578 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002579 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002580 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2581 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002582 } else {
2583 ret = xmlStrndup(q, CUR_PTR - q);
2584 NEXT;
2585 }
2586 } else if (CUR == '\'') {
2587 NEXT;
2588 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002589 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002590 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002591 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002592 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2593 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002594 } else {
2595 ret = xmlStrndup(q, CUR_PTR - q);
2596 NEXT;
2597 }
2598 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002599 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2600 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002601 }
2602
2603 return(ret);
2604}
2605
2606/**
2607 * htmlParsePubidLiteral:
2608 * @ctxt: an HTML parser context
2609 *
2610 * parse an HTML public literal
2611 *
2612 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2613 *
2614 * Returns the PubidLiteral parsed or NULL.
2615 */
2616
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002617static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002618htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2619 const xmlChar *q;
2620 xmlChar *ret = NULL;
2621 /*
2622 * Name ::= (Letter | '_') (NameChar)*
2623 */
2624 if (CUR == '"') {
2625 NEXT;
2626 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002627 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002628 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002629 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2630 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002631 } else {
2632 ret = xmlStrndup(q, CUR_PTR - q);
2633 NEXT;
2634 }
2635 } else if (CUR == '\'') {
2636 NEXT;
2637 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002638 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002639 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002640 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002641 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2642 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002643 } else {
2644 ret = xmlStrndup(q, CUR_PTR - q);
2645 NEXT;
2646 }
2647 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002648 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2649 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002650 }
2651
2652 return(ret);
2653}
2654
2655/**
2656 * htmlParseScript:
2657 * @ctxt: an HTML parser context
2658 *
2659 * parse the content of an HTML SCRIPT or STYLE element
2660 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2661 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2662 * http://www.w3.org/TR/html4/types.html#type-script
2663 * http://www.w3.org/TR/html4/types.html#h-6.15
2664 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2665 *
2666 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2667 * element and the value of intrinsic event attributes. User agents must
2668 * not evaluate script data as HTML markup but instead must pass it on as
2669 * data to a script engine.
2670 * NOTES:
2671 * - The content is passed like CDATA
2672 * - the attributes for style and scripting "onXXX" are also described
2673 * as CDATA but SGML allows entities references in attributes so their
2674 * processing is identical as other attributes
2675 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002676static void
Owen Taylor3473f882001-02-23 17:55:21 +00002677htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002678 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002679 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002680 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002681
2682 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002683 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002684 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002685 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002686 /*
2687 * One should break here, the specification is clear:
2688 * Authors should therefore escape "</" within the content.
2689 * Escape mechanisms are specific to each scripting or
2690 * style sheet language.
2691 *
2692 * In recovery mode, only break if end tag match the
2693 * current tag, effectively ignoring all tags inside the
2694 * script/style block and treating the entire block as
2695 * CDATA.
2696 */
2697 if (ctxt->recovery) {
2698 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2699 xmlStrlen(ctxt->name)) == 0)
2700 {
2701 break; /* while */
2702 } else {
2703 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002704 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002705 ctxt->name, NULL);
2706 }
2707 } else {
2708 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2709 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2710 {
2711 break; /* while */
2712 }
2713 }
Owen Taylor3473f882001-02-23 17:55:21 +00002714 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002715 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002716 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2717 if (ctxt->sax->cdataBlock!= NULL) {
2718 /*
2719 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2720 */
2721 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002722 } else if (ctxt->sax->characters != NULL) {
2723 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002724 }
2725 nbchar = 0;
2726 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002727 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002728 NEXTL(l);
2729 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002730 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002731
Daniel Veillard68716a72006-10-16 09:32:17 +00002732 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002733 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2734 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002735 NEXT;
2736 }
2737
2738 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2739 if (ctxt->sax->cdataBlock!= NULL) {
2740 /*
2741 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2742 */
2743 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002744 } else if (ctxt->sax->characters != NULL) {
2745 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002746 }
2747 }
2748}
2749
2750
2751/**
2752 * htmlParseCharData:
2753 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002754 *
2755 * parse a CharData section.
2756 * if we are within a CDATA section ']]>' marks an end of section.
2757 *
2758 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2759 */
2760
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002761static void
2762htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002763 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2764 int nbchar = 0;
2765 int cur, l;
2766
2767 SHRINK;
2768 cur = CUR_CHAR(l);
2769 while (((cur != '<') || (ctxt->token == '<')) &&
2770 ((cur != '&') || (ctxt->token == '&')) &&
2771 (IS_CHAR(cur))) {
2772 COPY_BUF(l,buf,nbchar,cur);
2773 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2774 /*
2775 * Ok the segment is to be consumed as chars.
2776 */
2777 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2778 if (areBlanks(ctxt, buf, nbchar)) {
2779 if (ctxt->sax->ignorableWhitespace != NULL)
2780 ctxt->sax->ignorableWhitespace(ctxt->userData,
2781 buf, nbchar);
2782 } else {
2783 htmlCheckParagraph(ctxt);
2784 if (ctxt->sax->characters != NULL)
2785 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2786 }
2787 }
2788 nbchar = 0;
2789 }
2790 NEXTL(l);
2791 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002792 if (cur == 0) {
2793 SHRINK;
2794 GROW;
2795 cur = CUR_CHAR(l);
2796 }
Owen Taylor3473f882001-02-23 17:55:21 +00002797 }
2798 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002799 buf[nbchar] = 0;
2800
Owen Taylor3473f882001-02-23 17:55:21 +00002801 /*
2802 * Ok the segment is to be consumed as chars.
2803 */
2804 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2805 if (areBlanks(ctxt, buf, nbchar)) {
2806 if (ctxt->sax->ignorableWhitespace != NULL)
2807 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2808 } else {
2809 htmlCheckParagraph(ctxt);
2810 if (ctxt->sax->characters != NULL)
2811 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2812 }
2813 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002814 } else {
2815 /*
2816 * Loop detection
2817 */
2818 if (cur == 0)
2819 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002820 }
2821}
2822
2823/**
2824 * htmlParseExternalID:
2825 * @ctxt: an HTML parser context
2826 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002827 *
2828 * Parse an External ID or a Public ID
2829 *
Owen Taylor3473f882001-02-23 17:55:21 +00002830 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2831 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2832 *
2833 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2834 *
2835 * Returns the function returns SystemLiteral and in the second
2836 * case publicID receives PubidLiteral, is strict is off
2837 * it is possible to return NULL and have publicID set.
2838 */
2839
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002840static xmlChar *
2841htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002842 xmlChar *URI = NULL;
2843
2844 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2845 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2846 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2847 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002848 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002849 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2850 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002851 }
2852 SKIP_BLANKS;
2853 URI = htmlParseSystemLiteral(ctxt);
2854 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002855 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2856 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002857 }
2858 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2859 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2860 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2861 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002862 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002863 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2864 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002865 }
2866 SKIP_BLANKS;
2867 *publicID = htmlParsePubidLiteral(ctxt);
2868 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002869 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2870 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2871 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002872 }
2873 SKIP_BLANKS;
2874 if ((CUR == '"') || (CUR == '\'')) {
2875 URI = htmlParseSystemLiteral(ctxt);
2876 }
2877 }
2878 return(URI);
2879}
2880
/**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
 *
 * parse a Processing Instruction found in an HTML document.
 *
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 */
2889static void
2890htmlParsePI(htmlParserCtxtPtr ctxt) {
2891 xmlChar *buf = NULL;
2892 int len = 0;
2893 int size = HTML_PARSER_BUFFER_SIZE;
2894 int cur, l;
2895 const xmlChar *target;
2896 xmlParserInputState state;
2897 int count = 0;
2898
2899 if ((RAW == '<') && (NXT(1) == '?')) {
2900 state = ctxt->instate;
2901 ctxt->instate = XML_PARSER_PI;
2902 /*
2903 * this is a Processing Instruction.
2904 */
2905 SKIP(2);
2906 SHRINK;
2907
2908 /*
2909 * Parse the target name and check for special support like
2910 * namespace.
2911 */
2912 target = htmlParseName(ctxt);
2913 if (target != NULL) {
2914 if (RAW == '>') {
2915 SKIP(1);
2916
2917 /*
2918 * SAX: PI detected.
2919 */
2920 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2921 (ctxt->sax->processingInstruction != NULL))
2922 ctxt->sax->processingInstruction(ctxt->userData,
2923 target, NULL);
2924 ctxt->instate = state;
2925 return;
2926 }
2927 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2928 if (buf == NULL) {
2929 htmlErrMemory(ctxt, NULL);
2930 ctxt->instate = state;
2931 return;
2932 }
2933 cur = CUR;
2934 if (!IS_BLANK(cur)) {
2935 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2936 "ParsePI: PI %s space expected\n", target, NULL);
2937 }
2938 SKIP_BLANKS;
2939 cur = CUR_CHAR(l);
2940 while (IS_CHAR(cur) && (cur != '>')) {
2941 if (len + 5 >= size) {
2942 xmlChar *tmp;
2943
2944 size *= 2;
2945 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2946 if (tmp == NULL) {
2947 htmlErrMemory(ctxt, NULL);
2948 xmlFree(buf);
2949 ctxt->instate = state;
2950 return;
2951 }
2952 buf = tmp;
2953 }
2954 count++;
2955 if (count > 50) {
2956 GROW;
2957 count = 0;
2958 }
2959 COPY_BUF(l,buf,len,cur);
2960 NEXTL(l);
2961 cur = CUR_CHAR(l);
2962 if (cur == 0) {
2963 SHRINK;
2964 GROW;
2965 cur = CUR_CHAR(l);
2966 }
2967 }
2968 buf[len] = 0;
2969 if (cur != '>') {
2970 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2971 "ParsePI: PI %s never end ...\n", target, NULL);
2972 } else {
2973 SKIP(1);
2974
2975 /*
2976 * SAX: PI detected.
2977 */
2978 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2979 (ctxt->sax->processingInstruction != NULL))
2980 ctxt->sax->processingInstruction(ctxt->userData,
2981 target, buf);
2982 }
2983 xmlFree(buf);
2984 } else {
2985 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2986 "PI is not started correctly", NULL, NULL);
2987 }
2988 ctxt->instate = state;
2989 }
2990}
2991
2992/**
Owen Taylor3473f882001-02-23 17:55:21 +00002993 * htmlParseComment:
2994 * @ctxt: an HTML parser context
2995 *
2996 * Parse an XML (SGML) comment <!-- .... -->
2997 *
2998 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2999 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003000static void
Owen Taylor3473f882001-02-23 17:55:21 +00003001htmlParseComment(htmlParserCtxtPtr ctxt) {
3002 xmlChar *buf = NULL;
3003 int len;
3004 int size = HTML_PARSER_BUFFER_SIZE;
3005 int q, ql;
3006 int r, rl;
3007 int cur, l;
3008 xmlParserInputState state;
3009
3010 /*
3011 * Check that there is a comment right here.
3012 */
3013 if ((RAW != '<') || (NXT(1) != '!') ||
3014 (NXT(2) != '-') || (NXT(3) != '-')) return;
3015
3016 state = ctxt->instate;
3017 ctxt->instate = XML_PARSER_COMMENT;
3018 SHRINK;
3019 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003020 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003021 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003022 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003023 ctxt->instate = state;
3024 return;
3025 }
3026 q = CUR_CHAR(ql);
3027 NEXTL(ql);
3028 r = CUR_CHAR(rl);
3029 NEXTL(rl);
3030 cur = CUR_CHAR(l);
3031 len = 0;
3032 while (IS_CHAR(cur) &&
3033 ((cur != '>') ||
3034 (r != '-') || (q != '-'))) {
3035 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003036 xmlChar *tmp;
3037
Owen Taylor3473f882001-02-23 17:55:21 +00003038 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003039 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3040 if (tmp == NULL) {
3041 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003042 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003043 ctxt->instate = state;
3044 return;
3045 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003046 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003047 }
3048 COPY_BUF(ql,buf,len,q);
3049 q = r;
3050 ql = rl;
3051 r = cur;
3052 rl = l;
3053 NEXTL(l);
3054 cur = CUR_CHAR(l);
3055 if (cur == 0) {
3056 SHRINK;
3057 GROW;
3058 cur = CUR_CHAR(l);
3059 }
3060 }
3061 buf[len] = 0;
3062 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003063 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3064 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003065 xmlFree(buf);
3066 } else {
3067 NEXT;
3068 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3069 (!ctxt->disableSAX))
3070 ctxt->sax->comment(ctxt->userData, buf);
3071 xmlFree(buf);
3072 }
3073 ctxt->instate = state;
3074}
3075
3076/**
3077 * htmlParseCharRef:
3078 * @ctxt: an HTML parser context
3079 *
3080 * parse Reference declarations
3081 *
3082 * [66] CharRef ::= '&#' [0-9]+ ';' |
3083 * '&#x' [0-9a-fA-F]+ ';'
3084 *
3085 * Returns the value parsed (as an int)
3086 */
3087int
3088htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3089 int val = 0;
3090
Daniel Veillarda03e3652004-11-02 18:45:30 +00003091 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3092 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3093 "htmlParseCharRef: context error\n",
3094 NULL, NULL);
3095 return(0);
3096 }
Owen Taylor3473f882001-02-23 17:55:21 +00003097 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003098 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003099 SKIP(3);
3100 while (CUR != ';') {
3101 if ((CUR >= '0') && (CUR <= '9'))
3102 val = val * 16 + (CUR - '0');
3103 else if ((CUR >= 'a') && (CUR <= 'f'))
3104 val = val * 16 + (CUR - 'a') + 10;
3105 else if ((CUR >= 'A') && (CUR <= 'F'))
3106 val = val * 16 + (CUR - 'A') + 10;
3107 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003108 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3109 "htmlParseCharRef: invalid hexadecimal value\n",
3110 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003111 return(0);
3112 }
3113 NEXT;
3114 }
3115 if (CUR == ';')
3116 NEXT;
3117 } else if ((CUR == '&') && (NXT(1) == '#')) {
3118 SKIP(2);
3119 while (CUR != ';') {
3120 if ((CUR >= '0') && (CUR <= '9'))
3121 val = val * 10 + (CUR - '0');
3122 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003123 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3124 "htmlParseCharRef: invalid decimal value\n",
3125 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003126 return(0);
3127 }
3128 NEXT;
3129 }
3130 if (CUR == ';')
3131 NEXT;
3132 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003133 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3134 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003135 }
3136 /*
3137 * Check the value IS_CHAR ...
3138 */
3139 if (IS_CHAR(val)) {
3140 return(val);
3141 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003142 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3143 "htmlParseCharRef: invalid xmlChar value %d\n",
3144 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003145 }
3146 return(0);
3147}
3148
3149
3150/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003151 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003152 * @ctxt: an HTML parser context
3153 *
3154 * parse a DOCTYPE declaration
3155 *
3156 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3157 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3158 */
3159
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003160static void
Owen Taylor3473f882001-02-23 17:55:21 +00003161htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003162 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003163 xmlChar *ExternalID = NULL;
3164 xmlChar *URI = NULL;
3165
3166 /*
3167 * We know that '<!DOCTYPE' has been detected.
3168 */
3169 SKIP(9);
3170
3171 SKIP_BLANKS;
3172
3173 /*
3174 * Parse the DOCTYPE name.
3175 */
3176 name = htmlParseName(ctxt);
3177 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003178 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3179 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3180 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003181 }
3182 /*
3183 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3184 */
3185
3186 SKIP_BLANKS;
3187
3188 /*
3189 * Check for SystemID and ExternalID
3190 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003191 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003192 SKIP_BLANKS;
3193
3194 /*
3195 * We should be at the end of the DOCTYPE declaration.
3196 */
3197 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003198 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3199 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003200 /* We shouldn't try to resynchronize ... */
3201 }
3202 NEXT;
3203
3204 /*
3205 * Create or update the document accordingly to the DOCTYPE
3206 */
3207 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3208 (!ctxt->disableSAX))
3209 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3210
3211 /*
3212 * Cleanup, since we don't use all those identifiers
3213 */
3214 if (URI != NULL) xmlFree(URI);
3215 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003216}
3217
3218/**
3219 * htmlParseAttribute:
3220 * @ctxt: an HTML parser context
3221 * @value: a xmlChar ** used to store the value of the attribute
3222 *
3223 * parse an attribute
3224 *
3225 * [41] Attribute ::= Name Eq AttValue
3226 *
3227 * [25] Eq ::= S? '=' S?
3228 *
3229 * With namespace:
3230 *
3231 * [NS 11] Attribute ::= QName Eq AttValue
3232 *
3233 * Also the case QName == xmlns:??? is handled independently as a namespace
3234 * definition.
3235 *
3236 * Returns the attribute name, and the value in *value.
3237 */
3238
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003239static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003240htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003241 const xmlChar *name;
3242 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003243
3244 *value = NULL;
3245 name = htmlParseHTMLName(ctxt);
3246 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003247 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3248 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003249 return(NULL);
3250 }
3251
3252 /*
3253 * read the value
3254 */
3255 SKIP_BLANKS;
3256 if (CUR == '=') {
3257 NEXT;
3258 SKIP_BLANKS;
3259 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003260 } else if (htmlIsBooleanAttr(name)) {
3261 /*
3262 * assume a minimized attribute
3263 */
3264 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003265 }
3266
3267 *value = val;
3268 return(name);
3269}
3270
3271/**
3272 * htmlCheckEncoding:
3273 * @ctxt: an HTML parser context
3274 * @attvalue: the attribute value
3275 *
3276 * Checks an http-equiv attribute from a Meta tag to detect
3277 * the encoding
3278 * If a new encoding is detected the parser is switched to decode
3279 * it and pass UTF8
3280 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003281static void
Owen Taylor3473f882001-02-23 17:55:21 +00003282htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3283 const xmlChar *encoding;
3284
3285 if ((ctxt == NULL) || (attvalue == NULL))
3286 return;
3287
3288 /* do not change encoding */
3289 if (ctxt->input->encoding != NULL)
3290 return;
3291
3292 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3293 if (encoding != NULL) {
3294 encoding += 8;
3295 } else {
3296 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3297 if (encoding != NULL)
3298 encoding += 9;
3299 }
3300 if (encoding != NULL) {
3301 xmlCharEncoding enc;
3302 xmlCharEncodingHandlerPtr handler;
3303
3304 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3305
3306 if (ctxt->input->encoding != NULL)
3307 xmlFree((xmlChar *) ctxt->input->encoding);
3308 ctxt->input->encoding = xmlStrdup(encoding);
3309
3310 enc = xmlParseCharEncoding((const char *) encoding);
3311 /*
3312 * registered set of known encodings
3313 */
3314 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003315 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3316 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3317 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3318 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3319 (ctxt->input->buf != NULL) &&
3320 (ctxt->input->buf->encoder == NULL)) {
3321 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3322 "htmlCheckEncoding: wrong encoding meta\n",
3323 NULL, NULL);
3324 } else {
3325 xmlSwitchEncoding(ctxt, enc);
3326 }
Owen Taylor3473f882001-02-23 17:55:21 +00003327 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3328 } else {
3329 /*
3330 * fallback for unknown encodings
3331 */
3332 handler = xmlFindCharEncodingHandler((const char *) encoding);
3333 if (handler != NULL) {
3334 xmlSwitchToEncoding(ctxt, handler);
3335 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3336 } else {
3337 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3338 }
3339 }
3340
3341 if ((ctxt->input->buf != NULL) &&
3342 (ctxt->input->buf->encoder != NULL) &&
3343 (ctxt->input->buf->raw != NULL) &&
3344 (ctxt->input->buf->buffer != NULL)) {
3345 int nbchars;
3346 int processed;
3347
3348 /*
3349 * convert as much as possible to the parser reading buffer.
3350 */
3351 processed = ctxt->input->cur - ctxt->input->base;
3352 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3353 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3354 ctxt->input->buf->buffer,
3355 ctxt->input->buf->raw);
3356 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003357 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3358 "htmlCheckEncoding: encoder error\n",
3359 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003360 }
3361 ctxt->input->base =
3362 ctxt->input->cur = ctxt->input->buf->buffer->content;
3363 }
3364 }
3365}
3366
3367/**
3368 * htmlCheckMeta:
3369 * @ctxt: an HTML parser context
3370 * @atts: the attributes values
3371 *
3372 * Checks an attributes from a Meta tag
3373 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003374static void
Owen Taylor3473f882001-02-23 17:55:21 +00003375htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3376 int i;
3377 const xmlChar *att, *value;
3378 int http = 0;
3379 const xmlChar *content = NULL;
3380
3381 if ((ctxt == NULL) || (atts == NULL))
3382 return;
3383
3384 i = 0;
3385 att = atts[i++];
3386 while (att != NULL) {
3387 value = atts[i++];
3388 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3389 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3390 http = 1;
3391 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3392 content = value;
3393 att = atts[i++];
3394 }
3395 if ((http) && (content != NULL))
3396 htmlCheckEncoding(ctxt, content);
3397
3398}
3399
3400/**
3401 * htmlParseStartTag:
3402 * @ctxt: an HTML parser context
3403 *
3404 * parse a start of tag either for rule element or
3405 * EmptyElement. In both case we don't parse the tag closing chars.
3406 *
3407 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3408 *
3409 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3410 *
3411 * With namespace:
3412 *
3413 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3414 *
3415 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3416 *
Daniel Veillard597f1c12005-07-03 23:00:18 +00003417 * Returns 0 in case of success and -1 in case of error.
Owen Taylor3473f882001-02-23 17:55:21 +00003418 */
3419
Daniel Veillard597f1c12005-07-03 23:00:18 +00003420static int
Owen Taylor3473f882001-02-23 17:55:21 +00003421htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003422 const xmlChar *name;
3423 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003424 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003425 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003426 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003427 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003428 int meta = 0;
3429 int i;
3430
Daniel Veillarda03e3652004-11-02 18:45:30 +00003431 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3432 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3433 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003434 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003435 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003436 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003437 NEXT;
3438
Daniel Veillard30e76072006-03-09 14:13:55 +00003439 atts = ctxt->atts;
3440 maxatts = ctxt->maxatts;
3441
Owen Taylor3473f882001-02-23 17:55:21 +00003442 GROW;
3443 name = htmlParseHTMLName(ctxt);
3444 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003445 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3446 "htmlParseStartTag: invalid element name\n",
3447 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003448 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003449 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003450 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003451 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003452 }
3453 if (xmlStrEqual(name, BAD_CAST"meta"))
3454 meta = 1;
3455
3456 /*
3457 * Check for auto-closure of HTML elements.
3458 */
3459 htmlAutoClose(ctxt, name);
3460
3461 /*
3462 * Check for implied HTML elements.
3463 */
3464 htmlCheckImplied(ctxt, name);
3465
3466 /*
3467 * Avoid html at any level > 0, head at any level != 1
3468 * or any attempt to recurse body
3469 */
3470 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003471 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3472 "htmlParseStartTag: misplaced <html> tag\n",
3473 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003474 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003475 }
3476 if ((ctxt->nameNr != 1) &&
3477 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003478 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3479 "htmlParseStartTag: misplaced <head> tag\n",
3480 name, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003481 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003482 }
3483 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003484 int indx;
3485 for (indx = 0;indx < ctxt->nameNr;indx++) {
3486 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003487 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3488 "htmlParseStartTag: misplaced <body> tag\n",
3489 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003490 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3491 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003492 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003493 }
3494 }
3495 }
3496
3497 /*
3498 * Now parse the attributes, it ends up with the ending
3499 *
3500 * (S Attribute)* S?
3501 */
3502 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003503 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003504 (CUR != '>') &&
3505 ((CUR != '/') || (NXT(1) != '>'))) {
3506 long cons = ctxt->nbChars;
3507
3508 GROW;
3509 attname = htmlParseAttribute(ctxt, &attvalue);
3510 if (attname != NULL) {
3511
3512 /*
3513 * Well formedness requires at most one declaration of an attribute
3514 */
3515 for (i = 0; i < nbatts;i += 2) {
3516 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003517 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3518 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003519 if (attvalue != NULL)
3520 xmlFree(attvalue);
3521 goto failed;
3522 }
3523 }
3524
3525 /*
3526 * Add the pair to atts
3527 */
3528 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003529 maxatts = 22; /* allow for 10 attrs by default */
3530 atts = (const xmlChar **)
3531 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003532 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003533 htmlErrMemory(ctxt, NULL);
3534 if (attvalue != NULL)
3535 xmlFree(attvalue);
3536 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003537 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003538 ctxt->atts = atts;
3539 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003540 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003541 const xmlChar **n;
3542
Owen Taylor3473f882001-02-23 17:55:21 +00003543 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003544 n = (const xmlChar **) xmlRealloc((void *) atts,
3545 maxatts * sizeof(const xmlChar *));
3546 if (n == NULL) {
3547 htmlErrMemory(ctxt, NULL);
3548 if (attvalue != NULL)
3549 xmlFree(attvalue);
3550 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003551 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003552 atts = n;
3553 ctxt->atts = atts;
3554 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003555 }
3556 atts[nbatts++] = attname;
3557 atts[nbatts++] = attvalue;
3558 atts[nbatts] = NULL;
3559 atts[nbatts + 1] = NULL;
3560 }
3561 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003562 if (attvalue != NULL)
3563 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003564 /* Dump the bogus attribute string up to the next blank or
3565 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003566 while ((IS_CHAR_CH(CUR)) &&
3567 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003568 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003569 NEXT;
3570 }
3571
3572failed:
3573 SKIP_BLANKS;
3574 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003575 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3576 "htmlParseStartTag: problem parsing attributes\n",
3577 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003578 break;
3579 }
3580 }
3581
3582 /*
3583 * Handle specific association to the META tag
3584 */
William M. Bracke978ae22007-03-21 06:16:02 +00003585 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003586 htmlCheckMeta(ctxt, atts);
3587
3588 /*
3589 * SAX: Start of Element !
3590 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003591 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003592 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3593 if (nbatts != 0)
3594 ctxt->sax->startElement(ctxt->userData, name, atts);
3595 else
3596 ctxt->sax->startElement(ctxt->userData, name, NULL);
3597 }
Owen Taylor3473f882001-02-23 17:55:21 +00003598
3599 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003600 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003601 if (atts[i] != NULL)
3602 xmlFree((xmlChar *) atts[i]);
3603 }
Owen Taylor3473f882001-02-23 17:55:21 +00003604 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003605
3606 return 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003607}
3608
3609/**
3610 * htmlParseEndTag:
3611 * @ctxt: an HTML parser context
3612 *
3613 * parse an end of tag
3614 *
3615 * [42] ETag ::= '</' Name S? '>'
3616 *
3617 * With namespace
3618 *
3619 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003620 *
3621 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003622 */
3623
Daniel Veillardf420ac52001-07-04 16:04:09 +00003624static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003625htmlParseEndTag(htmlParserCtxtPtr ctxt)
3626{
3627 const xmlChar *name;
3628 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003629 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003630
3631 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003632 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3633 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003634 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003635 }
3636 SKIP(2);
3637
3638 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003639 if (name == NULL)
3640 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003641
3642 /*
3643 * We should definitely be at the ending "S? '>'" part
3644 */
3645 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003646 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003647 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3648 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003649 if (ctxt->recovery) {
3650 /*
3651 * We're not at the ending > !!
3652 * Error, unless in recover mode where we search forwards
3653 * until we find a >
3654 */
3655 while (CUR != '\0' && CUR != '>') NEXT;
3656 NEXT;
3657 }
Owen Taylor3473f882001-02-23 17:55:21 +00003658 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003659 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003660
3661 /*
3662 * If the name read is not one of the element in the parsing stack
3663 * then return, it's just an error.
3664 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003665 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3666 if (xmlStrEqual(name, ctxt->nameTab[i]))
3667 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003668 }
3669 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003670 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3671 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003672 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003673 }
3674
3675
3676 /*
3677 * Check for auto-closure of HTML elements.
3678 */
3679
3680 htmlAutoCloseOnClose(ctxt, name);
3681
3682 /*
3683 * Well formedness constraints, opening and closing must match.
3684 * With the exception that the autoclose may have popped stuff out
3685 * of the stack.
3686 */
3687 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003688 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003689 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3690 "Opening and ending tag mismatch: %s and %s\n",
3691 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003692 }
3693 }
3694
3695 /*
3696 * SAX: End of Tag
3697 */
3698 oldname = ctxt->name;
3699 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003700 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3701 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003702 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003703 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003704 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003705 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003706 }
3707
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003708 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003709}
3710
3711
3712/**
3713 * htmlParseReference:
3714 * @ctxt: an HTML parser context
3715 *
3716 * parse and handle entity references in content,
3717 * this will end-up in a call to character() since this is either a
3718 * CharRef, or a predefined entity.
3719 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003720static void
Owen Taylor3473f882001-02-23 17:55:21 +00003721htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003722 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003723 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003724 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003725 if (CUR != '&') return;
3726
3727 if (NXT(1) == '#') {
3728 unsigned int c;
3729 int bits, i = 0;
3730
3731 c = htmlParseCharRef(ctxt);
3732 if (c == 0)
3733 return;
3734
3735 if (c < 0x80) { out[i++]= c; bits= -6; }
3736 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3737 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3738 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3739
3740 for ( ; bits >= 0; bits-= 6) {
3741 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3742 }
3743 out[i] = 0;
3744
3745 htmlCheckParagraph(ctxt);
3746 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3747 ctxt->sax->characters(ctxt->userData, out, i);
3748 } else {
3749 ent = htmlParseEntityRef(ctxt, &name);
3750 if (name == NULL) {
3751 htmlCheckParagraph(ctxt);
3752 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3753 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3754 return;
3755 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003756 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003757 htmlCheckParagraph(ctxt);
3758 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3759 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3760 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3761 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3762 }
3763 } else {
3764 unsigned int c;
3765 int bits, i = 0;
3766
3767 c = ent->value;
3768 if (c < 0x80)
3769 { out[i++]= c; bits= -6; }
3770 else if (c < 0x800)
3771 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3772 else if (c < 0x10000)
3773 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3774 else
3775 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3776
3777 for ( ; bits >= 0; bits-= 6) {
3778 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3779 }
3780 out[i] = 0;
3781
3782 htmlCheckParagraph(ctxt);
3783 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3784 ctxt->sax->characters(ctxt->userData, out, i);
3785 }
Owen Taylor3473f882001-02-23 17:55:21 +00003786 }
3787}
3788
3789/**
3790 * htmlParseContent:
3791 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003792 *
3793 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003794 */
3795
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003796static void
Owen Taylor3473f882001-02-23 17:55:21 +00003797htmlParseContent(htmlParserCtxtPtr ctxt) {
3798 xmlChar *currentNode;
3799 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003800 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003801
3802 currentNode = xmlStrdup(ctxt->name);
3803 depth = ctxt->nameNr;
3804 while (1) {
3805 long cons = ctxt->nbChars;
3806
3807 GROW;
3808 /*
3809 * Our tag or one of it's parent or children is ending.
3810 */
3811 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003812 if (htmlParseEndTag(ctxt) &&
3813 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3814 if (currentNode != NULL)
3815 xmlFree(currentNode);
3816 return;
3817 }
3818 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003819 }
3820
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003821 else if ((CUR == '<') &&
3822 ((IS_ASCII_LETTER(NXT(1))) ||
3823 (NXT(1) == '_') || (NXT(1) == ':'))) {
3824 name = htmlParseHTMLName_nonInvasive(ctxt);
3825 if (name == NULL) {
3826 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3827 "htmlParseStartTag: invalid element name\n",
3828 NULL, NULL);
3829 /* Dump the bogus tag like browsers do */
3830 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3831 NEXT;
3832
3833 if (currentNode != NULL)
3834 xmlFree(currentNode);
3835 return;
3836 }
3837
3838 if (ctxt->name != NULL) {
3839 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3840 htmlAutoClose(ctxt, name);
3841 continue;
3842 }
3843 }
3844 }
3845
Owen Taylor3473f882001-02-23 17:55:21 +00003846 /*
3847 * Has this node been popped out during parsing of
3848 * the next element
3849 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003850 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3851 (!xmlStrEqual(currentNode, ctxt->name)))
3852 {
Owen Taylor3473f882001-02-23 17:55:21 +00003853 if (currentNode != NULL) xmlFree(currentNode);
3854 return;
3855 }
3856
Daniel Veillardf9533d12001-03-03 10:04:57 +00003857 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3858 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003859 /*
3860 * Handle SCRIPT/STYLE separately
3861 */
3862 htmlParseScript(ctxt);
3863 } else {
3864 /*
3865 * Sometimes DOCTYPE arrives in the middle of the document
3866 */
3867 if ((CUR == '<') && (NXT(1) == '!') &&
3868 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3869 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3870 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3871 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003872 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3873 "Misplaced DOCTYPE declaration\n",
3874 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003875 htmlParseDocTypeDecl(ctxt);
3876 }
3877
3878 /*
3879 * First case : a comment
3880 */
3881 if ((CUR == '<') && (NXT(1) == '!') &&
3882 (NXT(2) == '-') && (NXT(3) == '-')) {
3883 htmlParseComment(ctxt);
3884 }
3885
3886 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003887 * Second case : a Processing Instruction.
3888 */
3889 else if ((CUR == '<') && (NXT(1) == '?')) {
3890 htmlParsePI(ctxt);
3891 }
3892
3893 /*
3894 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003895 */
3896 else if (CUR == '<') {
3897 htmlParseElement(ctxt);
3898 }
3899
3900 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003901 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00003902 * parsing returns it's Name, create the node
3903 */
3904 else if (CUR == '&') {
3905 htmlParseReference(ctxt);
3906 }
3907
3908 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003909 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00003910 */
3911 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003912 htmlAutoCloseOnEnd(ctxt);
3913 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003914 }
3915
3916 /*
3917 * Last case, text. Note that References are handled directly.
3918 */
3919 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003920 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003921 }
3922
3923 if (cons == ctxt->nbChars) {
3924 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003925 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3926 "detected an error in element content\n",
3927 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003928 }
3929 break;
3930 }
3931 }
3932 GROW;
3933 }
3934 if (currentNode != NULL) xmlFree(currentNode);
3935}
3936
3937/**
Daniel Veillard499cc922006-01-18 17:22:35 +00003938 * htmlParseContent:
3939 * @ctxt: an HTML parser context
3940 *
3941 * Parse a content: comment, sub-element, reference or text.
3942 */
3943
3944void
3945__htmlParseContent(void *ctxt) {
3946 if (ctxt != NULL)
3947 htmlParseContent((htmlParserCtxtPtr) ctxt);
3948}
3949
3950/**
Owen Taylor3473f882001-02-23 17:55:21 +00003951 * htmlParseElement:
3952 * @ctxt: an HTML parser context
3953 *
3954 * parse an HTML element, this is highly recursive
3955 *
3956 * [39] element ::= EmptyElemTag | STag content ETag
3957 *
3958 * [41] Attribute ::= Name Eq AttValue
3959 */
3960
3961void
3962htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003963 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003964 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003965 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003966 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003967 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003968 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003969 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003970
Daniel Veillarda03e3652004-11-02 18:45:30 +00003971 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3972 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00003973 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003974 return;
3975 }
Owen Taylor3473f882001-02-23 17:55:21 +00003976 /* Capture start position */
3977 if (ctxt->record_info) {
3978 node_info.begin_pos = ctxt->input->consumed +
3979 (CUR_PTR - ctxt->input->base);
3980 node_info.begin_line = ctxt->input->line;
3981 }
3982
Daniel Veillard597f1c12005-07-03 23:00:18 +00003983 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003984 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003985 if (failed || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003986 if (CUR == '>')
3987 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003988 return;
3989 }
Owen Taylor3473f882001-02-23 17:55:21 +00003990
3991 /*
3992 * Lookup the info for that element.
3993 */
3994 info = htmlTagLookup(name);
3995 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003996 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3997 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003998 }
3999
4000 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004001 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004002 */
4003 if ((CUR == '/') && (NXT(1) == '>')) {
4004 SKIP(2);
4005 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4006 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004007 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004008 return;
4009 }
4010
4011 if (CUR == '>') {
4012 NEXT;
4013 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004014 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4015 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004016
4017 /*
4018 * end of parsing of this node.
4019 */
4020 if (xmlStrEqual(name, ctxt->name)) {
4021 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004022 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004023 }
4024
4025 /*
4026 * Capture end position and add node
4027 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004028 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004029 node_info.end_pos = ctxt->input->consumed +
4030 (CUR_PTR - ctxt->input->base);
4031 node_info.end_line = ctxt->input->line;
4032 node_info.node = ctxt->node;
4033 xmlParserAddNodeInfo(ctxt, &node_info);
4034 }
4035 return;
4036 }
4037
4038 /*
4039 * Check for an Empty Element from DTD definition
4040 */
4041 if ((info != NULL) && (info->empty)) {
4042 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4043 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004044 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004045 return;
4046 }
4047
4048 /*
4049 * Parse the content of the element:
4050 */
4051 currentNode = xmlStrdup(ctxt->name);
4052 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004053 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004054 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004055 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004056 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004057 if (ctxt->nameNr < depth) break;
4058 }
4059
Owen Taylor3473f882001-02-23 17:55:21 +00004060 /*
4061 * Capture end position and add node
4062 */
4063 if ( currentNode != NULL && ctxt->record_info ) {
4064 node_info.end_pos = ctxt->input->consumed +
4065 (CUR_PTR - ctxt->input->base);
4066 node_info.end_line = ctxt->input->line;
4067 node_info.node = ctxt->node;
4068 xmlParserAddNodeInfo(ctxt, &node_info);
4069 }
William M. Brack76e95df2003-10-18 16:20:14 +00004070 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004071 htmlAutoCloseOnEnd(ctxt);
4072 }
4073
Owen Taylor3473f882001-02-23 17:55:21 +00004074 if (currentNode != NULL)
4075 xmlFree(currentNode);
4076}
4077
4078/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004079 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004080 * @ctxt: an HTML parser context
4081 *
4082 * parse an HTML document (and build a tree if using the standard SAX
4083 * interface).
4084 *
4085 * Returns 0, -1 in case of error. the parser context is augmented
4086 * as a result of the parsing.
4087 */
4088
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004089int
Owen Taylor3473f882001-02-23 17:55:21 +00004090htmlParseDocument(htmlParserCtxtPtr ctxt) {
4091 xmlDtdPtr dtd;
4092
Daniel Veillardd0463562001-10-13 09:15:48 +00004093 xmlInitParser();
4094
Owen Taylor3473f882001-02-23 17:55:21 +00004095 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004096
Daniel Veillarda03e3652004-11-02 18:45:30 +00004097 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4098 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4099 "htmlParseDocument: context error\n", NULL, NULL);
4100 return(XML_ERR_INTERNAL_ERROR);
4101 }
4102 ctxt->html = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004103 GROW;
4104 /*
4105 * SAX: beginning of the document processing.
4106 */
4107 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4108 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4109
4110 /*
4111 * Wipe out everything which is before the first '<'
4112 */
4113 SKIP_BLANKS;
4114 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004115 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4116 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004117 }
4118
4119 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4120 ctxt->sax->startDocument(ctxt->userData);
4121
4122
4123 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004124 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004125 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004126 while (((CUR == '<') && (NXT(1) == '!') &&
4127 (NXT(2) == '-') && (NXT(3) == '-')) ||
4128 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004129 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004130 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004131 SKIP_BLANKS;
4132 }
4133
4134
4135 /*
4136 * Then possibly doc type declaration(s) and more Misc
4137 * (doctypedecl Misc*)?
4138 */
4139 if ((CUR == '<') && (NXT(1) == '!') &&
4140 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4141 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4142 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4143 (UPP(8) == 'E')) {
4144 htmlParseDocTypeDecl(ctxt);
4145 }
4146 SKIP_BLANKS;
4147
4148 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004149 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004150 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004151 while (((CUR == '<') && (NXT(1) == '!') &&
4152 (NXT(2) == '-') && (NXT(3) == '-')) ||
4153 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004154 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004155 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004156 SKIP_BLANKS;
4157 }
4158
4159 /*
4160 * Time to start parsing the tree itself
4161 */
4162 htmlParseContent(ctxt);
4163
4164 /*
4165 * autoclose
4166 */
4167 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004168 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004169
4170
4171 /*
4172 * SAX: end of the document processing.
4173 */
4174 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4175 ctxt->sax->endDocument(ctxt->userData);
4176
4177 if (ctxt->myDoc != NULL) {
4178 dtd = xmlGetIntSubset(ctxt->myDoc);
4179 if (dtd == NULL)
4180 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004181 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004182 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4183 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4184 }
4185 if (! ctxt->wellFormed) return(-1);
4186 return(0);
4187}
4188
4189
4190/************************************************************************
4191 * *
4192 * Parser contexts handling *
4193 * *
4194 ************************************************************************/
4195
4196/**
William M. Brackedb65a72004-02-06 07:36:04 +00004197 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004198 * @ctxt: an HTML parser context
4199 *
4200 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004201 *
4202 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004203 */
4204
Daniel Veillardf403d292003-10-05 13:51:35 +00004205static int
Owen Taylor3473f882001-02-23 17:55:21 +00004206htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4207{
4208 htmlSAXHandler *sax;
4209
Daniel Veillardf403d292003-10-05 13:51:35 +00004210 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004211 memset(ctxt, 0, sizeof(htmlParserCtxt));
4212
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004213 ctxt->dict = xmlDictCreate();
4214 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004215 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4216 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004217 }
Owen Taylor3473f882001-02-23 17:55:21 +00004218 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4219 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004220 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4221 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004222 }
4223 else
4224 memset(sax, 0, sizeof(htmlSAXHandler));
4225
4226 /* Allocate the Input stack */
4227 ctxt->inputTab = (htmlParserInputPtr *)
4228 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4229 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004230 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004231 ctxt->inputNr = 0;
4232 ctxt->inputMax = 0;
4233 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004234 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004235 }
4236 ctxt->inputNr = 0;
4237 ctxt->inputMax = 5;
4238 ctxt->input = NULL;
4239 ctxt->version = NULL;
4240 ctxt->encoding = NULL;
4241 ctxt->standalone = -1;
4242 ctxt->instate = XML_PARSER_START;
4243
4244 /* Allocate the Node stack */
4245 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4246 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004247 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004248 ctxt->nodeNr = 0;
4249 ctxt->nodeMax = 0;
4250 ctxt->node = NULL;
4251 ctxt->inputNr = 0;
4252 ctxt->inputMax = 0;
4253 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004254 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004255 }
4256 ctxt->nodeNr = 0;
4257 ctxt->nodeMax = 10;
4258 ctxt->node = NULL;
4259
4260 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004261 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004262 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004263 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004264 ctxt->nameNr = 0;
4265 ctxt->nameMax = 10;
4266 ctxt->name = NULL;
4267 ctxt->nodeNr = 0;
4268 ctxt->nodeMax = 0;
4269 ctxt->node = NULL;
4270 ctxt->inputNr = 0;
4271 ctxt->inputMax = 0;
4272 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004273 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004274 }
4275 ctxt->nameNr = 0;
4276 ctxt->nameMax = 10;
4277 ctxt->name = NULL;
4278
Daniel Veillard092643b2003-09-25 14:29:29 +00004279 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004280 else {
4281 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004282 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004283 }
4284 ctxt->userData = ctxt;
4285 ctxt->myDoc = NULL;
4286 ctxt->wellFormed = 1;
4287 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004288 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004289 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004290 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004291 ctxt->vctxt.userData = ctxt;
4292 ctxt->vctxt.error = xmlParserValidityError;
4293 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004294 ctxt->record_info = 0;
4295 ctxt->validate = 0;
4296 ctxt->nbChars = 0;
4297 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004298 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004299 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004300 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004301}
4302
4303/**
4304 * htmlFreeParserCtxt:
4305 * @ctxt: an HTML parser context
4306 *
4307 * Free all the memory used by a parser context. However the parsed
4308 * document in ctxt->myDoc is not freed.
4309 */
4310
4311void
4312htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4313{
4314 xmlFreeParserCtxt(ctxt);
4315}
4316
4317/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004318 * htmlNewParserCtxt:
4319 *
4320 * Allocate and initialize a new parser context.
4321 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004322 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004323 */
4324
Daniel Veillard34c647c2006-09-21 06:53:59 +00004325htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004326htmlNewParserCtxt(void)
4327{
4328 xmlParserCtxtPtr ctxt;
4329
4330 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4331 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004332 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004333 return(NULL);
4334 }
4335 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004336 if (htmlInitParserCtxt(ctxt) < 0) {
4337 htmlFreeParserCtxt(ctxt);
4338 return(NULL);
4339 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004340 return(ctxt);
4341}
4342
4343/**
4344 * htmlCreateMemoryParserCtxt:
4345 * @buffer: a pointer to a char array
4346 * @size: the size of the array
4347 *
4348 * Create a parser context for an HTML in-memory document.
4349 *
4350 * Returns the new parser context or NULL
4351 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004352htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004353htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4354 xmlParserCtxtPtr ctxt;
4355 xmlParserInputPtr input;
4356 xmlParserInputBufferPtr buf;
4357
4358 if (buffer == NULL)
4359 return(NULL);
4360 if (size <= 0)
4361 return(NULL);
4362
4363 ctxt = htmlNewParserCtxt();
4364 if (ctxt == NULL)
4365 return(NULL);
4366
4367 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4368 if (buf == NULL) return(NULL);
4369
4370 input = xmlNewInputStream(ctxt);
4371 if (input == NULL) {
4372 xmlFreeParserCtxt(ctxt);
4373 return(NULL);
4374 }
4375
4376 input->filename = NULL;
4377 input->buf = buf;
4378 input->base = input->buf->buffer->content;
4379 input->cur = input->buf->buffer->content;
4380 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4381
4382 inputPush(ctxt, input);
4383 return(ctxt);
4384}
4385
4386/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004387 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004388 * @cur: a pointer to an array of xmlChar
4389 * @encoding: a free form C string describing the HTML document encoding, or NULL
4390 *
4391 * Create a parser context for an HTML document.
4392 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004393 * TODO: check the need to add encoding handling there
4394 *
Owen Taylor3473f882001-02-23 17:55:21 +00004395 * Returns the new parser context or NULL
4396 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004397static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004398htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004399 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004400 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004401
Daniel Veillard1d995272002-07-22 16:43:32 +00004402 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004403 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004404 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004405 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004406 if (ctxt == NULL)
4407 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004408
4409 if (encoding != NULL) {
4410 xmlCharEncoding enc;
4411 xmlCharEncodingHandlerPtr handler;
4412
4413 if (ctxt->input->encoding != NULL)
4414 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004415 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004416
4417 enc = xmlParseCharEncoding(encoding);
4418 /*
4419 * registered set of known encodings
4420 */
4421 if (enc != XML_CHAR_ENCODING_ERROR) {
4422 xmlSwitchEncoding(ctxt, enc);
4423 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004424 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4425 "Unsupported encoding %s\n",
4426 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004427 }
4428 } else {
4429 /*
4430 * fallback for unknown encodings
4431 */
4432 handler = xmlFindCharEncodingHandler((const char *) encoding);
4433 if (handler != NULL) {
4434 xmlSwitchToEncoding(ctxt, handler);
4435 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004436 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4437 "Unsupported encoding %s\n",
4438 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004439 }
4440 }
4441 }
4442 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004443}
4444
Daniel Veillard73b013f2003-09-30 12:36:01 +00004445#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004446/************************************************************************
4447 * *
4448 * Progressive parsing interfaces *
4449 * *
4450 ************************************************************************/
4451
4452/**
4453 * htmlParseLookupSequence:
4454 * @ctxt: an HTML parser context
4455 * @first: the first char to lookup
4456 * @next: the next char to lookup or zero
4457 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004458 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004459 *
4460 * Try to find if a sequence (first, next, third) or just (first next) or
4461 * (first) is available in the input stream.
4462 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4463 * to avoid rescanning sequences of bytes, it DOES change the state of the
4464 * parser, do not use liberally.
4465 * This is basically similar to xmlParseLookupSequence()
4466 *
4467 * Returns the index to the current parsing point if the full sequence
4468 * is available, -1 otherwise.
4469 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004470static int
Owen Taylor3473f882001-02-23 17:55:21 +00004471htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004472 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004473 int base, len;
4474 htmlParserInputPtr in;
4475 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004476 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004477
4478 in = ctxt->input;
4479 if (in == NULL) return(-1);
4480 base = in->cur - in->base;
4481 if (base < 0) return(-1);
4482 if (ctxt->checkIndex > base)
4483 base = ctxt->checkIndex;
4484 if (in->buf == NULL) {
4485 buf = in->base;
4486 len = in->length;
4487 } else {
4488 buf = in->buf->buffer->content;
4489 len = in->buf->buffer->use;
4490 }
4491 /* take into account the sequence length */
4492 if (third) len -= 2;
4493 else if (next) len --;
4494 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004495 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004496 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4497 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4498 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004499 /* do not increment past <! - some people use <!--> */
4500 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004501 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004502 }
4503 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004504 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004505 return(-1);
4506 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4507 (buf[base + 2] == '>')) {
4508 incomment = 0;
4509 base += 2;
4510 }
4511 continue;
4512 }
Owen Taylor3473f882001-02-23 17:55:21 +00004513 if (buf[base] == first) {
4514 if (third != 0) {
4515 if ((buf[base + 1] != next) ||
4516 (buf[base + 2] != third)) continue;
4517 } else if (next != 0) {
4518 if (buf[base + 1] != next) continue;
4519 }
4520 ctxt->checkIndex = 0;
4521#ifdef DEBUG_PUSH
4522 if (next == 0)
4523 xmlGenericError(xmlGenericErrorContext,
4524 "HPP: lookup '%c' found at %d\n",
4525 first, base);
4526 else if (third == 0)
4527 xmlGenericError(xmlGenericErrorContext,
4528 "HPP: lookup '%c%c' found at %d\n",
4529 first, next, base);
4530 else
4531 xmlGenericError(xmlGenericErrorContext,
4532 "HPP: lookup '%c%c%c' found at %d\n",
4533 first, next, third, base);
4534#endif
4535 return(base - (in->cur - in->base));
4536 }
4537 }
4538 ctxt->checkIndex = base;
4539#ifdef DEBUG_PUSH
4540 if (next == 0)
4541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: lookup '%c' failed\n", first);
4543 else if (third == 0)
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: lookup '%c%c' failed\n", first, next);
4546 else
4547 xmlGenericError(xmlGenericErrorContext,
4548 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4549#endif
4550 return(-1);
4551}
4552
4553/**
4554 * htmlParseTryOrFinish:
4555 * @ctxt: an HTML parser context
4556 * @terminate: last chunk indicator
4557 *
4558 * Try to progress on parsing
4559 *
4560 * Returns zero if no parsing was possible
4561 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004562static int
Owen Taylor3473f882001-02-23 17:55:21 +00004563htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4564 int ret = 0;
4565 htmlParserInputPtr in;
4566 int avail = 0;
4567 xmlChar cur, next;
4568
4569#ifdef DEBUG_PUSH
4570 switch (ctxt->instate) {
4571 case XML_PARSER_EOF:
4572 xmlGenericError(xmlGenericErrorContext,
4573 "HPP: try EOF\n"); break;
4574 case XML_PARSER_START:
4575 xmlGenericError(xmlGenericErrorContext,
4576 "HPP: try START\n"); break;
4577 case XML_PARSER_MISC:
4578 xmlGenericError(xmlGenericErrorContext,
4579 "HPP: try MISC\n");break;
4580 case XML_PARSER_COMMENT:
4581 xmlGenericError(xmlGenericErrorContext,
4582 "HPP: try COMMENT\n");break;
4583 case XML_PARSER_PROLOG:
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: try PROLOG\n");break;
4586 case XML_PARSER_START_TAG:
4587 xmlGenericError(xmlGenericErrorContext,
4588 "HPP: try START_TAG\n");break;
4589 case XML_PARSER_CONTENT:
4590 xmlGenericError(xmlGenericErrorContext,
4591 "HPP: try CONTENT\n");break;
4592 case XML_PARSER_CDATA_SECTION:
4593 xmlGenericError(xmlGenericErrorContext,
4594 "HPP: try CDATA_SECTION\n");break;
4595 case XML_PARSER_END_TAG:
4596 xmlGenericError(xmlGenericErrorContext,
4597 "HPP: try END_TAG\n");break;
4598 case XML_PARSER_ENTITY_DECL:
4599 xmlGenericError(xmlGenericErrorContext,
4600 "HPP: try ENTITY_DECL\n");break;
4601 case XML_PARSER_ENTITY_VALUE:
4602 xmlGenericError(xmlGenericErrorContext,
4603 "HPP: try ENTITY_VALUE\n");break;
4604 case XML_PARSER_ATTRIBUTE_VALUE:
4605 xmlGenericError(xmlGenericErrorContext,
4606 "HPP: try ATTRIBUTE_VALUE\n");break;
4607 case XML_PARSER_DTD:
4608 xmlGenericError(xmlGenericErrorContext,
4609 "HPP: try DTD\n");break;
4610 case XML_PARSER_EPILOG:
4611 xmlGenericError(xmlGenericErrorContext,
4612 "HPP: try EPILOG\n");break;
4613 case XML_PARSER_PI:
4614 xmlGenericError(xmlGenericErrorContext,
4615 "HPP: try PI\n");break;
4616 case XML_PARSER_SYSTEM_LITERAL:
4617 xmlGenericError(xmlGenericErrorContext,
4618 "HPP: try SYSTEM_LITERAL\n");break;
4619 }
4620#endif
4621
4622 while (1) {
4623
4624 in = ctxt->input;
4625 if (in == NULL) break;
4626 if (in->buf == NULL)
4627 avail = in->length - (in->cur - in->base);
4628 else
4629 avail = in->buf->buffer->use - (in->cur - in->base);
4630 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004631 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004632 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4633 /*
4634 * SAX: end of the document processing.
4635 */
4636 ctxt->instate = XML_PARSER_EOF;
4637 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4638 ctxt->sax->endDocument(ctxt->userData);
4639 }
4640 }
4641 if (avail < 1)
4642 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004643 cur = in->cur[0];
4644 if (cur == 0) {
4645 SKIP(1);
4646 continue;
4647 }
4648
Owen Taylor3473f882001-02-23 17:55:21 +00004649 switch (ctxt->instate) {
4650 case XML_PARSER_EOF:
4651 /*
4652 * Document parsing is done !
4653 */
4654 goto done;
4655 case XML_PARSER_START:
4656 /*
4657 * Very first chars read from the document flow.
4658 */
4659 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004660 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004661 SKIP_BLANKS;
4662 if (in->buf == NULL)
4663 avail = in->length - (in->cur - in->base);
4664 else
4665 avail = in->buf->buffer->use - (in->cur - in->base);
4666 }
4667 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4668 ctxt->sax->setDocumentLocator(ctxt->userData,
4669 &xmlDefaultSAXLocator);
4670 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4671 (!ctxt->disableSAX))
4672 ctxt->sax->startDocument(ctxt->userData);
4673
4674 cur = in->cur[0];
4675 next = in->cur[1];
4676 if ((cur == '<') && (next == '!') &&
4677 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4678 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4679 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4680 (UPP(8) == 'E')) {
4681 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004682 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004683 goto done;
4684#ifdef DEBUG_PUSH
4685 xmlGenericError(xmlGenericErrorContext,
4686 "HPP: Parsing internal subset\n");
4687#endif
4688 htmlParseDocTypeDecl(ctxt);
4689 ctxt->instate = XML_PARSER_PROLOG;
4690#ifdef DEBUG_PUSH
4691 xmlGenericError(xmlGenericErrorContext,
4692 "HPP: entering PROLOG\n");
4693#endif
4694 } else {
4695 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004696#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004697 xmlGenericError(xmlGenericErrorContext,
4698 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004699#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004700 }
Owen Taylor3473f882001-02-23 17:55:21 +00004701 break;
4702 case XML_PARSER_MISC:
4703 SKIP_BLANKS;
4704 if (in->buf == NULL)
4705 avail = in->length - (in->cur - in->base);
4706 else
4707 avail = in->buf->buffer->use - (in->cur - in->base);
4708 if (avail < 2)
4709 goto done;
4710 cur = in->cur[0];
4711 next = in->cur[1];
4712 if ((cur == '<') && (next == '!') &&
4713 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4714 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004715 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004716 goto done;
4717#ifdef DEBUG_PUSH
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: Parsing Comment\n");
4720#endif
4721 htmlParseComment(ctxt);
4722 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004723 } else if ((cur == '<') && (next == '?')) {
4724 if ((!terminate) &&
4725 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4726 goto done;
4727#ifdef DEBUG_PUSH
4728 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: Parsing PI\n");
4730#endif
4731 htmlParsePI(ctxt);
4732 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004733 } else if ((cur == '<') && (next == '!') &&
4734 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4735 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4736 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4737 (UPP(8) == 'E')) {
4738 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004739 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004740 goto done;
4741#ifdef DEBUG_PUSH
4742 xmlGenericError(xmlGenericErrorContext,
4743 "HPP: Parsing internal subset\n");
4744#endif
4745 htmlParseDocTypeDecl(ctxt);
4746 ctxt->instate = XML_PARSER_PROLOG;
4747#ifdef DEBUG_PUSH
4748 xmlGenericError(xmlGenericErrorContext,
4749 "HPP: entering PROLOG\n");
4750#endif
4751 } else if ((cur == '<') && (next == '!') &&
4752 (avail < 9)) {
4753 goto done;
4754 } else {
4755 ctxt->instate = XML_PARSER_START_TAG;
4756#ifdef DEBUG_PUSH
4757 xmlGenericError(xmlGenericErrorContext,
4758 "HPP: entering START_TAG\n");
4759#endif
4760 }
4761 break;
4762 case XML_PARSER_PROLOG:
4763 SKIP_BLANKS;
4764 if (in->buf == NULL)
4765 avail = in->length - (in->cur - in->base);
4766 else
4767 avail = in->buf->buffer->use - (in->cur - in->base);
4768 if (avail < 2)
4769 goto done;
4770 cur = in->cur[0];
4771 next = in->cur[1];
4772 if ((cur == '<') && (next == '!') &&
4773 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4774 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004775 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004776 goto done;
4777#ifdef DEBUG_PUSH
4778 xmlGenericError(xmlGenericErrorContext,
4779 "HPP: Parsing Comment\n");
4780#endif
4781 htmlParseComment(ctxt);
4782 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004783 } else if ((cur == '<') && (next == '?')) {
4784 if ((!terminate) &&
4785 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4786 goto done;
4787#ifdef DEBUG_PUSH
4788 xmlGenericError(xmlGenericErrorContext,
4789 "HPP: Parsing PI\n");
4790#endif
4791 htmlParsePI(ctxt);
4792 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004793 } else if ((cur == '<') && (next == '!') &&
4794 (avail < 4)) {
4795 goto done;
4796 } else {
4797 ctxt->instate = XML_PARSER_START_TAG;
4798#ifdef DEBUG_PUSH
4799 xmlGenericError(xmlGenericErrorContext,
4800 "HPP: entering START_TAG\n");
4801#endif
4802 }
4803 break;
4804 case XML_PARSER_EPILOG:
4805 if (in->buf == NULL)
4806 avail = in->length - (in->cur - in->base);
4807 else
4808 avail = in->buf->buffer->use - (in->cur - in->base);
4809 if (avail < 1)
4810 goto done;
4811 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004812 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004813 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004814 goto done;
4815 }
4816 if (avail < 2)
4817 goto done;
4818 next = in->cur[1];
4819 if ((cur == '<') && (next == '!') &&
4820 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4821 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004822 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004823 goto done;
4824#ifdef DEBUG_PUSH
4825 xmlGenericError(xmlGenericErrorContext,
4826 "HPP: Parsing Comment\n");
4827#endif
4828 htmlParseComment(ctxt);
4829 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004830 } else if ((cur == '<') && (next == '?')) {
4831 if ((!terminate) &&
4832 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4833 goto done;
4834#ifdef DEBUG_PUSH
4835 xmlGenericError(xmlGenericErrorContext,
4836 "HPP: Parsing PI\n");
4837#endif
4838 htmlParsePI(ctxt);
4839 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004840 } else if ((cur == '<') && (next == '!') &&
4841 (avail < 4)) {
4842 goto done;
4843 } else {
4844 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004845 ctxt->wellFormed = 0;
4846 ctxt->instate = XML_PARSER_EOF;
4847#ifdef DEBUG_PUSH
4848 xmlGenericError(xmlGenericErrorContext,
4849 "HPP: entering EOF\n");
4850#endif
4851 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4852 ctxt->sax->endDocument(ctxt->userData);
4853 goto done;
4854 }
4855 break;
4856 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004857 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004858 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00004859 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004860
4861 if (avail < 2)
4862 goto done;
4863 cur = in->cur[0];
4864 if (cur != '<') {
4865 ctxt->instate = XML_PARSER_CONTENT;
4866#ifdef DEBUG_PUSH
4867 xmlGenericError(xmlGenericErrorContext,
4868 "HPP: entering CONTENT\n");
4869#endif
4870 break;
4871 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004872 if (in->cur[1] == '/') {
4873 ctxt->instate = XML_PARSER_END_TAG;
4874 ctxt->checkIndex = 0;
4875#ifdef DEBUG_PUSH
4876 xmlGenericError(xmlGenericErrorContext,
4877 "HPP: entering END_TAG\n");
4878#endif
4879 break;
4880 }
Owen Taylor3473f882001-02-23 17:55:21 +00004881 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004882 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004883 goto done;
4884
Daniel Veillard597f1c12005-07-03 23:00:18 +00004885 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004886 name = ctxt->name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004887 if (failed ||
Owen Taylor3473f882001-02-23 17:55:21 +00004888 (name == NULL)) {
4889 if (CUR == '>')
4890 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004891 break;
4892 }
Owen Taylor3473f882001-02-23 17:55:21 +00004893
4894 /*
4895 * Lookup the info for that element.
4896 */
4897 info = htmlTagLookup(name);
4898 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004899 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4900 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004901 }
4902
4903 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004904 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004905 */
4906 if ((CUR == '/') && (NXT(1) == '>')) {
4907 SKIP(2);
4908 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4909 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004910 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004911 ctxt->instate = XML_PARSER_CONTENT;
4912#ifdef DEBUG_PUSH
4913 xmlGenericError(xmlGenericErrorContext,
4914 "HPP: entering CONTENT\n");
4915#endif
4916 break;
4917 }
4918
4919 if (CUR == '>') {
4920 NEXT;
4921 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004922 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4923 "Couldn't find end of Start Tag %s\n",
4924 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004925
4926 /*
4927 * end of parsing of this node.
4928 */
4929 if (xmlStrEqual(name, ctxt->name)) {
4930 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004931 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004932 }
4933
4934 ctxt->instate = XML_PARSER_CONTENT;
4935#ifdef DEBUG_PUSH
4936 xmlGenericError(xmlGenericErrorContext,
4937 "HPP: entering CONTENT\n");
4938#endif
4939 break;
4940 }
4941
4942 /*
4943 * Check for an Empty Element from DTD definition
4944 */
4945 if ((info != NULL) && (info->empty)) {
4946 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4947 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00004948 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004949 }
4950 ctxt->instate = XML_PARSER_CONTENT;
4951#ifdef DEBUG_PUSH
4952 xmlGenericError(xmlGenericErrorContext,
4953 "HPP: entering CONTENT\n");
4954#endif
4955 break;
4956 }
4957 case XML_PARSER_CONTENT: {
4958 long cons;
4959 /*
4960 * Handle preparsed entities and charRef
4961 */
4962 if (ctxt->token != 0) {
4963 xmlChar chr[2] = { 0 , 0 } ;
4964
4965 chr[0] = (xmlChar) ctxt->token;
4966 htmlCheckParagraph(ctxt);
4967 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4968 ctxt->sax->characters(ctxt->userData, chr, 1);
4969 ctxt->token = 0;
4970 ctxt->checkIndex = 0;
4971 }
4972 if ((avail == 1) && (terminate)) {
4973 cur = in->cur[0];
4974 if ((cur != '<') && (cur != '&')) {
4975 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004976 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004977 if (ctxt->sax->ignorableWhitespace != NULL)
4978 ctxt->sax->ignorableWhitespace(
4979 ctxt->userData, &cur, 1);
4980 } else {
4981 htmlCheckParagraph(ctxt);
4982 if (ctxt->sax->characters != NULL)
4983 ctxt->sax->characters(
4984 ctxt->userData, &cur, 1);
4985 }
4986 }
4987 ctxt->token = 0;
4988 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004989 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004990 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004991 }
Owen Taylor3473f882001-02-23 17:55:21 +00004992 }
4993 if (avail < 2)
4994 goto done;
4995 cur = in->cur[0];
4996 next = in->cur[1];
4997 cons = ctxt->nbChars;
4998 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4999 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5000 /*
5001 * Handle SCRIPT/STYLE separately
5002 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005003 if (!terminate) {
5004 int idx;
5005 xmlChar val;
5006
5007 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5008 if (idx < 0)
5009 goto done;
5010 val = in->cur[idx + 2];
5011 if (val == 0) /* bad cut of input */
5012 goto done;
5013 }
Owen Taylor3473f882001-02-23 17:55:21 +00005014 htmlParseScript(ctxt);
5015 if ((cur == '<') && (next == '/')) {
5016 ctxt->instate = XML_PARSER_END_TAG;
5017 ctxt->checkIndex = 0;
5018#ifdef DEBUG_PUSH
5019 xmlGenericError(xmlGenericErrorContext,
5020 "HPP: entering END_TAG\n");
5021#endif
5022 break;
5023 }
5024 } else {
5025 /*
5026 * Sometimes DOCTYPE arrives in the middle of the document
5027 */
5028 if ((cur == '<') && (next == '!') &&
5029 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5030 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5031 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5032 (UPP(8) == 'E')) {
5033 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005034 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005035 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005036 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5037 "Misplaced DOCTYPE declaration\n",
5038 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005039 htmlParseDocTypeDecl(ctxt);
5040 } else if ((cur == '<') && (next == '!') &&
5041 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5042 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005043 (htmlParseLookupSequence(
5044 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005045 goto done;
5046#ifdef DEBUG_PUSH
5047 xmlGenericError(xmlGenericErrorContext,
5048 "HPP: Parsing Comment\n");
5049#endif
5050 htmlParseComment(ctxt);
5051 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005052 } else if ((cur == '<') && (next == '?')) {
5053 if ((!terminate) &&
5054 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5055 goto done;
5056#ifdef DEBUG_PUSH
5057 xmlGenericError(xmlGenericErrorContext,
5058 "HPP: Parsing PI\n");
5059#endif
5060 htmlParsePI(ctxt);
5061 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005062 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5063 goto done;
5064 } else if ((cur == '<') && (next == '/')) {
5065 ctxt->instate = XML_PARSER_END_TAG;
5066 ctxt->checkIndex = 0;
5067#ifdef DEBUG_PUSH
5068 xmlGenericError(xmlGenericErrorContext,
5069 "HPP: entering END_TAG\n");
5070#endif
5071 break;
5072 } else if (cur == '<') {
5073 ctxt->instate = XML_PARSER_START_TAG;
5074 ctxt->checkIndex = 0;
5075#ifdef DEBUG_PUSH
5076 xmlGenericError(xmlGenericErrorContext,
5077 "HPP: entering START_TAG\n");
5078#endif
5079 break;
5080 } else if (cur == '&') {
5081 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005082 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005083 goto done;
5084#ifdef DEBUG_PUSH
5085 xmlGenericError(xmlGenericErrorContext,
5086 "HPP: Parsing Reference\n");
5087#endif
5088 /* TODO: check generation of subtrees if noent !!! */
5089 htmlParseReference(ctxt);
5090 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005091 /*
5092 * check that the text sequence is complete
5093 * before handing out the data to the parser
5094 * to avoid problems with erroneous end of
5095 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005096 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005097 if ((!terminate) &&
5098 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5099 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005100 ctxt->checkIndex = 0;
5101#ifdef DEBUG_PUSH
5102 xmlGenericError(xmlGenericErrorContext,
5103 "HPP: Parsing char data\n");
5104#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005105 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005106 }
5107 }
5108 if (cons == ctxt->nbChars) {
5109 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005110 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5111 "detected an error in element content\n",
5112 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005113 }
5114 NEXT;
5115 break;
5116 }
5117
5118 break;
5119 }
5120 case XML_PARSER_END_TAG:
5121 if (avail < 2)
5122 goto done;
5123 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005124 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005125 goto done;
5126 htmlParseEndTag(ctxt);
5127 if (ctxt->nameNr == 0) {
5128 ctxt->instate = XML_PARSER_EPILOG;
5129 } else {
5130 ctxt->instate = XML_PARSER_CONTENT;
5131 }
5132 ctxt->checkIndex = 0;
5133#ifdef DEBUG_PUSH
5134 xmlGenericError(xmlGenericErrorContext,
5135 "HPP: entering CONTENT\n");
5136#endif
5137 break;
5138 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005139 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5140 "HPP: internal error, state == CDATA\n",
5141 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005142 ctxt->instate = XML_PARSER_CONTENT;
5143 ctxt->checkIndex = 0;
5144#ifdef DEBUG_PUSH
5145 xmlGenericError(xmlGenericErrorContext,
5146 "HPP: entering CONTENT\n");
5147#endif
5148 break;
5149 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005150 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5151 "HPP: internal error, state == DTD\n",
5152 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005153 ctxt->instate = XML_PARSER_CONTENT;
5154 ctxt->checkIndex = 0;
5155#ifdef DEBUG_PUSH
5156 xmlGenericError(xmlGenericErrorContext,
5157 "HPP: entering CONTENT\n");
5158#endif
5159 break;
5160 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005161 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5162 "HPP: internal error, state == COMMENT\n",
5163 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005164 ctxt->instate = XML_PARSER_CONTENT;
5165 ctxt->checkIndex = 0;
5166#ifdef DEBUG_PUSH
5167 xmlGenericError(xmlGenericErrorContext,
5168 "HPP: entering CONTENT\n");
5169#endif
5170 break;
5171 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005172 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5173 "HPP: internal error, state == PI\n",
5174 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005175 ctxt->instate = XML_PARSER_CONTENT;
5176 ctxt->checkIndex = 0;
5177#ifdef DEBUG_PUSH
5178 xmlGenericError(xmlGenericErrorContext,
5179 "HPP: entering CONTENT\n");
5180#endif
5181 break;
5182 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005183 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5184 "HPP: internal error, state == ENTITY_DECL\n",
5185 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005186 ctxt->instate = XML_PARSER_CONTENT;
5187 ctxt->checkIndex = 0;
5188#ifdef DEBUG_PUSH
5189 xmlGenericError(xmlGenericErrorContext,
5190 "HPP: entering CONTENT\n");
5191#endif
5192 break;
5193 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005194 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5195 "HPP: internal error, state == ENTITY_VALUE\n",
5196 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005197 ctxt->instate = XML_PARSER_CONTENT;
5198 ctxt->checkIndex = 0;
5199#ifdef DEBUG_PUSH
5200 xmlGenericError(xmlGenericErrorContext,
5201 "HPP: entering DTD\n");
5202#endif
5203 break;
5204 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005205 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5206 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5207 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005208 ctxt->instate = XML_PARSER_START_TAG;
5209 ctxt->checkIndex = 0;
5210#ifdef DEBUG_PUSH
5211 xmlGenericError(xmlGenericErrorContext,
5212 "HPP: entering START_TAG\n");
5213#endif
5214 break;
5215 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005216 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5217 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5218 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005219 ctxt->instate = XML_PARSER_CONTENT;
5220 ctxt->checkIndex = 0;
5221#ifdef DEBUG_PUSH
5222 xmlGenericError(xmlGenericErrorContext,
5223 "HPP: entering CONTENT\n");
5224#endif
5225 break;
5226 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005227 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5228 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5229 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005230 ctxt->instate = XML_PARSER_CONTENT;
5231 ctxt->checkIndex = 0;
5232#ifdef DEBUG_PUSH
5233 xmlGenericError(xmlGenericErrorContext,
5234 "HPP: entering CONTENT\n");
5235#endif
5236 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005237 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005238 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5239 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5240 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005241 ctxt->instate = XML_PARSER_CONTENT;
5242 ctxt->checkIndex = 0;
5243#ifdef DEBUG_PUSH
5244 xmlGenericError(xmlGenericErrorContext,
5245 "HPP: entering CONTENT\n");
5246#endif
5247 break;
5248
Owen Taylor3473f882001-02-23 17:55:21 +00005249 }
5250 }
5251done:
5252 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005253 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005254 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5255 /*
5256 * SAX: end of the document processing.
5257 */
5258 ctxt->instate = XML_PARSER_EOF;
5259 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5260 ctxt->sax->endDocument(ctxt->userData);
5261 }
5262 }
5263 if ((ctxt->myDoc != NULL) &&
5264 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5265 (ctxt->instate == XML_PARSER_EPILOG))) {
5266 xmlDtdPtr dtd;
5267 dtd = xmlGetIntSubset(ctxt->myDoc);
5268 if (dtd == NULL)
5269 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005270 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005271 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5272 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5273 }
5274#ifdef DEBUG_PUSH
5275 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5276#endif
5277 return(ret);
5278}
5279
5280/**
Owen Taylor3473f882001-02-23 17:55:21 +00005281 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005282 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005283 * @chunk: an char array
5284 * @size: the size in byte of the chunk
5285 * @terminate: last chunk indicator
5286 *
5287 * Parse a Chunk of memory
5288 *
5289 * Returns zero if no error, the xmlParserErrors otherwise.
5290 */
5291int
5292htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5293 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005294 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5295 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5296 "htmlParseChunk: context error\n", NULL, NULL);
5297 return(XML_ERR_INTERNAL_ERROR);
5298 }
Owen Taylor3473f882001-02-23 17:55:21 +00005299 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5300 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5301 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5302 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005303 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005304
Daniel Veillardd2755a82005-08-07 23:42:39 +00005305 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5306 if (res < 0) {
5307 ctxt->errNo = XML_PARSER_EOF;
5308 ctxt->disableSAX = 1;
5309 return (XML_PARSER_EOF);
5310 }
Owen Taylor3473f882001-02-23 17:55:21 +00005311 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5312 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005313 ctxt->input->end =
5314 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005315#ifdef DEBUG_PUSH
5316 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5317#endif
5318
Daniel Veillard14f752c2003-08-09 11:44:50 +00005319#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005320 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5321 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005322#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005323 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005324 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5325 xmlParserInputBufferPtr in = ctxt->input->buf;
5326 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5327 (in->raw != NULL)) {
5328 int nbchars;
5329
5330 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5331 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005332 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5333 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005334 return(XML_ERR_INVALID_ENCODING);
5335 }
5336 }
5337 }
Owen Taylor3473f882001-02-23 17:55:21 +00005338 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005339 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005340 if (terminate) {
5341 if ((ctxt->instate != XML_PARSER_EOF) &&
5342 (ctxt->instate != XML_PARSER_EPILOG) &&
5343 (ctxt->instate != XML_PARSER_MISC)) {
5344 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005345 ctxt->wellFormed = 0;
5346 }
5347 if (ctxt->instate != XML_PARSER_EOF) {
5348 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5349 ctxt->sax->endDocument(ctxt->userData);
5350 }
5351 ctxt->instate = XML_PARSER_EOF;
5352 }
5353 return((xmlParserErrors) ctxt->errNo);
5354}
5355
5356/************************************************************************
5357 * *
5358 * User entry points *
5359 * *
5360 ************************************************************************/
5361
5362/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005363 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005364 * @sax: a SAX handler
5365 * @user_data: The user data returned on SAX callbacks
5366 * @chunk: a pointer to an array of chars
5367 * @size: number of chars in the array
5368 * @filename: an optional file name or URI
5369 * @enc: an optional encoding
5370 *
5371 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005372 * The value of @filename is used for fetching external entities
5373 * and error/warning reports.
5374 *
5375 * Returns the new parser context or NULL
5376 */
5377htmlParserCtxtPtr
5378htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5379 const char *chunk, int size, const char *filename,
5380 xmlCharEncoding enc) {
5381 htmlParserCtxtPtr ctxt;
5382 htmlParserInputPtr inputStream;
5383 xmlParserInputBufferPtr buf;
5384
Daniel Veillardd0463562001-10-13 09:15:48 +00005385 xmlInitParser();
5386
Owen Taylor3473f882001-02-23 17:55:21 +00005387 buf = xmlAllocParserInputBuffer(enc);
5388 if (buf == NULL) return(NULL);
5389
Daniel Veillardf403d292003-10-05 13:51:35 +00005390 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005391 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005392 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005393 return(NULL);
5394 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005395 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5396 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005397 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005398 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005399 xmlFree(ctxt->sax);
5400 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5401 if (ctxt->sax == NULL) {
5402 xmlFree(buf);
5403 xmlFree(ctxt);
5404 return(NULL);
5405 }
5406 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5407 if (user_data != NULL)
5408 ctxt->userData = user_data;
5409 }
5410 if (filename == NULL) {
5411 ctxt->directory = NULL;
5412 } else {
5413 ctxt->directory = xmlParserGetDirectory(filename);
5414 }
5415
5416 inputStream = htmlNewInputStream(ctxt);
5417 if (inputStream == NULL) {
5418 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005419 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005420 return(NULL);
5421 }
5422
5423 if (filename == NULL)
5424 inputStream->filename = NULL;
5425 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005426 inputStream->filename = (char *)
5427 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005428 inputStream->buf = buf;
5429 inputStream->base = inputStream->buf->buffer->content;
5430 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005431 inputStream->end =
5432 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005433
5434 inputPush(ctxt, inputStream);
5435
5436 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5437 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005438 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5439 int cur = ctxt->input->cur - ctxt->input->base;
5440
Owen Taylor3473f882001-02-23 17:55:21 +00005441 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005442
5443 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5444 ctxt->input->cur = ctxt->input->base + cur;
5445 ctxt->input->end =
5446 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005447#ifdef DEBUG_PUSH
5448 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5449#endif
5450 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005451 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005452
5453 return(ctxt);
5454}
William M. Brack21e4ef22005-01-02 09:53:13 +00005455#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005456
5457/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005458 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005459 * @cur: a pointer to an array of xmlChar
5460 * @encoding: a free form C string describing the HTML document encoding, or NULL
5461 * @sax: the SAX handler block
5462 * @userData: if using SAX, this pointer will be provided on callbacks.
5463 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005464 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5465 * to handle parse events. If sax is NULL, fallback to the default DOM
5466 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005467 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005468 * Returns the resulting document tree unless SAX is NULL or the document is
5469 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005470 */
5471
5472htmlDocPtr
5473htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5474 htmlDocPtr ret;
5475 htmlParserCtxtPtr ctxt;
5476
Daniel Veillardd0463562001-10-13 09:15:48 +00005477 xmlInitParser();
5478
Owen Taylor3473f882001-02-23 17:55:21 +00005479 if (cur == NULL) return(NULL);
5480
5481
5482 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5483 if (ctxt == NULL) return(NULL);
5484 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005485 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005486 ctxt->sax = sax;
5487 ctxt->userData = userData;
5488 }
5489
5490 htmlParseDocument(ctxt);
5491 ret = ctxt->myDoc;
5492 if (sax != NULL) {
5493 ctxt->sax = NULL;
5494 ctxt->userData = NULL;
5495 }
5496 htmlFreeParserCtxt(ctxt);
5497
5498 return(ret);
5499}
5500
5501/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005502 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005503 * @cur: a pointer to an array of xmlChar
5504 * @encoding: a free form C string describing the HTML document encoding, or NULL
5505 *
5506 * parse an HTML in-memory document and build a tree.
5507 *
5508 * Returns the resulting document tree
5509 */
5510
5511htmlDocPtr
5512htmlParseDoc(xmlChar *cur, const char *encoding) {
5513 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5514}
5515
5516
5517/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005518 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005519 * @filename: the filename
5520 * @encoding: a free form C string describing the HTML document encoding, or NULL
5521 *
5522 * Create a parser context for a file content.
5523 * Automatic support for ZLIB/Compress compressed document is provided
5524 * by default if found at compile-time.
5525 *
5526 * Returns the new parser context or NULL
5527 */
5528htmlParserCtxtPtr
5529htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5530{
5531 htmlParserCtxtPtr ctxt;
5532 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005533 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005534 /* htmlCharEncoding enc; */
5535 xmlChar *content, *content_line = (xmlChar *) "charset=";
5536
Daniel Veillarda03e3652004-11-02 18:45:30 +00005537 if (filename == NULL)
5538 return(NULL);
5539
Daniel Veillardf403d292003-10-05 13:51:35 +00005540 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005541 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005542 return(NULL);
5543 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005544 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5545 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005546#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005547 if (xmlDefaultSAXHandler.error != NULL) {
5548 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5549 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005550#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005551 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005552 return(NULL);
5553 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005554
5555 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5556 xmlFree(canonicFilename);
5557 if (inputStream == NULL) {
5558 xmlFreeParserCtxt(ctxt);
5559 return(NULL);
5560 }
Owen Taylor3473f882001-02-23 17:55:21 +00005561
5562 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005563
Owen Taylor3473f882001-02-23 17:55:21 +00005564 /* set encoding */
5565 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005566 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005567 if (content) {
5568 strcpy ((char *)content, (char *)content_line);
5569 strcat ((char *)content, (char *)encoding);
5570 htmlCheckEncoding (ctxt, content);
5571 xmlFree (content);
5572 }
5573 }
5574
5575 return(ctxt);
5576}
5577
5578/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005579 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005580 * @filename: the filename
5581 * @encoding: a free form C string describing the HTML document encoding, or NULL
5582 * @sax: the SAX handler block
5583 * @userData: if using SAX, this pointer will be provided on callbacks.
5584 *
5585 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5586 * compressed document is provided by default if found at compile-time.
5587 * It use the given SAX function block to handle the parsing callback.
5588 * If sax is NULL, fallback to the default DOM tree building routines.
5589 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005590 * Returns the resulting document tree unless SAX is NULL or the document is
5591 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005592 */
5593
5594htmlDocPtr
5595htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5596 void *userData) {
5597 htmlDocPtr ret;
5598 htmlParserCtxtPtr ctxt;
5599 htmlSAXHandlerPtr oldsax = NULL;
5600
Daniel Veillardd0463562001-10-13 09:15:48 +00005601 xmlInitParser();
5602
Owen Taylor3473f882001-02-23 17:55:21 +00005603 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5604 if (ctxt == NULL) return(NULL);
5605 if (sax != NULL) {
5606 oldsax = ctxt->sax;
5607 ctxt->sax = sax;
5608 ctxt->userData = userData;
5609 }
5610
5611 htmlParseDocument(ctxt);
5612
5613 ret = ctxt->myDoc;
5614 if (sax != NULL) {
5615 ctxt->sax = oldsax;
5616 ctxt->userData = NULL;
5617 }
5618 htmlFreeParserCtxt(ctxt);
5619
5620 return(ret);
5621}
5622
5623/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005624 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005625 * @filename: the filename
5626 * @encoding: a free form C string describing the HTML document encoding, or NULL
5627 *
5628 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5629 * compressed document is provided by default if found at compile-time.
5630 *
5631 * Returns the resulting document tree
5632 */
5633
5634htmlDocPtr
5635htmlParseFile(const char *filename, const char *encoding) {
5636 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5637}
5638
5639/**
5640 * htmlHandleOmittedElem:
5641 * @val: int 0 or 1
5642 *
5643 * Set and return the previous value for handling HTML omitted tags.
5644 *
5645 * Returns the last value for 0 for no handling, 1 for auto insertion.
5646 */
5647
5648int
5649htmlHandleOmittedElem(int val) {
5650 int old = htmlOmittedDefaultValue;
5651
5652 htmlOmittedDefaultValue = val;
5653 return(old);
5654}
5655
Daniel Veillard930dfb62003-02-05 10:17:38 +00005656/**
5657 * htmlElementAllowedHere:
5658 * @parent: HTML parent element
5659 * @elt: HTML element
5660 *
5661 * Checks whether an HTML element may be a direct child of a parent element.
5662 * Note - doesn't check for deprecated elements
5663 *
5664 * Returns 1 if allowed; 0 otherwise.
5665 */
5666int
5667htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5668 const char** p ;
5669
5670 if ( ! elt || ! parent || ! parent->subelts )
5671 return 0 ;
5672
5673 for ( p = parent->subelts; *p; ++p )
5674 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5675 return 1 ;
5676
5677 return 0 ;
5678}
5679/**
5680 * htmlElementStatusHere:
5681 * @parent: HTML parent element
5682 * @elt: HTML element
5683 *
5684 * Checks whether an HTML element may be a direct child of a parent element.
5685 * and if so whether it is valid or deprecated.
5686 *
5687 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5688 */
5689htmlStatus
5690htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5691 if ( ! parent || ! elt )
5692 return HTML_INVALID ;
5693 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5694 return HTML_INVALID ;
5695
5696 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5697}
5698/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005699 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005700 * @elt: HTML element
5701 * @attr: HTML attribute
5702 * @legacy: whether to allow deprecated attributes
5703 *
5704 * Checks whether an attribute is valid for an element
5705 * Has full knowledge of Required and Deprecated attributes
5706 *
5707 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5708 */
5709htmlStatus
5710htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5711 const char** p ;
5712
5713 if ( !elt || ! attr )
5714 return HTML_INVALID ;
5715
5716 if ( elt->attrs_req )
5717 for ( p = elt->attrs_req; *p; ++p)
5718 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5719 return HTML_REQUIRED ;
5720
5721 if ( elt->attrs_opt )
5722 for ( p = elt->attrs_opt; *p; ++p)
5723 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5724 return HTML_VALID ;
5725
5726 if ( legacy && elt->attrs_depr )
5727 for ( p = elt->attrs_depr; *p; ++p)
5728 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5729 return HTML_DEPRECATED ;
5730
5731 return HTML_INVALID ;
5732}
5733/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005734 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005735 * @node: an htmlNodePtr in a tree
5736 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005737 * for Element nodes)
5738 *
5739 * Checks whether the tree node is valid. Experimental (the author
5740 * only uses the HTML enhancements in a SAX parser)
5741 *
5742 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5743 * legacy allowed) or htmlElementStatusHere (otherwise).
5744 * for Attribute nodes, a return from htmlAttrAllowed
5745 * for other nodes, HTML_NA (no checks performed)
5746 */
5747htmlStatus
5748htmlNodeStatus(const htmlNodePtr node, int legacy) {
5749 if ( ! node )
5750 return HTML_INVALID ;
5751
5752 switch ( node->type ) {
5753 case XML_ELEMENT_NODE:
5754 return legacy
5755 ? ( htmlElementAllowedHere (
5756 htmlTagLookup(node->parent->name) , node->name
5757 ) ? HTML_VALID : HTML_INVALID )
5758 : htmlElementStatusHere(
5759 htmlTagLookup(node->parent->name) ,
5760 htmlTagLookup(node->name) )
5761 ;
5762 case XML_ATTRIBUTE_NODE:
5763 return htmlAttrAllowed(
5764 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5765 default: return HTML_NA ;
5766 }
5767}
Daniel Veillard9475a352003-09-26 12:47:50 +00005768/************************************************************************
5769 * *
5770 * New set (2.6.0) of simpler and more flexible APIs *
5771 * *
5772 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. A NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an if/else without braces;
 * the caller supplies the terminating semicolon.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
5784
5785/**
5786 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005787 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005788 *
5789 * Reset a parser context
5790 */
5791void
5792htmlCtxtReset(htmlParserCtxtPtr ctxt)
5793{
5794 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005795 xmlDictPtr dict;
5796
5797 if (ctxt == NULL)
5798 return;
5799
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005800 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005801 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005802
5803 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5804 xmlFreeInputStream(input);
5805 }
5806 ctxt->inputNr = 0;
5807 ctxt->input = NULL;
5808
5809 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005810 if (ctxt->spaceTab != NULL) {
5811 ctxt->spaceTab[0] = -1;
5812 ctxt->space = &ctxt->spaceTab[0];
5813 } else {
5814 ctxt->space = NULL;
5815 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005816
5817
5818 ctxt->nodeNr = 0;
5819 ctxt->node = NULL;
5820
5821 ctxt->nameNr = 0;
5822 ctxt->name = NULL;
5823
5824 DICT_FREE(ctxt->version);
5825 ctxt->version = NULL;
5826 DICT_FREE(ctxt->encoding);
5827 ctxt->encoding = NULL;
5828 DICT_FREE(ctxt->directory);
5829 ctxt->directory = NULL;
5830 DICT_FREE(ctxt->extSubURI);
5831 ctxt->extSubURI = NULL;
5832 DICT_FREE(ctxt->extSubSystem);
5833 ctxt->extSubSystem = NULL;
5834 if (ctxt->myDoc != NULL)
5835 xmlFreeDoc(ctxt->myDoc);
5836 ctxt->myDoc = NULL;
5837
5838 ctxt->standalone = -1;
5839 ctxt->hasExternalSubset = 0;
5840 ctxt->hasPErefs = 0;
5841 ctxt->html = 1;
5842 ctxt->external = 0;
5843 ctxt->instate = XML_PARSER_START;
5844 ctxt->token = 0;
5845
5846 ctxt->wellFormed = 1;
5847 ctxt->nsWellFormed = 1;
5848 ctxt->valid = 1;
5849 ctxt->vctxt.userData = ctxt;
5850 ctxt->vctxt.error = xmlParserValidityError;
5851 ctxt->vctxt.warning = xmlParserValidityWarning;
5852 ctxt->record_info = 0;
5853 ctxt->nbChars = 0;
5854 ctxt->checkIndex = 0;
5855 ctxt->inSubset = 0;
5856 ctxt->errNo = XML_ERR_OK;
5857 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00005858 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00005859 ctxt->catalogs = NULL;
5860 xmlInitNodeInfoSeq(&ctxt->node_seq);
5861
5862 if (ctxt->attsDefault != NULL) {
5863 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5864 ctxt->attsDefault = NULL;
5865 }
5866 if (ctxt->attsSpecial != NULL) {
5867 xmlHashFree(ctxt->attsSpecial, NULL);
5868 ctxt->attsSpecial = NULL;
5869 }
5870}
5871
5872/**
5873 * htmlCtxtUseOptions:
5874 * @ctxt: an HTML parser context
5875 * @options: a combination of htmlParserOption(s)
5876 *
5877 * Applies the options to the parser context
5878 *
5879 * Returns 0 in case of success, the set of unknown or unimplemented options
5880 * in case of error.
5881 */
5882int
5883htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5884{
Daniel Veillarda03e3652004-11-02 18:45:30 +00005885 if (ctxt == NULL)
5886 return(-1);
5887
Daniel Veillard9475a352003-09-26 12:47:50 +00005888 if (options & HTML_PARSE_NOWARNING) {
5889 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005890 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005891 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005892 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005893 }
5894 if (options & HTML_PARSE_NOERROR) {
5895 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005896 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005897 ctxt->sax->fatalError = NULL;
5898 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005899 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005900 }
5901 if (options & HTML_PARSE_PEDANTIC) {
5902 ctxt->pedantic = 1;
5903 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005904 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005905 } else
5906 ctxt->pedantic = 0;
5907 if (options & XML_PARSE_NOBLANKS) {
5908 ctxt->keepBlanks = 0;
5909 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5910 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005911 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005912 } else
5913 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005914 if (options & HTML_PARSE_RECOVER) {
5915 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00005916 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00005917 } else
5918 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00005919 if (options & HTML_PARSE_COMPACT) {
5920 ctxt->options |= HTML_PARSE_COMPACT;
5921 options -= HTML_PARSE_COMPACT;
5922 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005923 ctxt->dictNames = 0;
5924 return (options);
5925}
5926
5927/**
5928 * htmlDoRead:
5929 * @ctxt: an HTML parser context
5930 * @URL: the base URL to use for the document
5931 * @encoding: the document encoding, or NULL
5932 * @options: a combination of htmlParserOption(s)
5933 * @reuse: keep the context for reuse
5934 *
5935 * Common front-end for the htmlRead functions
5936 *
5937 * Returns the resulting document tree or NULL
5938 */
5939static htmlDocPtr
5940htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5941 int options, int reuse)
5942{
5943 htmlDocPtr ret;
5944
5945 htmlCtxtUseOptions(ctxt, options);
5946 ctxt->html = 1;
5947 if (encoding != NULL) {
5948 xmlCharEncodingHandlerPtr hdlr;
5949
5950 hdlr = xmlFindCharEncodingHandler(encoding);
5951 if (hdlr != NULL)
5952 xmlSwitchToEncoding(ctxt, hdlr);
5953 }
5954 if ((URL != NULL) && (ctxt->input != NULL) &&
5955 (ctxt->input->filename == NULL))
5956 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5957 htmlParseDocument(ctxt);
5958 ret = ctxt->myDoc;
5959 ctxt->myDoc = NULL;
5960 if (!reuse) {
5961 if ((ctxt->dictNames) &&
5962 (ret != NULL) &&
5963 (ret->dict == ctxt->dict))
5964 ctxt->dict = NULL;
5965 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005966 }
5967 return (ret);
5968}
5969
5970/**
5971 * htmlReadDoc:
5972 * @cur: a pointer to a zero terminated string
5973 * @URL: the base URL to use for the document
5974 * @encoding: the document encoding, or NULL
5975 * @options: a combination of htmlParserOption(s)
5976 *
5977 * parse an XML in-memory document and build a tree.
5978 *
5979 * Returns the resulting document tree
5980 */
5981htmlDocPtr
5982htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5983{
5984 htmlParserCtxtPtr ctxt;
5985
5986 if (cur == NULL)
5987 return (NULL);
5988
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005989 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00005990 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00005991 if (ctxt == NULL)
5992 return (NULL);
5993 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5994}
5995
5996/**
5997 * htmlReadFile:
5998 * @filename: a file or URL
5999 * @encoding: the document encoding, or NULL
6000 * @options: a combination of htmlParserOption(s)
6001 *
6002 * parse an XML file from the filesystem or the network.
6003 *
6004 * Returns the resulting document tree
6005 */
6006htmlDocPtr
6007htmlReadFile(const char *filename, const char *encoding, int options)
6008{
6009 htmlParserCtxtPtr ctxt;
6010
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006011 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006012 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6013 if (ctxt == NULL)
6014 return (NULL);
6015 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6016}
6017
6018/**
6019 * htmlReadMemory:
6020 * @buffer: a pointer to a char array
6021 * @size: the size of the array
6022 * @URL: the base URL to use for the document
6023 * @encoding: the document encoding, or NULL
6024 * @options: a combination of htmlParserOption(s)
6025 *
6026 * parse an XML in-memory document and build a tree.
6027 *
6028 * Returns the resulting document tree
6029 */
6030htmlDocPtr
6031htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6032{
6033 htmlParserCtxtPtr ctxt;
6034
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006035 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006036 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6037 if (ctxt == NULL)
6038 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006039 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006040 if (ctxt->sax != NULL)
6041 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006042 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6043}
6044
6045/**
6046 * htmlReadFd:
6047 * @fd: an open file descriptor
6048 * @URL: the base URL to use for the document
6049 * @encoding: the document encoding, or NULL
6050 * @options: a combination of htmlParserOption(s)
6051 *
6052 * parse an XML from a file descriptor and build a tree.
6053 *
6054 * Returns the resulting document tree
6055 */
6056htmlDocPtr
6057htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6058{
6059 htmlParserCtxtPtr ctxt;
6060 xmlParserInputBufferPtr input;
6061 xmlParserInputPtr stream;
6062
6063 if (fd < 0)
6064 return (NULL);
6065
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006066 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006067 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6068 if (input == NULL)
6069 return (NULL);
6070 ctxt = xmlNewParserCtxt();
6071 if (ctxt == NULL) {
6072 xmlFreeParserInputBuffer(input);
6073 return (NULL);
6074 }
6075 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6076 if (stream == NULL) {
6077 xmlFreeParserInputBuffer(input);
6078 xmlFreeParserCtxt(ctxt);
6079 return (NULL);
6080 }
6081 inputPush(ctxt, stream);
6082 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6083}
6084
6085/**
6086 * htmlReadIO:
6087 * @ioread: an I/O read function
6088 * @ioclose: an I/O close function
6089 * @ioctx: an I/O handler
6090 * @URL: the base URL to use for the document
6091 * @encoding: the document encoding, or NULL
6092 * @options: a combination of htmlParserOption(s)
6093 *
6094 * parse an HTML document from I/O functions and source and build a tree.
6095 *
6096 * Returns the resulting document tree
6097 */
6098htmlDocPtr
6099htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6100 void *ioctx, const char *URL, const char *encoding, int options)
6101{
6102 htmlParserCtxtPtr ctxt;
6103 xmlParserInputBufferPtr input;
6104 xmlParserInputPtr stream;
6105
6106 if (ioread == NULL)
6107 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006108 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006109
6110 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6111 XML_CHAR_ENCODING_NONE);
6112 if (input == NULL)
6113 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006114 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006115 if (ctxt == NULL) {
6116 xmlFreeParserInputBuffer(input);
6117 return (NULL);
6118 }
6119 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6120 if (stream == NULL) {
6121 xmlFreeParserInputBuffer(input);
6122 xmlFreeParserCtxt(ctxt);
6123 return (NULL);
6124 }
6125 inputPush(ctxt, stream);
6126 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6127}
6128
6129/**
6130 * htmlCtxtReadDoc:
6131 * @ctxt: an HTML parser context
6132 * @cur: a pointer to a zero terminated string
6133 * @URL: the base URL to use for the document
6134 * @encoding: the document encoding, or NULL
6135 * @options: a combination of htmlParserOption(s)
6136 *
6137 * parse an XML in-memory document and build a tree.
6138 * This reuses the existing @ctxt parser context
6139 *
6140 * Returns the resulting document tree
6141 */
6142htmlDocPtr
6143htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6144 const char *URL, const char *encoding, int options)
6145{
6146 xmlParserInputPtr stream;
6147
6148 if (cur == NULL)
6149 return (NULL);
6150 if (ctxt == NULL)
6151 return (NULL);
6152
6153 htmlCtxtReset(ctxt);
6154
6155 stream = xmlNewStringInputStream(ctxt, cur);
6156 if (stream == NULL) {
6157 return (NULL);
6158 }
6159 inputPush(ctxt, stream);
6160 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6161}
6162
6163/**
6164 * htmlCtxtReadFile:
6165 * @ctxt: an HTML parser context
6166 * @filename: a file or URL
6167 * @encoding: the document encoding, or NULL
6168 * @options: a combination of htmlParserOption(s)
6169 *
6170 * parse an XML file from the filesystem or the network.
6171 * This reuses the existing @ctxt parser context
6172 *
6173 * Returns the resulting document tree
6174 */
6175htmlDocPtr
6176htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6177 const char *encoding, int options)
6178{
6179 xmlParserInputPtr stream;
6180
6181 if (filename == NULL)
6182 return (NULL);
6183 if (ctxt == NULL)
6184 return (NULL);
6185
6186 htmlCtxtReset(ctxt);
6187
Daniel Veillard29614c72004-11-26 10:47:26 +00006188 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006189 if (stream == NULL) {
6190 return (NULL);
6191 }
6192 inputPush(ctxt, stream);
6193 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6194}
6195
6196/**
6197 * htmlCtxtReadMemory:
6198 * @ctxt: an HTML parser context
6199 * @buffer: a pointer to a char array
6200 * @size: the size of the array
6201 * @URL: the base URL to use for the document
6202 * @encoding: the document encoding, or NULL
6203 * @options: a combination of htmlParserOption(s)
6204 *
6205 * parse an XML in-memory document and build a tree.
6206 * This reuses the existing @ctxt parser context
6207 *
6208 * Returns the resulting document tree
6209 */
6210htmlDocPtr
6211htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6212 const char *URL, const char *encoding, int options)
6213{
6214 xmlParserInputBufferPtr input;
6215 xmlParserInputPtr stream;
6216
6217 if (ctxt == NULL)
6218 return (NULL);
6219 if (buffer == NULL)
6220 return (NULL);
6221
6222 htmlCtxtReset(ctxt);
6223
6224 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6225 if (input == NULL) {
6226 return(NULL);
6227 }
6228
6229 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6230 if (stream == NULL) {
6231 xmlFreeParserInputBuffer(input);
6232 return(NULL);
6233 }
6234
6235 inputPush(ctxt, stream);
6236 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6237}
6238
6239/**
6240 * htmlCtxtReadFd:
6241 * @ctxt: an HTML parser context
6242 * @fd: an open file descriptor
6243 * @URL: the base URL to use for the document
6244 * @encoding: the document encoding, or NULL
6245 * @options: a combination of htmlParserOption(s)
6246 *
6247 * parse an XML from a file descriptor and build a tree.
6248 * This reuses the existing @ctxt parser context
6249 *
6250 * Returns the resulting document tree
6251 */
6252htmlDocPtr
6253htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6254 const char *URL, const char *encoding, int options)
6255{
6256 xmlParserInputBufferPtr input;
6257 xmlParserInputPtr stream;
6258
6259 if (fd < 0)
6260 return (NULL);
6261 if (ctxt == NULL)
6262 return (NULL);
6263
6264 htmlCtxtReset(ctxt);
6265
6266
6267 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6268 if (input == NULL)
6269 return (NULL);
6270 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6271 if (stream == NULL) {
6272 xmlFreeParserInputBuffer(input);
6273 return (NULL);
6274 }
6275 inputPush(ctxt, stream);
6276 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6277}
6278
6279/**
6280 * htmlCtxtReadIO:
6281 * @ctxt: an HTML parser context
6282 * @ioread: an I/O read function
6283 * @ioclose: an I/O close function
6284 * @ioctx: an I/O handler
6285 * @URL: the base URL to use for the document
6286 * @encoding: the document encoding, or NULL
6287 * @options: a combination of htmlParserOption(s)
6288 *
6289 * parse an HTML document from I/O functions and source and build a tree.
6290 * This reuses the existing @ctxt parser context
6291 *
6292 * Returns the resulting document tree
6293 */
6294htmlDocPtr
6295htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6296 xmlInputCloseCallback ioclose, void *ioctx,
6297 const char *URL,
6298 const char *encoding, int options)
6299{
6300 xmlParserInputBufferPtr input;
6301 xmlParserInputPtr stream;
6302
6303 if (ioread == NULL)
6304 return (NULL);
6305 if (ctxt == NULL)
6306 return (NULL);
6307
6308 htmlCtxtReset(ctxt);
6309
6310 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6311 XML_CHAR_ENCODING_NONE);
6312 if (input == NULL)
6313 return (NULL);
6314 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6315 if (stream == NULL) {
6316 xmlFreeParserInputBuffer(input);
6317 return (NULL);
6318 }
6319 inputPush(ctxt, stream);
6320 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6321}
6322
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006323#define bottom_HTMLparser
6324#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006325#endif /* LIBXML_HTML_ENABLED */