blob: 3a03a3eb38dce64b48b269812b5f965d7967d900 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size constants for the HTML parser.
 * NOTE(review): their users are outside this section; presumably they
 * bound name lengths and scratch buffers - confirm against the callers.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Debugging switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/* File-local flag, initialised to 1 (its readers are not in this section). */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations of routines defined later in the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
208/*
209 * Macros for accessing the content. Those should be used only by the parser,
210 * and not exported.
211 *
212 * Dirty macros, i.e. one need to make assumption on the context to use them
213 *
214 * CUR_PTR return the current pointer to the xmlChar to be parsed.
215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217 * in UNICODE mode. This should be used internally by the parser
218 * only to compare to ASCII values otherwise it would break when
219 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223 * it should be used only to compare on ASCII based substring.
224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000225 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000226 *
227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
228 *
229 * CURRENT Returns the current char value, with the full decoding of
230 * UTF-8 if we are using this mode. It returns an int.
231 * NEXT Skip to the next character, this does the proper decoding
232 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000233 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
235 */
236
/*
 * All the macros below expect a parser context named "ctxt" to be in
 * scope at the point of use.
 */

/* Current input byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input by val bytes, adding val to both the character
 * counter and the column; the line number is not touched.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current position in the input. */
#define CUR_PTR ctxt->input->cur

/*
 * Call xmlParserInputShrink() when more than 2 * INPUT_CHUNK bytes lie
 * before the current position and fewer than 2 * INPUT_CHUNK remain.
 * Expands to a bare "if" statement: use it only as a full statement.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

/*
 * Call xmlParserInputGrow() when the context is not progressive and
 * fewer than INPUT_CHUNK bytes remain.
 * Expands to a bare "if" statement: use it only as a full statement.
 */
#define GROW if ((ctxt->progressive == 0) && \
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int (no multi-byte decoding). */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters; evaluates to the count returned by htmlSkipBlankChars(). */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
/* Move to the next character through xmlNextChar(). */
#define NEXT xmlNextChar(ctxt)

/* Current input byte, or -1 while ctxt->token is set. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* NXT and CUR_PTR are defined a second time here, identically to above. */
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/*
 * Consume one character that is l bytes long: update line/column
 * (a newline starts a new line at column 1), clear ctxt->token,
 * advance the input by l bytes and count one more character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Current character through htmlCurrentChar(); its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Current character of string s through xmlStringCurrentChar(); length stored in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v, which is l bytes long, to buffer b at index i and
 * advance i: a plain byte store when l is 1, xmlCopyChar() otherwise.
 * Expands to a bare if/else: use it only as a full statement.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
289
290/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200291 * htmlFindEncoding:
292 * @the HTML parser context
293 *
294 * Ty to find and encoding in the current data available in the input
295 * buffer this is needed to try to switch to the proper encoding when
296 * one face a character error.
297 * That's an heuristic, since it's operating outside of parsing it could
298 * try to use a meta which had been commented out, that's the reason it
299 * should only be used in case of error, not as a default.
300 *
301 * Returns an encoding string or NULL if not found, the string need to
302 * be freed
303 */
304static xmlChar *
305htmlFindEncoding(xmlParserCtxtPtr ctxt) {
306 const xmlChar *start, *cur, *end;
307
308 if ((ctxt == NULL) || (ctxt->input == NULL) ||
309 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
310 (ctxt->input->buf->encoder != NULL))
311 return(NULL);
312 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
313 return(NULL);
314
315 start = ctxt->input->cur;
316 end = ctxt->input->end;
317 /* we also expect the input buffer to be zero terminated */
318 if (*end != 0)
319 return(NULL);
320
321 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
322 if (cur == NULL)
323 return(NULL);
324 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
325 if (cur == NULL)
326 return(NULL);
327 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
328 if (cur == NULL)
329 return(NULL);
330 cur += 8;
331 start = cur;
332 while (((*cur >= 'A') && (*cur <= 'Z')) ||
333 ((*cur >= 'a') && (*cur <= 'z')) ||
334 ((*cur >= '0') && (*cur <= '9')) ||
335 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
336 cur++;
337 if (cur == start)
338 return(NULL);
339 return(xmlStrndup(start, cur - start));
340}
341
342/**
Owen Taylor3473f882001-02-23 17:55:21 +0000343 * htmlCurrentChar:
344 * @ctxt: the HTML parser context
345 * @len: pointer to the length of the char read
346 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000347 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000348 * bytes in the input buffer. Implement the end of line normalization:
349 * 2.11 End-of-Line Handling
350 * If the encoding is unspecified, in the case we find an ISO-Latin-1
351 * char, then the encoding converter is plugged in automatically.
352 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000353 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000354 */
355
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000356static int
Owen Taylor3473f882001-02-23 17:55:21 +0000357htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
358 if (ctxt->instate == XML_PARSER_EOF)
359 return(0);
360
361 if (ctxt->token != 0) {
362 *len = 0;
363 return(ctxt->token);
364 }
365 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
366 /*
367 * We are supposed to handle UTF8, check it's valid
368 * From rfc2044: encoding of the Unicode values on UTF-8:
369 *
370 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
371 * 0000 0000-0000 007F 0xxxxxxx
372 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
373 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
374 *
375 * Check for the 0x110000 limit too
376 */
377 const unsigned char *cur = ctxt->input->cur;
378 unsigned char c;
379 unsigned int val;
380
381 c = *cur;
382 if (c & 0x80) {
383 if (cur[1] == 0)
384 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
385 if ((cur[1] & 0xc0) != 0x80)
386 goto encoding_error;
387 if ((c & 0xe0) == 0xe0) {
388
389 if (cur[2] == 0)
390 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
391 if ((cur[2] & 0xc0) != 0x80)
392 goto encoding_error;
393 if ((c & 0xf0) == 0xf0) {
394 if (cur[3] == 0)
395 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
396 if (((c & 0xf8) != 0xf0) ||
397 ((cur[3] & 0xc0) != 0x80))
398 goto encoding_error;
399 /* 4-byte code */
400 *len = 4;
401 val = (cur[0] & 0x7) << 18;
402 val |= (cur[1] & 0x3f) << 12;
403 val |= (cur[2] & 0x3f) << 6;
404 val |= cur[3] & 0x3f;
405 } else {
406 /* 3-byte code */
407 *len = 3;
408 val = (cur[0] & 0xf) << 12;
409 val |= (cur[1] & 0x3f) << 6;
410 val |= cur[2] & 0x3f;
411 }
412 } else {
413 /* 2-byte code */
414 *len = 2;
415 val = (cur[0] & 0x1f) << 6;
416 val |= cur[1] & 0x3f;
417 }
418 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000419 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
420 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000421 }
422 return(val);
423 } else {
424 /* 1-byte code */
425 *len = 1;
426 return((int) *ctxt->input->cur);
427 }
428 }
429 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000430 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000431 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000432 * XML constructs only use < 128 chars
433 */
434 *len = 1;
435 if ((int) *ctxt->input->cur < 0x80)
436 return((int) *ctxt->input->cur);
437
438 /*
439 * Humm this is bad, do an automatic flow conversion
440 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200441 {
442 xmlChar * guess;
443 xmlCharEncodingHandlerPtr handler;
444
445 guess = htmlFindEncoding(ctxt);
446 if (guess == NULL) {
447 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
448 } else {
449 if (ctxt->input->encoding != NULL)
450 xmlFree((xmlChar *) ctxt->input->encoding);
451 ctxt->input->encoding = guess;
452 handler = xmlFindCharEncodingHandler((const char *) guess);
453 if (handler != NULL) {
454 xmlSwitchToEncoding(ctxt, handler);
455 } else {
456 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
457 "Unsupported encoding %s", guess, NULL);
458 }
459 }
460 ctxt->charset = XML_CHAR_ENCODING_UTF8;
461 }
462
Owen Taylor3473f882001-02-23 17:55:21 +0000463 return(xmlCurrentChar(ctxt, len));
464
465encoding_error:
466 /*
467 * If we detect an UTF8 error that probably mean that the
468 * input encoding didn't get properly advertized in the
469 * declaration header. Report the error and switch the encoding
470 * to ISO-Latin-1 (if you don't like this policy, just declare the
471 * encoding !)
472 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000473 {
474 char buffer[150];
475
Daniel Veillard861101d2007-06-12 08:38:57 +0000476 if (ctxt->input->end - ctxt->input->cur >= 4) {
477 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
478 ctxt->input->cur[0], ctxt->input->cur[1],
479 ctxt->input->cur[2], ctxt->input->cur[3]);
480 } else {
481 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
482 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000483 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
484 "Input is not proper UTF-8, indicate encoding !\n",
485 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000486 }
487
488 ctxt->charset = XML_CHAR_ENCODING_8859_1;
489 *len = 1;
490 return((int) *ctxt->input->cur);
491}
492
493/**
Owen Taylor3473f882001-02-23 17:55:21 +0000494 * htmlSkipBlankChars:
495 * @ctxt: the HTML parser context
496 *
497 * skip all blanks character found at that point in the input streams.
498 *
499 * Returns the number of space chars skipped
500 */
501
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000502static int
Owen Taylor3473f882001-02-23 17:55:21 +0000503htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
504 int res = 0;
505
William M. Brack76e95df2003-10-18 16:20:14 +0000506 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000507 if ((*ctxt->input->cur == 0) &&
508 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
509 xmlPopInput(ctxt);
510 } else {
511 if (*(ctxt->input->cur) == '\n') {
512 ctxt->input->line++; ctxt->input->col = 1;
513 } else ctxt->input->col++;
514 ctxt->input->cur++;
515 ctxt->nbChars++;
516 if (*ctxt->input->cur == 0)
517 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
518 }
519 res++;
520 }
521 return(res);
522}
523
524
525
526/************************************************************************
527 * *
528 * The list of HTML elements and their properties *
529 * *
530 ************************************************************************/
531
532/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
535 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000536 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000537 * Depr: this element is deprecated
538 * DTD: 1 means that this element is valid only in the Loose DTD
539 * 2 means that this element is valid only in the Frameset DTD
540 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000541 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000542 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000543 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000544
545/* Definitions and a couple of vars for HTML Elements */
546
/*
 * Element name groups.  Each group macro expands to a comma-separated
 * run of string literals and NB_<group> gives the number of names in it.
 * Groups built from other groups MUST separate them with commas:
 * otherwise the last literal of one group and the first literal of the
 * next are adjacent and get concatenated into a single bogus name
 * (e.g. "small" "em" becomes "smallem").  The only exceptions are the
 * empty groups PCDATA and MODIFIER, which contribute no literal.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
570
571
/* NULL-terminated name lists built from the FLOW and INLINE groups */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
/* html_cdata is an alias of the same empty list */
#define html_cdata html_pcdata
578
579
580/* ... and for HTML Attributes */
581
/*
 * Attribute name groups.  Each group macro expands to a comma-separated
 * run of string literals and NB_<group> gives the number of names in it.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Sum of the three sub-groups making up ATTRS. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000594
/* NULL-terminated attribute name lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000599
600
601/* Other declarations that should go inline ... */
/*
 * NULL-terminated lists of attribute names (*_attr, *_attrs, *_depr)
 * and of element names (*_contents, *_param, inline_p).
 * NOTE(review): presumably referenced by the element description table
 * defined later in the file - confirm.
 */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
	"archive", "alt", "name", "height", "width", "align",
	"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
628
629
/* More NULL-terminated attribute and element name lists */
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
637static const char* const fieldset_contents[] = { FLOW, "legend" } ;
/*
 * Remaining NULL-terminated lists of attribute names (*_attr, *_attrs,
 * *_depr) and element names (*_contents, *_content, *_elt).
 */
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
/* MODIFIER expands to nothing, so "FLOW MODIFIER" is simply FLOW */
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000694
695#define DECL (const char**)
696
Daniel Veillard22090732001-07-16 00:06:07 +0000697static const htmlElemDesc
698html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000699{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
700 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
701},
702{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
703 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
704},
705{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
706 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
707},
708{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
709 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
710},
711{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
712 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
713},
714{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
715 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
716},
717{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
718 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
719},
720{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
721 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
722},
723{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
724 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
725},
726{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
727 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
728},
729{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
730 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
731},
732{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
733 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
734},
735{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
736 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
737},
738{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
739 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
740},
741{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
742 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
743},
744{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
745 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
746},
747{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
748 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
749},
750{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
751 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
752},
753{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
754 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
755},
756{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
757 EMPTY , NULL , DECL col_attrs , NULL, NULL
758},
759{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
760 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
761},
762{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
763 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
764},
765{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
766 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
767},
768{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
769 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
770},
771{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
772 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
773},
774{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
775 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
776},
777{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000778 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000779},
780{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
781 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
782},
783{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
784 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
785},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000786{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000787 EMPTY, NULL, DECL embed_attrs, NULL, NULL
788},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000789{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
790 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
791},
792{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
793 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
794},
795{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
796 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
797},
798{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
799 EMPTY, NULL, NULL, DECL frame_attrs, NULL
800},
801{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
802 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
803},
804{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
805 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
806},
807{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
808 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
809},
810{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
814 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
815},
816{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
817 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
818},
819{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
823 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
824},
825{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
826 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
827},
828{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
829 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
830},
831{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
832 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
833},
834{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
835 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
836},
837{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000838 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000839},
840{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
841 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
842},
843{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
844 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
845},
846{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
847 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
848},
849{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
853 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
854},
855{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
856 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
857},
858{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
859 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
860},
861{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
862 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
863},
864{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000865 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000866},
867{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
868 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
869},
870{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
871 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
872},
873{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
874 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
875},
876{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
877 DECL html_flow, "div", DECL html_attrs, NULL, NULL
878},
879{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
880 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
881},
882{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
883 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
884},
885{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000886 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000887},
888{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
889 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
890},
891{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000895 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000896},
897{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
898 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
899},
900{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
901 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
902},
903{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
904 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
905},
906{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
907 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
908},
909{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
910 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
911},
912{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
913 DECL select_content, NULL, DECL select_attrs, NULL, NULL
914},
915{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
916 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
917},
918{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
919 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
920},
921{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
922 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
923},
924{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
925 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
926},
927{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
928 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
929},
930{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
931 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
932},
933{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
934 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
935},
936{ "table", 0, 0, 0, 0, 0, 0, 0, "",
937 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
938},
939{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
940 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
941},
942{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
943 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
944},
945{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
946 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
947},
948{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
949 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
950},
951{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
952 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
953},
954{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
955 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
956},
957{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
958 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
959},
960{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
961 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
962},
963{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
964 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
965},
966{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
967 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
968},
969{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
970 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
971},
972{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
973 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
974}
Owen Taylor3473f882001-02-23 17:55:21 +0000975};
976
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups: the first name of a
 * group is the start tag, the names after it are the open elements that
 * this start tag closes.  A lone NULL after the last group ends the table.
 */
static const char * const htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "frameset",   "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};

/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };

/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).  The array carries no NULL
 * terminator.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};

/*
 * End-tag priorities, used by the HTML parser to recover from broken
 * pages.  Every element name maps to a priority; an end tag is only
 * allowed to close open elements whose priority is lower than or equal
 * to its own, which lets the parser decide what to do with stray end
 * tags.  Names missing from the table get the priority of the final
 * { NULL, ... } entry.
 */

typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};

/* One pointer per group of htmlStartClose, filled by htmlInitAutoClose(). */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;

1108/************************************************************************
1109 * *
1110 * functions to handle HTML specific data *
1111 * *
1112 ************************************************************************/
1113
1114/**
1115 * htmlInitAutoClose:
1116 *
1117 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1118 * This is not reentrant. Call xmlInitParser() once before processing in
1119 * case of use in multithreaded programs.
1120 */
1121void
1122htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001123 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001124
1125 if (htmlStartCloseIndexinitialized) return;
1126
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001127 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1128 indx = 0;
1129 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001130 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001131 while (htmlStartClose[i] != NULL) i++;
1132 i++;
1133 }
1134 htmlStartCloseIndexinitialized = 1;
1135}
1136
1137/**
1138 * htmlTagLookup:
1139 * @tag: The tag name in lowercase
1140 *
1141 * Lookup the HTML tag in the ElementTable
1142 *
1143 * Returns the related htmlElemDescPtr or NULL if not found.
1144 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001145const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001146htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001147 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
1149 for (i = 0; i < (sizeof(html40ElementTable) /
1150 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001151 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001152 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154 return(NULL);
1155}
1156
1157/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001158 * htmlGetEndPriority:
1159 * @name: The name of the element to look up the priority for.
1160 *
1161 * Return value: The "endtag" priority.
1162 **/
1163static int
1164htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001165 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001166
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001167 while ((htmlEndPriority[i].name != NULL) &&
1168 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1169 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001170
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001172}
1173
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001174
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001175/**
Owen Taylor3473f882001-02-23 17:55:21 +00001176 * htmlCheckAutoClose:
1177 * @newtag: The new tag name
1178 * @oldtag: The old tag name
1179 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001180 * Checks whether the new tag is one of the registered valid tags for
1181 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001182 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1183 *
1184 * Returns 0 if no, 1 if yes.
1185 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001186static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1188{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001189 int i, indx;
1190 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001191
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001192 if (htmlStartCloseIndexinitialized == 0)
1193 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001194
1195 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001196 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001197 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 if (closed == NULL)
1199 return (0);
1200 if (xmlStrEqual(BAD_CAST * closed, newtag))
1201 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001202 }
1203
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001204 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001205 i++;
1206 while (htmlStartClose[i] != NULL) {
1207 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001208 return (1);
1209 }
1210 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001211 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001212 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001213}
1214
1215/**
1216 * htmlAutoCloseOnClose:
1217 * @ctxt: an HTML parser context
1218 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001219 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001220 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001221 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001222 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001223static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001224htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1225{
1226 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001227 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001228
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001230
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001231 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001232
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1234 break;
1235 /*
1236 * A missplaced endtag can only close elements with lower
1237 * or equal priority, so if we find an element with higher
1238 * priority before we find an element with
1239 * matching name, we just ignore this endtag
1240 */
1241 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1242 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001243 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 if (i < 0)
1245 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001246
1247 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001249 if ((info != NULL) && (info->endTag == 3)) {
1250 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1251 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001252 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001253 }
1254 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1255 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001256 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001257 }
1258}
1259
1260/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001261 * htmlAutoCloseOnEnd:
1262 * @ctxt: an HTML parser context
1263 *
1264 * Close all remaining tags at the end of the stream
1265 */
1266static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001267htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1268{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001269 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001270
William M. Brack899e64a2003-09-26 18:03:42 +00001271 if (ctxt->nameNr == 0)
1272 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001273 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001274 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1275 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001276 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001277 }
1278}
1279
1280/**
Owen Taylor3473f882001-02-23 17:55:21 +00001281 * htmlAutoClose:
1282 * @ctxt: an HTML parser context
1283 * @newtag: The new tag name or NULL
1284 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001285 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001286 * The list is kept in htmlStartClose array. This function is
1287 * called when a new tag has been detected and generates the
1288 * appropriates closes if possible/needed.
1289 * If newtag is NULL this mean we are at the end of the resource
1290 * and we should check
1291 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001292static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001293htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1294{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001295 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001296 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1298 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001299 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001300 }
1301 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001302 htmlAutoCloseOnEnd(ctxt);
1303 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001304 }
1305 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1307 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1308 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001309 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1310 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001311 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 }
Owen Taylor3473f882001-02-23 17:55:21 +00001313}
1314
1315/**
1316 * htmlAutoCloseTag:
1317 * @doc: the HTML document
1318 * @name: The tag name
1319 * @elem: the HTML element
1320 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001321 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001322 * The list is kept in htmlStartClose array. This function checks
1323 * if the element or one of it's children would autoclose the
1324 * given tag.
1325 *
1326 * Returns 1 if autoclose, 0 otherwise
1327 */
1328int
1329htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1330 htmlNodePtr child;
1331
1332 if (elem == NULL) return(1);
1333 if (xmlStrEqual(name, elem->name)) return(0);
1334 if (htmlCheckAutoClose(elem->name, name)) return(1);
1335 child = elem->children;
1336 while (child != NULL) {
1337 if (htmlAutoCloseTag(doc, name, child)) return(1);
1338 child = child->next;
1339 }
1340 return(0);
1341}
1342
1343/**
1344 * htmlIsAutoClosed:
1345 * @doc: the HTML document
1346 * @elem: the HTML element
1347 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001348 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001349 * The list is kept in htmlStartClose array. This function checks
1350 * if a tag is autoclosed by one of it's child
1351 *
1352 * Returns 1 if autoclosed, 0 otherwise
1353 */
1354int
1355htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1356 htmlNodePtr child;
1357
1358 if (elem == NULL) return(1);
1359 child = elem->children;
1360 while (child != NULL) {
1361 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1362 child = child->next;
1363 }
1364 return(0);
1365}
1366
1367/**
1368 * htmlCheckImplied:
1369 * @ctxt: an HTML parser context
1370 * @newtag: The new tag name
1371 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001372 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001373 * called when a new tag has been detected and generates the
1374 * appropriates implicit tags if missing
1375 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001376static void
Owen Taylor3473f882001-02-23 17:55:21 +00001377htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1378 if (!htmlOmittedDefaultValue)
1379 return;
1380 if (xmlStrEqual(newtag, BAD_CAST"html"))
1381 return;
1382 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1386 }
1387 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1388 return;
1389 if ((ctxt->nameNr <= 1) &&
1390 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1391 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1392 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1393 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1394 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1395 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1396 /*
1397 * dropped OBJECT ... i you put it first BODY will be
1398 * assumed !
1399 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001400 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001401 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1402 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1403 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1404 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1405 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1406 int i;
1407 for (i = 0;i < ctxt->nameNr;i++) {
1408 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1409 return;
1410 }
1411 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1412 return;
1413 }
1414 }
1415
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001416 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001417 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1418 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1419 }
1420}
1421
1422/**
1423 * htmlCheckParagraph
1424 * @ctxt: an HTML parser context
1425 *
1426 * Check whether a p element need to be implied before inserting
1427 * characters in the current element.
1428 *
1429 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1430 * in case of error.
1431 */
1432
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001433static int
Owen Taylor3473f882001-02-23 17:55:21 +00001434htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1435 const xmlChar *tag;
1436 int i;
1437
1438 if (ctxt == NULL)
1439 return(-1);
1440 tag = ctxt->name;
1441 if (tag == NULL) {
1442 htmlAutoClose(ctxt, BAD_CAST"p");
1443 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001444 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001445 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1446 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1447 return(1);
1448 }
1449 if (!htmlOmittedDefaultValue)
1450 return(0);
1451 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1452 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001453 htmlAutoClose(ctxt, BAD_CAST"p");
1454 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001455 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1458 return(1);
1459 }
1460 }
1461 return(0);
1462}
1463
1464/**
1465 * htmlIsScriptAttribute:
1466 * @name: an attribute name
1467 *
1468 * Check if an attribute is of content type Script
1469 *
1470 * Returns 1 is the attribute is a script 0 otherwise
1471 */
1472int
1473htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001474 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001475
1476 if (name == NULL)
1477 return(0);
1478 /*
1479 * all script attributes start with 'on'
1480 */
1481 if ((name[0] != 'o') || (name[1] != 'n'))
1482 return(0);
1483 for (i = 0;
1484 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1485 i++) {
1486 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1487 return(1);
1488 }
1489 return(0);
1490}
1491
1492/************************************************************************
1493 * *
1494 * The list of HTML predefined entities *
1495 * *
1496 ************************************************************************/
1497
1498
Daniel Veillard22090732001-07-16 00:06:07 +00001499static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001500/*
1501 * the 4 absolute ones, plus apostrophe.
1502 */
1503{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1504{ 38, "amp", "ampersand, U+0026 ISOnum" },
1505{ 39, "apos", "single quote" },
1506{ 60, "lt", "less-than sign, U+003C ISOnum" },
1507{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1508
1509/*
1510 * A bunch still in the 128-255 range
1511 * Replacing them depend really on the charset used.
1512 */
1513{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1514{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1515{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1516{ 163, "pound","pound sign, U+00A3 ISOnum" },
1517{ 164, "curren","currency sign, U+00A4 ISOnum" },
1518{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1519{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1520{ 167, "sect", "section sign, U+00A7 ISOnum" },
1521{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1522{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1523{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1524{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1525{ 172, "not", "not sign, U+00AC ISOnum" },
1526{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1527{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1528{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1529{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1530{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1531{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1532{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1533{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1534{ 181, "micro","micro sign, U+00B5 ISOnum" },
1535{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1536{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1537{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1538{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1539{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1540{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1541{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1542{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1543{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1544{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1545{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1546{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1547{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1548{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1549{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1550{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1551{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1552{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1553{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1554{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1555{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1556{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1557{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1558{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1559{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1560{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1561{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1562{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1563{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1564{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1565{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1566{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1567{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1568{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1569{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1570{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1571{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1572{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1573{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1574{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1575{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1576{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1577{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1578{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1579{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1580{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1581{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1582{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1583{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1584{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1585{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1586{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1587{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1588{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1589{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1590{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1591{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1592{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1593{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1594{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1595{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1596{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1597{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1598{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1599{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1600{ 247, "divide","division sign, U+00F7 ISOnum" },
1601{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1602{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1603{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1604{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1605{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1606{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1607{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1608{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1609
1610{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1611{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1612{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1613{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1614{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1615
1616/*
1617 * Anything below should really be kept as entities references
1618 */
1619{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1620
1621{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1622{ 732, "tilde","small tilde, U+02DC ISOdia" },
1623
1624{ 913, "Alpha","greek capital letter alpha, U+0391" },
1625{ 914, "Beta", "greek capital letter beta, U+0392" },
1626{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1627{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1628{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1629{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1630{ 919, "Eta", "greek capital letter eta, U+0397" },
1631{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1632{ 921, "Iota", "greek capital letter iota, U+0399" },
1633{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001634{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001635{ 924, "Mu", "greek capital letter mu, U+039C" },
1636{ 925, "Nu", "greek capital letter nu, U+039D" },
1637{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1638{ 927, "Omicron","greek capital letter omicron, U+039F" },
1639{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1640{ 929, "Rho", "greek capital letter rho, U+03A1" },
1641{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1642{ 932, "Tau", "greek capital letter tau, U+03A4" },
1643{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1644{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1645{ 935, "Chi", "greek capital letter chi, U+03A7" },
1646{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1647{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1648
1649{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1650{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1651{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1652{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1653{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1654{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1655{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1656{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1657{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1658{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1659{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1660{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1661{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1662{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1663{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1664{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1665{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1666{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1667{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1668{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1669{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1670{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1671{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1672{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1673{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1674{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1675{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1676{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1677
1678{ 8194, "ensp", "en space, U+2002 ISOpub" },
1679{ 8195, "emsp", "em space, U+2003 ISOpub" },
1680{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1681{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1682{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1683{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1684{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1685{ 8211, "ndash","en dash, U+2013 ISOpub" },
1686{ 8212, "mdash","em dash, U+2014 ISOpub" },
1687{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1688{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1689{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1690{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1691{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1692{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1693{ 8224, "dagger","dagger, U+2020 ISOpub" },
1694{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1695
1696{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1697{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1698
1699{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1700
1701{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1702{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1703
1704{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1705{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1706
1707{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1708{ 8260, "frasl","fraction slash, U+2044 NEW" },
1709
1710{ 8364, "euro", "euro sign, U+20AC NEW" },
1711
1712{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1713{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1714{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1715{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1716{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1717{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1718{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1719{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1720{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1721{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1722{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1723{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1724{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1725{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1726{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1727{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1728
1729{ 8704, "forall","for all, U+2200 ISOtech" },
1730{ 8706, "part", "partial differential, U+2202 ISOtech" },
1731{ 8707, "exist","there exists, U+2203 ISOtech" },
1732{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1733{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1734{ 8712, "isin", "element of, U+2208 ISOtech" },
1735{ 8713, "notin","not an element of, U+2209 ISOtech" },
1736{ 8715, "ni", "contains as member, U+220B ISOtech" },
1737{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001738{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001739{ 8722, "minus","minus sign, U+2212 ISOtech" },
1740{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1741{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1742{ 8733, "prop", "proportional to, U+221D ISOtech" },
1743{ 8734, "infin","infinity, U+221E ISOtech" },
1744{ 8736, "ang", "angle, U+2220 ISOamso" },
1745{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1746{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1747{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1748{ 8746, "cup", "union = cup, U+222A ISOtech" },
1749{ 8747, "int", "integral, U+222B ISOtech" },
1750{ 8756, "there4","therefore, U+2234 ISOtech" },
1751{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1752{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1753{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1754{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1755{ 8801, "equiv","identical to, U+2261 ISOtech" },
1756{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1757{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1758{ 8834, "sub", "subset of, U+2282 ISOtech" },
1759{ 8835, "sup", "superset of, U+2283 ISOtech" },
1760{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1761{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1762{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1763{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1764{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1765{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1766{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1767{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1768{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1769{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1770{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1771{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1772{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1773{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1774
1775{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1776{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1777{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1778{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1779
1780};
1781
1782/************************************************************************
1783 * *
1784 * Commodity functions to handle entities *
1785 * *
1786 ************************************************************************/
1787
/*
 * Macro used to grow the current buffer: buffer##_size is doubled and
 * @buffer is reallocated to that many xmlChar.
 *
 * The macro expects a variable named ctxt to be visible at the point of
 * use (it is passed to htmlErrMemory). If the reallocation fails the old
 * buffer is freed and the enclosing function returns NULL, so it can
 * only be used inside functions returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {                                                  \
        buffer = tmp;                                                   \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1802
1803/**
1804 * htmlEntityLookup:
1805 * @name: the entity name
1806 *
1807 * Lookup the given entity in EntitiesTable
1808 *
1809 * TODO: the linear scan is really ugly, an hash table is really needed.
1810 *
1811 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1812 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001813const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001814htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001815 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 for (i = 0;i < (sizeof(html40EntitiesTable)/
1818 sizeof(html40EntitiesTable[0]));i++) {
1819 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001820 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001821 }
1822 }
1823 return(NULL);
1824}
1825
1826/**
1827 * htmlEntityValueLookup:
1828 * @value: the entity's unicode value
1829 *
1830 * Lookup the given entity in EntitiesTable
1831 *
1832 * TODO: the linear scan is really ugly, an hash table is really needed.
1833 *
1834 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1835 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001836const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001837htmlEntityValueLookup(unsigned int value) {
1838 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001839
1840 for (i = 0;i < (sizeof(html40EntitiesTable)/
1841 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001842 if (html40EntitiesTable[i].value >= value) {
1843 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001844 break;
William M. Brack78637da2003-07-31 14:47:38 +00001845 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001846 }
Owen Taylor3473f882001-02-23 17:55:21 +00001847 }
1848 return(NULL);
1849}
1850
1851/**
1852 * UTF8ToHtml:
1853 * @out: a pointer to an array of bytes to store the result
1854 * @outlen: the length of @out
1855 * @in: a pointer to an array of UTF-8 chars
1856 * @inlen: the length of @in
1857 *
1858 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1859 * plus HTML entities block of chars out.
1860 *
1861 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1862 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001863 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001864 * The value of @outlen after return is the number of octets consumed.
1865 */
1866int
1867UTF8ToHtml(unsigned char* out, int *outlen,
1868 const unsigned char* in, int *inlen) {
1869 const unsigned char* processed = in;
1870 const unsigned char* outend;
1871 const unsigned char* outstart = out;
1872 const unsigned char* instart = in;
1873 const unsigned char* inend;
1874 unsigned int c, d;
1875 int trailing;
1876
Daniel Veillardce682bc2004-11-05 17:22:25 +00001877 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001878 if (in == NULL) {
1879 /*
1880 * initialization nothing to do
1881 */
1882 *outlen = 0;
1883 *inlen = 0;
1884 return(0);
1885 }
1886 inend = in + (*inlen);
1887 outend = out + (*outlen);
1888 while (in < inend) {
1889 d = *in++;
1890 if (d < 0x80) { c= d; trailing= 0; }
1891 else if (d < 0xC0) {
1892 /* trailing byte in leading position */
1893 *outlen = out - outstart;
1894 *inlen = processed - instart;
1895 return(-2);
1896 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1897 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1898 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1899 else {
1900 /* no chance for this in Ascii */
1901 *outlen = out - outstart;
1902 *inlen = processed - instart;
1903 return(-2);
1904 }
1905
1906 if (inend - in < trailing) {
1907 break;
1908 }
1909
1910 for ( ; trailing; trailing--) {
1911 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1912 break;
1913 c <<= 6;
1914 c |= d & 0x3F;
1915 }
1916
1917 /* assertion: c is a single UTF-4 value */
1918 if (c < 0x80) {
1919 if (out + 1 >= outend)
1920 break;
1921 *out++ = c;
1922 } else {
1923 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001924 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001925 const char *cp;
1926 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001927
1928 /*
1929 * Try to lookup a predefined HTML entity for it
1930 */
1931
1932 ent = htmlEntityValueLookup(c);
1933 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001934 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1935 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001936 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001937 else
1938 cp = ent->name;
1939 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001940 if (out + 2 + len >= outend)
1941 break;
1942 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001943 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001944 out += len;
1945 *out++ = ';';
1946 }
1947 processed = in;
1948 }
1949 *outlen = out - outstart;
1950 *inlen = processed - instart;
1951 return(0);
1952}
1953
1954/**
1955 * htmlEncodeEntities:
1956 * @out: a pointer to an array of bytes to store the result
1957 * @outlen: the length of @out
1958 * @in: a pointer to an array of UTF-8 chars
1959 * @inlen: the length of @in
1960 * @quoteChar: the quote character to escape (' or ") or zero.
1961 *
1962 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1963 * plus HTML entities block of chars out.
1964 *
1965 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1966 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001967 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001968 * The value of @outlen after return is the number of octets consumed.
1969 */
1970int
1971htmlEncodeEntities(unsigned char* out, int *outlen,
1972 const unsigned char* in, int *inlen, int quoteChar) {
1973 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001974 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001975 const unsigned char* outstart = out;
1976 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001977 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001978 unsigned int c, d;
1979 int trailing;
1980
Daniel Veillardce682bc2004-11-05 17:22:25 +00001981 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1982 return(-1);
1983 outend = out + (*outlen);
1984 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001985 while (in < inend) {
1986 d = *in++;
1987 if (d < 0x80) { c= d; trailing= 0; }
1988 else if (d < 0xC0) {
1989 /* trailing byte in leading position */
1990 *outlen = out - outstart;
1991 *inlen = processed - instart;
1992 return(-2);
1993 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1994 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1995 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1996 else {
1997 /* no chance for this in Ascii */
1998 *outlen = out - outstart;
1999 *inlen = processed - instart;
2000 return(-2);
2001 }
2002
2003 if (inend - in < trailing)
2004 break;
2005
2006 while (trailing--) {
2007 if (((d= *in++) & 0xC0) != 0x80) {
2008 *outlen = out - outstart;
2009 *inlen = processed - instart;
2010 return(-2);
2011 }
2012 c <<= 6;
2013 c |= d & 0x3F;
2014 }
2015
2016 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002017 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2018 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002019 if (out >= outend)
2020 break;
2021 *out++ = c;
2022 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002023 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002024 const char *cp;
2025 char nbuf[16];
2026 int len;
2027
2028 /*
2029 * Try to lookup a predefined HTML entity for it
2030 */
2031 ent = htmlEntityValueLookup(c);
2032 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002033 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002034 cp = nbuf;
2035 }
2036 else
2037 cp = ent->name;
2038 len = strlen(cp);
2039 if (out + 2 + len > outend)
2040 break;
2041 *out++ = '&';
2042 memcpy(out, cp, len);
2043 out += len;
2044 *out++ = ';';
2045 }
2046 processed = in;
2047 }
2048 *outlen = out - outstart;
2049 *inlen = processed - instart;
2050 return(0);
2051}
2052
Owen Taylor3473f882001-02-23 17:55:21 +00002053/************************************************************************
2054 * *
2055 * Commodity functions to handle streams *
2056 * *
2057 ************************************************************************/
2058
2059/**
Owen Taylor3473f882001-02-23 17:55:21 +00002060 * htmlNewInputStream:
2061 * @ctxt: an HTML parser context
2062 *
2063 * Create a new input stream structure
2064 * Returns the new input stream or NULL
2065 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002066static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002067htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2068 htmlParserInputPtr input;
2069
2070 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2071 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002072 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002073 return(NULL);
2074 }
2075 memset(input, 0, sizeof(htmlParserInput));
2076 input->filename = NULL;
2077 input->directory = NULL;
2078 input->base = NULL;
2079 input->cur = NULL;
2080 input->buf = NULL;
2081 input->line = 1;
2082 input->col = 1;
2083 input->buf = NULL;
2084 input->free = NULL;
2085 input->version = NULL;
2086 input->consumed = 0;
2087 input->length = 0;
2088 return(input);
2089}
2090
2091
2092/************************************************************************
2093 * *
2094 * Commodity functions, cleanup needed ? *
2095 * *
2096 ************************************************************************/
/*
 * All the tags allowing PCDATA content, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 * The entries are kept in alphabetical order, one initial letter per line.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002111
2112/**
2113 * areBlanks:
2114 * @ctxt: an HTML parser context
2115 * @str: a xmlChar *
2116 * @len: the size of @str
2117 *
2118 * Is this a sequence of blank chars that one can ignore ?
2119 *
2120 * Returns 1 if ignorable 0 otherwise.
2121 */
2122
2123static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002124 unsigned int i;
2125 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002126 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002127 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002128
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002129 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002130 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002131
2132 if (CUR == 0) return(1);
2133 if (CUR != '<') return(0);
2134 if (ctxt->name == NULL)
2135 return(1);
2136 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2137 return(1);
2138 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2139 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002140
2141 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2142 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2143 dtd = xmlGetIntSubset(ctxt->myDoc);
2144 if (dtd != NULL && dtd->ExternalID != NULL) {
2145 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2146 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2147 return(1);
2148 }
2149 }
2150
Owen Taylor3473f882001-02-23 17:55:21 +00002151 if (ctxt->node == NULL) return(0);
2152 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002153 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2154 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002155 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002156 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2157 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002158 /* keep ws in constructs like ...<b> </b>...
2159 for all tags "b" allowing PCDATA */
2160 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2161 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2162 return(0);
2163 }
2164 }
Owen Taylor3473f882001-02-23 17:55:21 +00002165 } else if (xmlNodeIsText(lastChild)) {
2166 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002167 } else {
2168 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2169 for all tags "p" allowing PCDATA */
2170 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2171 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2172 return(0);
2173 }
2174 }
Owen Taylor3473f882001-02-23 17:55:21 +00002175 }
2176 return(1);
2177}
2178
2179/**
Owen Taylor3473f882001-02-23 17:55:21 +00002180 * htmlNewDocNoDtD:
2181 * @URI: URI for the dtd, or NULL
2182 * @ExternalID: the external ID of the DTD, or NULL
2183 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002184 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2185 * are NULL
2186 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002187 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002188 */
2189htmlDocPtr
2190htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2191 xmlDocPtr cur;
2192
2193 /*
2194 * Allocate a new document and fill the fields.
2195 */
2196 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2197 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002198 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002199 return(NULL);
2200 }
2201 memset(cur, 0, sizeof(xmlDoc));
2202
2203 cur->type = XML_HTML_DOCUMENT_NODE;
2204 cur->version = NULL;
2205 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002206 cur->doc = cur;
2207 cur->name = NULL;
2208 cur->children = NULL;
2209 cur->extSubset = NULL;
2210 cur->oldNs = NULL;
2211 cur->encoding = NULL;
2212 cur->standalone = 1;
2213 cur->compression = 0;
2214 cur->ids = NULL;
2215 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002216 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002217 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002218 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002219 if ((ExternalID != NULL) ||
2220 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002221 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002222 return(cur);
2223}
2224
2225/**
2226 * htmlNewDoc:
2227 * @URI: URI for the dtd, or NULL
2228 * @ExternalID: the external ID of the DTD, or NULL
2229 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002230 * Creates a new HTML document
2231 *
Owen Taylor3473f882001-02-23 17:55:21 +00002232 * Returns a new document
2233 */
2234htmlDocPtr
2235htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2236 if ((URI == NULL) && (ExternalID == NULL))
2237 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002238 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2239 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002240
2241 return(htmlNewDocNoDtD(URI, ExternalID));
2242}
2243
2244
2245/************************************************************************
2246 * *
2247 * The parser itself *
2248 * Relates to http://www.w3.org/TR/html40 *
2249 * *
2250 ************************************************************************/
2251
2252/************************************************************************
2253 * *
2254 * The parser itself *
2255 * *
2256 ************************************************************************/
2257
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002258static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002259
Owen Taylor3473f882001-02-23 17:55:21 +00002260/**
2261 * htmlParseHTMLName:
2262 * @ctxt: an HTML parser context
2263 *
2264 * parse an HTML tag or attribute name, note that we convert it to lowercase
2265 * since HTML names are not case-sensitive.
2266 *
2267 * Returns the Tag Name parsed or NULL
2268 */
2269
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002270static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002271htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002272 int i = 0;
2273 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2274
William M. Brackd1757ab2004-10-02 22:07:48 +00002275 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002276 (CUR != ':')) return(NULL);
2277
2278 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002279 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002280 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2281 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2282 else loc[i] = CUR;
2283 i++;
2284
2285 NEXT;
2286 }
2287
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002288 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002289}
2290
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002291
2292/**
2293 * htmlParseHTMLName_nonInvasive:
2294 * @ctxt: an HTML parser context
2295 *
2296 * parse an HTML tag or attribute name, note that we convert it to lowercase
2297 * since HTML names are not case-sensitive, this doesn't consume the data
2298 * from the stream, it's a look-ahead
2299 *
2300 * Returns the Tag Name parsed or NULL
2301 */
2302
2303static const xmlChar *
2304htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2305 int i = 0;
2306 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2307
2308 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2309 (NXT(1) != ':')) return(NULL);
2310
2311 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2312 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2313 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2314 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2315 else loc[i] = NXT(1+i);
2316 i++;
2317 }
2318
2319 return(xmlDictLookup(ctxt->dict, loc, i));
2320}
2321
2322
Owen Taylor3473f882001-02-23 17:55:21 +00002323/**
2324 * htmlParseName:
2325 * @ctxt: an HTML parser context
2326 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002327 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002328 *
2329 * Returns the Name parsed or NULL
2330 */
2331
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002332static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002333htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002334 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002335 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002336 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002337
2338 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002339
2340 /*
2341 * Accelerator for simple ASCII names
2342 */
2343 in = ctxt->input->cur;
2344 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2345 ((*in >= 0x41) && (*in <= 0x5A)) ||
2346 (*in == '_') || (*in == ':')) {
2347 in++;
2348 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2349 ((*in >= 0x41) && (*in <= 0x5A)) ||
2350 ((*in >= 0x30) && (*in <= 0x39)) ||
2351 (*in == '_') || (*in == '-') ||
2352 (*in == ':') || (*in == '.'))
2353 in++;
2354 if ((*in > 0) && (*in < 0x80)) {
2355 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002356 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002357 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002358 ctxt->nbChars += count;
2359 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002360 return(ret);
2361 }
2362 }
2363 return(htmlParseNameComplex(ctxt));
2364}
2365
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002366static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002367htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002368 int len = 0, l;
2369 int c;
2370 int count = 0;
2371
2372 /*
2373 * Handler for more complex cases
2374 */
2375 GROW;
2376 c = CUR_CHAR(l);
2377 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2378 (!IS_LETTER(c) && (c != '_') &&
2379 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002380 return(NULL);
2381 }
2382
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002383 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2384 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2385 (c == '.') || (c == '-') ||
2386 (c == '_') || (c == ':') ||
2387 (IS_COMBINING(c)) ||
2388 (IS_EXTENDER(c)))) {
2389 if (count++ > 100) {
2390 count = 0;
2391 GROW;
2392 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002393 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002394 NEXTL(l);
2395 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002396 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002397 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002398}
2399
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002400
Owen Taylor3473f882001-02-23 17:55:21 +00002401/**
2402 * htmlParseHTMLAttribute:
2403 * @ctxt: an HTML parser context
2404 * @stop: a char stop value
2405 *
2406 * parse an HTML attribute value till the stop (quote), if
2407 * stop is 0 then it stops at the first space
2408 *
2409 * Returns the attribute parsed or NULL
2410 */
2411
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002412static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002413htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2414 xmlChar *buffer = NULL;
2415 int buffer_size = 0;
2416 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002417 const xmlChar *name = NULL;
2418 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002419 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002420
2421 /*
2422 * allocate a translation buffer.
2423 */
2424 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002425 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002426 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002427 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002428 return(NULL);
2429 }
2430 out = buffer;
2431
2432 /*
2433 * Ok loop until we reach one of the ending chars
2434 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002435 while ((CUR != 0) && (CUR != stop)) {
2436 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002437 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002438 if (CUR == '&') {
2439 if (NXT(1) == '#') {
2440 unsigned int c;
2441 int bits;
2442
2443 c = htmlParseCharRef(ctxt);
2444 if (c < 0x80)
2445 { *out++ = c; bits= -6; }
2446 else if (c < 0x800)
2447 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2448 else if (c < 0x10000)
2449 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2450 else
2451 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2452
2453 for ( ; bits >= 0; bits-= 6) {
2454 *out++ = ((c >> bits) & 0x3F) | 0x80;
2455 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002456
2457 if (out - buffer > buffer_size - 100) {
2458 int indx = out - buffer;
2459
2460 growBuffer(buffer);
2461 out = &buffer[indx];
2462 }
Owen Taylor3473f882001-02-23 17:55:21 +00002463 } else {
2464 ent = htmlParseEntityRef(ctxt, &name);
2465 if (name == NULL) {
2466 *out++ = '&';
2467 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002468 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002469
2470 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002471 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002472 }
2473 } else if (ent == NULL) {
2474 *out++ = '&';
2475 cur = name;
2476 while (*cur != 0) {
2477 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002478 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002479
2480 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002481 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002482 }
2483 *out++ = *cur++;
2484 }
Owen Taylor3473f882001-02-23 17:55:21 +00002485 } else {
2486 unsigned int c;
2487 int bits;
2488
2489 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002490 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002491
2492 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002493 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002494 }
Daniel Veillard48519092006-10-17 15:56:35 +00002495 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002496 if (c < 0x80)
2497 { *out++ = c; bits= -6; }
2498 else if (c < 0x800)
2499 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2500 else if (c < 0x10000)
2501 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2502 else
2503 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2504
2505 for ( ; bits >= 0; bits-= 6) {
2506 *out++ = ((c >> bits) & 0x3F) | 0x80;
2507 }
Owen Taylor3473f882001-02-23 17:55:21 +00002508 }
2509 }
2510 } else {
2511 unsigned int c;
2512 int bits, l;
2513
2514 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002515 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002516
2517 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002518 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002519 }
2520 c = CUR_CHAR(l);
2521 if (c < 0x80)
2522 { *out++ = c; bits= -6; }
2523 else if (c < 0x800)
2524 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2525 else if (c < 0x10000)
2526 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2527 else
2528 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2529
2530 for ( ; bits >= 0; bits-= 6) {
2531 *out++ = ((c >> bits) & 0x3F) | 0x80;
2532 }
2533 NEXT;
2534 }
2535 }
2536 *out++ = 0;
2537 return(buffer);
2538}
2539
2540/**
Owen Taylor3473f882001-02-23 17:55:21 +00002541 * htmlParseEntityRef:
2542 * @ctxt: an HTML parser context
2543 * @str: location to store the entity name
2544 *
2545 * parse an HTML ENTITY references
2546 *
2547 * [68] EntityRef ::= '&' Name ';'
2548 *
2549 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2550 * if non-NULL *str will have to be freed by the caller.
2551 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002552const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002553htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2554 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002555 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002556
2557 if (str != NULL) *str = NULL;
2558 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002559
2560 if (CUR == '&') {
2561 NEXT;
2562 name = htmlParseName(ctxt);
2563 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002564 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2565 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002566 } else {
2567 GROW;
2568 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002569 if (str != NULL)
2570 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002571
2572 /*
2573 * Lookup the entity in the table.
2574 */
2575 ent = htmlEntityLookup(name);
2576 if (ent != NULL) /* OK that's ugly !!! */
2577 NEXT;
2578 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002579 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2580 "htmlParseEntityRef: expecting ';'\n",
2581 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002582 if (str != NULL)
2583 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002584 }
2585 }
2586 }
2587 return(ent);
2588}
2589
2590/**
2591 * htmlParseAttValue:
2592 * @ctxt: an HTML parser context
2593 *
2594 * parse a value for an attribute
2595 * Note: the parser won't do substitution of entities here, this
2596 * will be handled later in xmlStringGetNodeList, unless it was
2597 * asked for ctxt->replaceEntities != 0
2598 *
2599 * Returns the AttValue parsed or NULL.
2600 */
2601
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002602static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002603htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2604 xmlChar *ret = NULL;
2605
2606 if (CUR == '"') {
2607 NEXT;
2608 ret = htmlParseHTMLAttribute(ctxt, '"');
2609 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002610 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2611 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002612 } else
2613 NEXT;
2614 } else if (CUR == '\'') {
2615 NEXT;
2616 ret = htmlParseHTMLAttribute(ctxt, '\'');
2617 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002618 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2619 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002620 } else
2621 NEXT;
2622 } else {
2623 /*
2624 * That's an HTMLism, the attribute value may not be quoted
2625 */
2626 ret = htmlParseHTMLAttribute(ctxt, 0);
2627 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002628 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2629 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002630 }
2631 }
2632 return(ret);
2633}
2634
2635/**
2636 * htmlParseSystemLiteral:
2637 * @ctxt: an HTML parser context
2638 *
2639 * parse an HTML Literal
2640 *
2641 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2642 *
2643 * Returns the SystemLiteral parsed or NULL
2644 */
2645
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002646static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002647htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2648 const xmlChar *q;
2649 xmlChar *ret = NULL;
2650
2651 if (CUR == '"') {
2652 NEXT;
2653 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002654 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002655 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002656 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002657 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2658 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002659 } else {
2660 ret = xmlStrndup(q, CUR_PTR - q);
2661 NEXT;
2662 }
2663 } else if (CUR == '\'') {
2664 NEXT;
2665 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002666 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002667 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002668 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002669 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2670 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002671 } else {
2672 ret = xmlStrndup(q, CUR_PTR - q);
2673 NEXT;
2674 }
2675 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002676 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2677 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002678 }
2679
2680 return(ret);
2681}
2682
2683/**
2684 * htmlParsePubidLiteral:
2685 * @ctxt: an HTML parser context
2686 *
2687 * parse an HTML public literal
2688 *
2689 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2690 *
2691 * Returns the PubidLiteral parsed or NULL.
2692 */
2693
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002694static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002695htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2696 const xmlChar *q;
2697 xmlChar *ret = NULL;
2698 /*
2699 * Name ::= (Letter | '_') (NameChar)*
2700 */
2701 if (CUR == '"') {
2702 NEXT;
2703 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002704 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002705 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002706 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2707 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002708 } else {
2709 ret = xmlStrndup(q, CUR_PTR - q);
2710 NEXT;
2711 }
2712 } else if (CUR == '\'') {
2713 NEXT;
2714 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002715 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002716 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002717 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002718 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2719 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002720 } else {
2721 ret = xmlStrndup(q, CUR_PTR - q);
2722 NEXT;
2723 }
2724 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002725 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2726 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002727 }
2728
2729 return(ret);
2730}
2731
2732/**
2733 * htmlParseScript:
2734 * @ctxt: an HTML parser context
2735 *
2736 * parse the content of an HTML SCRIPT or STYLE element
2737 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2738 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2739 * http://www.w3.org/TR/html4/types.html#type-script
2740 * http://www.w3.org/TR/html4/types.html#h-6.15
2741 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2742 *
2743 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2744 * element and the value of intrinsic event attributes. User agents must
2745 * not evaluate script data as HTML markup but instead must pass it on as
2746 * data to a script engine.
2747 * NOTES:
2748 * - The content is passed like CDATA
2749 * - the attributes for style and scripting "onXXX" are also described
2750 * as CDATA but SGML allows entities references in attributes so their
2751 * processing is identical as other attributes
2752 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002753static void
Owen Taylor3473f882001-02-23 17:55:21 +00002754htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002755 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002756 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002757 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002758
2759 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002760 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002761 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002762 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002763 /*
2764 * One should break here, the specification is clear:
2765 * Authors should therefore escape "</" within the content.
2766 * Escape mechanisms are specific to each scripting or
2767 * style sheet language.
2768 *
2769 * In recovery mode, only break if end tag match the
2770 * current tag, effectively ignoring all tags inside the
2771 * script/style block and treating the entire block as
2772 * CDATA.
2773 */
2774 if (ctxt->recovery) {
2775 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2776 xmlStrlen(ctxt->name)) == 0)
2777 {
2778 break; /* while */
2779 } else {
2780 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002781 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002782 ctxt->name, NULL);
2783 }
2784 } else {
2785 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2786 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2787 {
2788 break; /* while */
2789 }
2790 }
Owen Taylor3473f882001-02-23 17:55:21 +00002791 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002792 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002793 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2794 if (ctxt->sax->cdataBlock!= NULL) {
2795 /*
2796 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2797 */
2798 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002799 } else if (ctxt->sax->characters != NULL) {
2800 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002801 }
2802 nbchar = 0;
2803 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002804 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002805 NEXTL(l);
2806 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002807 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002808
Daniel Veillard68716a72006-10-16 09:32:17 +00002809 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002810 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2811 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002812 NEXT;
2813 }
2814
2815 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2816 if (ctxt->sax->cdataBlock!= NULL) {
2817 /*
2818 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2819 */
2820 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002821 } else if (ctxt->sax->characters != NULL) {
2822 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002823 }
2824 }
2825}
2826
2827
2828/**
2829 * htmlParseCharData:
2830 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002831 *
2832 * parse a CharData section.
2833 * if we are within a CDATA section ']]>' marks an end of section.
2834 *
2835 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2836 */
2837
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002838static void
2839htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002840 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2841 int nbchar = 0;
2842 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002843 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002844
2845 SHRINK;
2846 cur = CUR_CHAR(l);
2847 while (((cur != '<') || (ctxt->token == '<')) &&
2848 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002849 (cur != 0)) {
2850 if (!(IS_CHAR(cur))) {
2851 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2852 "Invalid char in CDATA 0x%X\n", cur);
2853 } else {
2854 COPY_BUF(l,buf,nbchar,cur);
2855 }
Owen Taylor3473f882001-02-23 17:55:21 +00002856 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2857 /*
2858 * Ok the segment is to be consumed as chars.
2859 */
2860 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2861 if (areBlanks(ctxt, buf, nbchar)) {
2862 if (ctxt->sax->ignorableWhitespace != NULL)
2863 ctxt->sax->ignorableWhitespace(ctxt->userData,
2864 buf, nbchar);
2865 } else {
2866 htmlCheckParagraph(ctxt);
2867 if (ctxt->sax->characters != NULL)
2868 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2869 }
2870 }
2871 nbchar = 0;
2872 }
2873 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002874 chunk++;
2875 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2876 chunk = 0;
2877 SHRINK;
2878 GROW;
2879 }
Owen Taylor3473f882001-02-23 17:55:21 +00002880 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002881 if (cur == 0) {
2882 SHRINK;
2883 GROW;
2884 cur = CUR_CHAR(l);
2885 }
Owen Taylor3473f882001-02-23 17:55:21 +00002886 }
2887 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002888 buf[nbchar] = 0;
2889
Owen Taylor3473f882001-02-23 17:55:21 +00002890 /*
2891 * Ok the segment is to be consumed as chars.
2892 */
2893 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2894 if (areBlanks(ctxt, buf, nbchar)) {
2895 if (ctxt->sax->ignorableWhitespace != NULL)
2896 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2897 } else {
2898 htmlCheckParagraph(ctxt);
2899 if (ctxt->sax->characters != NULL)
2900 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2901 }
2902 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002903 } else {
2904 /*
2905 * Loop detection
2906 */
2907 if (cur == 0)
2908 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002909 }
2910}
2911
2912/**
2913 * htmlParseExternalID:
2914 * @ctxt: an HTML parser context
2915 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002916 *
2917 * Parse an External ID or a Public ID
2918 *
Owen Taylor3473f882001-02-23 17:55:21 +00002919 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2920 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2921 *
2922 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2923 *
2924 * Returns the function returns SystemLiteral and in the second
2925 * case publicID receives PubidLiteral, is strict is off
2926 * it is possible to return NULL and have publicID set.
2927 */
2928
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002929static xmlChar *
2930htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002931 xmlChar *URI = NULL;
2932
2933 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2934 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2935 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2936 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002937 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002938 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2939 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002940 }
2941 SKIP_BLANKS;
2942 URI = htmlParseSystemLiteral(ctxt);
2943 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002944 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2945 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002946 }
2947 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2948 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2949 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2950 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002951 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002952 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2953 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002954 }
2955 SKIP_BLANKS;
2956 *publicID = htmlParsePubidLiteral(ctxt);
2957 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002958 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2959 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2960 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002961 }
2962 SKIP_BLANKS;
2963 if ((CUR == '"') || (CUR == '\'')) {
2964 URI = htmlParseSystemLiteral(ctxt);
2965 }
2966 }
2967 return(URI);
2968}
2969
2970/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002971 * xmlParsePI:
2972 * @ctxt: an XML parser context
2973 *
2974 * parse an XML Processing Instruction.
2975 *
2976 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2977 */
2978static void
2979htmlParsePI(htmlParserCtxtPtr ctxt) {
2980 xmlChar *buf = NULL;
2981 int len = 0;
2982 int size = HTML_PARSER_BUFFER_SIZE;
2983 int cur, l;
2984 const xmlChar *target;
2985 xmlParserInputState state;
2986 int count = 0;
2987
2988 if ((RAW == '<') && (NXT(1) == '?')) {
2989 state = ctxt->instate;
2990 ctxt->instate = XML_PARSER_PI;
2991 /*
2992 * this is a Processing Instruction.
2993 */
2994 SKIP(2);
2995 SHRINK;
2996
2997 /*
2998 * Parse the target name and check for special support like
2999 * namespace.
3000 */
3001 target = htmlParseName(ctxt);
3002 if (target != NULL) {
3003 if (RAW == '>') {
3004 SKIP(1);
3005
3006 /*
3007 * SAX: PI detected.
3008 */
3009 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3010 (ctxt->sax->processingInstruction != NULL))
3011 ctxt->sax->processingInstruction(ctxt->userData,
3012 target, NULL);
3013 ctxt->instate = state;
3014 return;
3015 }
3016 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3017 if (buf == NULL) {
3018 htmlErrMemory(ctxt, NULL);
3019 ctxt->instate = state;
3020 return;
3021 }
3022 cur = CUR;
3023 if (!IS_BLANK(cur)) {
3024 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3025 "ParsePI: PI %s space expected\n", target, NULL);
3026 }
3027 SKIP_BLANKS;
3028 cur = CUR_CHAR(l);
3029 while (IS_CHAR(cur) && (cur != '>')) {
3030 if (len + 5 >= size) {
3031 xmlChar *tmp;
3032
3033 size *= 2;
3034 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3035 if (tmp == NULL) {
3036 htmlErrMemory(ctxt, NULL);
3037 xmlFree(buf);
3038 ctxt->instate = state;
3039 return;
3040 }
3041 buf = tmp;
3042 }
3043 count++;
3044 if (count > 50) {
3045 GROW;
3046 count = 0;
3047 }
3048 COPY_BUF(l,buf,len,cur);
3049 NEXTL(l);
3050 cur = CUR_CHAR(l);
3051 if (cur == 0) {
3052 SHRINK;
3053 GROW;
3054 cur = CUR_CHAR(l);
3055 }
3056 }
3057 buf[len] = 0;
3058 if (cur != '>') {
3059 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3060 "ParsePI: PI %s never end ...\n", target, NULL);
3061 } else {
3062 SKIP(1);
3063
3064 /*
3065 * SAX: PI detected.
3066 */
3067 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3068 (ctxt->sax->processingInstruction != NULL))
3069 ctxt->sax->processingInstruction(ctxt->userData,
3070 target, buf);
3071 }
3072 xmlFree(buf);
3073 } else {
3074 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3075 "PI is not started correctly", NULL, NULL);
3076 }
3077 ctxt->instate = state;
3078 }
3079}
3080
3081/**
Owen Taylor3473f882001-02-23 17:55:21 +00003082 * htmlParseComment:
3083 * @ctxt: an HTML parser context
3084 *
3085 * Parse an XML (SGML) comment <!-- .... -->
3086 *
3087 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3088 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003089static void
Owen Taylor3473f882001-02-23 17:55:21 +00003090htmlParseComment(htmlParserCtxtPtr ctxt) {
3091 xmlChar *buf = NULL;
3092 int len;
3093 int size = HTML_PARSER_BUFFER_SIZE;
3094 int q, ql;
3095 int r, rl;
3096 int cur, l;
3097 xmlParserInputState state;
3098
3099 /*
3100 * Check that there is a comment right here.
3101 */
3102 if ((RAW != '<') || (NXT(1) != '!') ||
3103 (NXT(2) != '-') || (NXT(3) != '-')) return;
3104
3105 state = ctxt->instate;
3106 ctxt->instate = XML_PARSER_COMMENT;
3107 SHRINK;
3108 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003109 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003110 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003111 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003112 ctxt->instate = state;
3113 return;
3114 }
3115 q = CUR_CHAR(ql);
3116 NEXTL(ql);
3117 r = CUR_CHAR(rl);
3118 NEXTL(rl);
3119 cur = CUR_CHAR(l);
3120 len = 0;
3121 while (IS_CHAR(cur) &&
3122 ((cur != '>') ||
3123 (r != '-') || (q != '-'))) {
3124 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003125 xmlChar *tmp;
3126
Owen Taylor3473f882001-02-23 17:55:21 +00003127 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003128 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3129 if (tmp == NULL) {
3130 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003131 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003132 ctxt->instate = state;
3133 return;
3134 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003135 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003136 }
3137 COPY_BUF(ql,buf,len,q);
3138 q = r;
3139 ql = rl;
3140 r = cur;
3141 rl = l;
3142 NEXTL(l);
3143 cur = CUR_CHAR(l);
3144 if (cur == 0) {
3145 SHRINK;
3146 GROW;
3147 cur = CUR_CHAR(l);
3148 }
3149 }
3150 buf[len] = 0;
3151 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003152 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3153 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003154 xmlFree(buf);
3155 } else {
3156 NEXT;
3157 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3158 (!ctxt->disableSAX))
3159 ctxt->sax->comment(ctxt->userData, buf);
3160 xmlFree(buf);
3161 }
3162 ctxt->instate = state;
3163}
3164
3165/**
3166 * htmlParseCharRef:
3167 * @ctxt: an HTML parser context
3168 *
3169 * parse Reference declarations
3170 *
3171 * [66] CharRef ::= '&#' [0-9]+ ';' |
3172 * '&#x' [0-9a-fA-F]+ ';'
3173 *
3174 * Returns the value parsed (as an int)
3175 */
3176int
3177htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3178 int val = 0;
3179
Daniel Veillarda03e3652004-11-02 18:45:30 +00003180 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3181 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3182 "htmlParseCharRef: context error\n",
3183 NULL, NULL);
3184 return(0);
3185 }
Owen Taylor3473f882001-02-23 17:55:21 +00003186 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003187 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003188 SKIP(3);
3189 while (CUR != ';') {
3190 if ((CUR >= '0') && (CUR <= '9'))
3191 val = val * 16 + (CUR - '0');
3192 else if ((CUR >= 'a') && (CUR <= 'f'))
3193 val = val * 16 + (CUR - 'a') + 10;
3194 else if ((CUR >= 'A') && (CUR <= 'F'))
3195 val = val * 16 + (CUR - 'A') + 10;
3196 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003197 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003198 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003199 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003200 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003201 }
3202 NEXT;
3203 }
3204 if (CUR == ';')
3205 NEXT;
3206 } else if ((CUR == '&') && (NXT(1) == '#')) {
3207 SKIP(2);
3208 while (CUR != ';') {
3209 if ((CUR >= '0') && (CUR <= '9'))
3210 val = val * 10 + (CUR - '0');
3211 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003212 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003213 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003214 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003215 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003216 }
3217 NEXT;
3218 }
3219 if (CUR == ';')
3220 NEXT;
3221 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003222 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3223 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003224 }
3225 /*
3226 * Check the value IS_CHAR ...
3227 */
3228 if (IS_CHAR(val)) {
3229 return(val);
3230 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003231 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3232 "htmlParseCharRef: invalid xmlChar value %d\n",
3233 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003234 }
3235 return(0);
3236}
3237
3238
3239/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003240 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003241 * @ctxt: an HTML parser context
3242 *
3243 * parse a DOCTYPE declaration
3244 *
3245 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3246 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3247 */
3248
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003249static void
Owen Taylor3473f882001-02-23 17:55:21 +00003250htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003251 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003252 xmlChar *ExternalID = NULL;
3253 xmlChar *URI = NULL;
3254
3255 /*
3256 * We know that '<!DOCTYPE' has been detected.
3257 */
3258 SKIP(9);
3259
3260 SKIP_BLANKS;
3261
3262 /*
3263 * Parse the DOCTYPE name.
3264 */
3265 name = htmlParseName(ctxt);
3266 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003267 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3268 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3269 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003270 }
3271 /*
3272 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3273 */
3274
3275 SKIP_BLANKS;
3276
3277 /*
3278 * Check for SystemID and ExternalID
3279 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003280 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003281 SKIP_BLANKS;
3282
3283 /*
3284 * We should be at the end of the DOCTYPE declaration.
3285 */
3286 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003287 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3288 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003289 /* We shouldn't try to resynchronize ... */
3290 }
3291 NEXT;
3292
3293 /*
3294 * Create or update the document accordingly to the DOCTYPE
3295 */
3296 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3297 (!ctxt->disableSAX))
3298 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3299
3300 /*
3301 * Cleanup, since we don't use all those identifiers
3302 */
3303 if (URI != NULL) xmlFree(URI);
3304 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003305}
3306
3307/**
3308 * htmlParseAttribute:
3309 * @ctxt: an HTML parser context
3310 * @value: a xmlChar ** used to store the value of the attribute
3311 *
3312 * parse an attribute
3313 *
3314 * [41] Attribute ::= Name Eq AttValue
3315 *
3316 * [25] Eq ::= S? '=' S?
3317 *
3318 * With namespace:
3319 *
3320 * [NS 11] Attribute ::= QName Eq AttValue
3321 *
3322 * Also the case QName == xmlns:??? is handled independently as a namespace
3323 * definition.
3324 *
3325 * Returns the attribute name, and the value in *value.
3326 */
3327
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003328static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003329htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003330 const xmlChar *name;
3331 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003332
3333 *value = NULL;
3334 name = htmlParseHTMLName(ctxt);
3335 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003336 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3337 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003338 return(NULL);
3339 }
3340
3341 /*
3342 * read the value
3343 */
3344 SKIP_BLANKS;
3345 if (CUR == '=') {
3346 NEXT;
3347 SKIP_BLANKS;
3348 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003349 } else if (htmlIsBooleanAttr(name)) {
3350 /*
3351 * assume a minimized attribute
3352 */
3353 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003354 }
3355
3356 *value = val;
3357 return(name);
3358}
3359
3360/**
3361 * htmlCheckEncoding:
3362 * @ctxt: an HTML parser context
3363 * @attvalue: the attribute value
3364 *
3365 * Checks an http-equiv attribute from a Meta tag to detect
3366 * the encoding
3367 * If a new encoding is detected the parser is switched to decode
3368 * it and pass UTF8
3369 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003370static void
Owen Taylor3473f882001-02-23 17:55:21 +00003371htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3372 const xmlChar *encoding;
3373
3374 if ((ctxt == NULL) || (attvalue == NULL))
3375 return;
3376
3377 /* do not change encoding */
3378 if (ctxt->input->encoding != NULL)
3379 return;
3380
3381 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3382 if (encoding != NULL) {
3383 encoding += 8;
3384 } else {
3385 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3386 if (encoding != NULL)
3387 encoding += 9;
3388 }
3389 if (encoding != NULL) {
3390 xmlCharEncoding enc;
3391 xmlCharEncodingHandlerPtr handler;
3392
3393 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3394
3395 if (ctxt->input->encoding != NULL)
3396 xmlFree((xmlChar *) ctxt->input->encoding);
3397 ctxt->input->encoding = xmlStrdup(encoding);
3398
3399 enc = xmlParseCharEncoding((const char *) encoding);
3400 /*
3401 * registered set of known encodings
3402 */
3403 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003404 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3405 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3406 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3407 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3408 (ctxt->input->buf != NULL) &&
3409 (ctxt->input->buf->encoder == NULL)) {
3410 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3411 "htmlCheckEncoding: wrong encoding meta\n",
3412 NULL, NULL);
3413 } else {
3414 xmlSwitchEncoding(ctxt, enc);
3415 }
Owen Taylor3473f882001-02-23 17:55:21 +00003416 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3417 } else {
3418 /*
3419 * fallback for unknown encodings
3420 */
3421 handler = xmlFindCharEncodingHandler((const char *) encoding);
3422 if (handler != NULL) {
3423 xmlSwitchToEncoding(ctxt, handler);
3424 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3425 } else {
3426 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3427 }
3428 }
3429
3430 if ((ctxt->input->buf != NULL) &&
3431 (ctxt->input->buf->encoder != NULL) &&
3432 (ctxt->input->buf->raw != NULL) &&
3433 (ctxt->input->buf->buffer != NULL)) {
3434 int nbchars;
3435 int processed;
3436
3437 /*
3438 * convert as much as possible to the parser reading buffer.
3439 */
3440 processed = ctxt->input->cur - ctxt->input->base;
3441 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3442 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3443 ctxt->input->buf->buffer,
3444 ctxt->input->buf->raw);
3445 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003446 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3447 "htmlCheckEncoding: encoder error\n",
3448 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003449 }
3450 ctxt->input->base =
3451 ctxt->input->cur = ctxt->input->buf->buffer->content;
3452 }
3453 }
3454}
3455
3456/**
3457 * htmlCheckMeta:
3458 * @ctxt: an HTML parser context
3459 * @atts: the attributes values
3460 *
3461 * Checks an attributes from a Meta tag
3462 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003463static void
Owen Taylor3473f882001-02-23 17:55:21 +00003464htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3465 int i;
3466 const xmlChar *att, *value;
3467 int http = 0;
3468 const xmlChar *content = NULL;
3469
3470 if ((ctxt == NULL) || (atts == NULL))
3471 return;
3472
3473 i = 0;
3474 att = atts[i++];
3475 while (att != NULL) {
3476 value = atts[i++];
3477 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3478 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3479 http = 1;
3480 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3481 content = value;
3482 att = atts[i++];
3483 }
3484 if ((http) && (content != NULL))
3485 htmlCheckEncoding(ctxt, content);
3486
3487}
3488
3489/**
3490 * htmlParseStartTag:
3491 * @ctxt: an HTML parser context
3492 *
3493 * parse a start of tag either for rule element or
3494 * EmptyElement. In both case we don't parse the tag closing chars.
3495 *
3496 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3497 *
3498 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3499 *
3500 * With namespace:
3501 *
3502 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3503 *
3504 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3505 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003506 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003507 */
3508
Daniel Veillard597f1c12005-07-03 23:00:18 +00003509static int
Owen Taylor3473f882001-02-23 17:55:21 +00003510htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003511 const xmlChar *name;
3512 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003513 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003514 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003515 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003516 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003517 int meta = 0;
3518 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003519 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003520
Daniel Veillarda03e3652004-11-02 18:45:30 +00003521 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3522 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3523 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003524 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003525 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003526 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003527 NEXT;
3528
Daniel Veillard30e76072006-03-09 14:13:55 +00003529 atts = ctxt->atts;
3530 maxatts = ctxt->maxatts;
3531
Owen Taylor3473f882001-02-23 17:55:21 +00003532 GROW;
3533 name = htmlParseHTMLName(ctxt);
3534 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003535 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3536 "htmlParseStartTag: invalid element name\n",
3537 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003538 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003539 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003540 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003541 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003542 }
3543 if (xmlStrEqual(name, BAD_CAST"meta"))
3544 meta = 1;
3545
3546 /*
3547 * Check for auto-closure of HTML elements.
3548 */
3549 htmlAutoClose(ctxt, name);
3550
3551 /*
3552 * Check for implied HTML elements.
3553 */
3554 htmlCheckImplied(ctxt, name);
3555
3556 /*
3557 * Avoid html at any level > 0, head at any level != 1
3558 * or any attempt to recurse body
3559 */
3560 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003561 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3562 "htmlParseStartTag: misplaced <html> tag\n",
3563 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003564 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003565 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003566 }
3567 if ((ctxt->nameNr != 1) &&
3568 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003569 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3570 "htmlParseStartTag: misplaced <head> tag\n",
3571 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003572 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003573 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003574 }
3575 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003576 int indx;
3577 for (indx = 0;indx < ctxt->nameNr;indx++) {
3578 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003579 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3580 "htmlParseStartTag: misplaced <body> tag\n",
3581 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003582 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003583 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003584 }
3585 }
3586 }
3587
3588 /*
3589 * Now parse the attributes, it ends up with the ending
3590 *
3591 * (S Attribute)* S?
3592 */
3593 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003594 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003595 (CUR != '>') &&
3596 ((CUR != '/') || (NXT(1) != '>'))) {
3597 long cons = ctxt->nbChars;
3598
3599 GROW;
3600 attname = htmlParseAttribute(ctxt, &attvalue);
3601 if (attname != NULL) {
3602
3603 /*
3604 * Well formedness requires at most one declaration of an attribute
3605 */
3606 for (i = 0; i < nbatts;i += 2) {
3607 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003608 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3609 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003610 if (attvalue != NULL)
3611 xmlFree(attvalue);
3612 goto failed;
3613 }
3614 }
3615
3616 /*
3617 * Add the pair to atts
3618 */
3619 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003620 maxatts = 22; /* allow for 10 attrs by default */
3621 atts = (const xmlChar **)
3622 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003623 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003624 htmlErrMemory(ctxt, NULL);
3625 if (attvalue != NULL)
3626 xmlFree(attvalue);
3627 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003628 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003629 ctxt->atts = atts;
3630 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003631 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003632 const xmlChar **n;
3633
Owen Taylor3473f882001-02-23 17:55:21 +00003634 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003635 n = (const xmlChar **) xmlRealloc((void *) atts,
3636 maxatts * sizeof(const xmlChar *));
3637 if (n == NULL) {
3638 htmlErrMemory(ctxt, NULL);
3639 if (attvalue != NULL)
3640 xmlFree(attvalue);
3641 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003642 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003643 atts = n;
3644 ctxt->atts = atts;
3645 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003646 }
3647 atts[nbatts++] = attname;
3648 atts[nbatts++] = attvalue;
3649 atts[nbatts] = NULL;
3650 atts[nbatts + 1] = NULL;
3651 }
3652 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003653 if (attvalue != NULL)
3654 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003655 /* Dump the bogus attribute string up to the next blank or
3656 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003657 while ((IS_CHAR_CH(CUR)) &&
3658 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003659 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003660 NEXT;
3661 }
3662
3663failed:
3664 SKIP_BLANKS;
3665 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003666 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3667 "htmlParseStartTag: problem parsing attributes\n",
3668 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003669 break;
3670 }
3671 }
3672
3673 /*
3674 * Handle specific association to the META tag
3675 */
William M. Bracke978ae22007-03-21 06:16:02 +00003676 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003677 htmlCheckMeta(ctxt, atts);
3678
3679 /*
3680 * SAX: Start of Element !
3681 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003682 if (!discardtag) {
3683 htmlnamePush(ctxt, name);
3684 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3685 if (nbatts != 0)
3686 ctxt->sax->startElement(ctxt->userData, name, atts);
3687 else
3688 ctxt->sax->startElement(ctxt->userData, name, NULL);
3689 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003690 }
Owen Taylor3473f882001-02-23 17:55:21 +00003691
3692 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003693 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003694 if (atts[i] != NULL)
3695 xmlFree((xmlChar *) atts[i]);
3696 }
Owen Taylor3473f882001-02-23 17:55:21 +00003697 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003698
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003699 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003700}
3701
3702/**
3703 * htmlParseEndTag:
3704 * @ctxt: an HTML parser context
3705 *
3706 * parse an end of tag
3707 *
3708 * [42] ETag ::= '</' Name S? '>'
3709 *
3710 * With namespace
3711 *
3712 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003713 *
3714 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003715 */
3716
Daniel Veillardf420ac52001-07-04 16:04:09 +00003717static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003718htmlParseEndTag(htmlParserCtxtPtr ctxt)
3719{
3720 const xmlChar *name;
3721 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003722 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003723
3724 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003725 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3726 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003727 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003728 }
3729 SKIP(2);
3730
3731 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003732 if (name == NULL)
3733 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003734 /*
3735 * We should definitely be at the ending "S? '>'" part
3736 */
3737 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003738 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003739 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3740 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003741 if (ctxt->recovery) {
3742 /*
3743 * We're not at the ending > !!
3744 * Error, unless in recover mode where we search forwards
3745 * until we find a >
3746 */
3747 while (CUR != '\0' && CUR != '>') NEXT;
3748 NEXT;
3749 }
Owen Taylor3473f882001-02-23 17:55:21 +00003750 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003751 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003752
3753 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003754 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3755 * out now.
3756 */
3757 if ((ctxt->depth > 0) &&
3758 (xmlStrEqual(name, BAD_CAST "html") ||
3759 xmlStrEqual(name, BAD_CAST "body") ||
3760 xmlStrEqual(name, BAD_CAST "head"))) {
3761 ctxt->depth--;
3762 return (0);
3763 }
3764
3765 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003766 * If the name read is not one of the element in the parsing stack
3767 * then return, it's just an error.
3768 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003769 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3770 if (xmlStrEqual(name, ctxt->nameTab[i]))
3771 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003772 }
3773 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003774 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3775 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003776 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003777 }
3778
3779
3780 /*
3781 * Check for auto-closure of HTML elements.
3782 */
3783
3784 htmlAutoCloseOnClose(ctxt, name);
3785
3786 /*
3787 * Well formedness constraints, opening and closing must match.
3788 * With the exception that the autoclose may have popped stuff out
3789 * of the stack.
3790 */
3791 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003792 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003793 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3794 "Opening and ending tag mismatch: %s and %s\n",
3795 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003796 }
3797 }
3798
3799 /*
3800 * SAX: End of Tag
3801 */
3802 oldname = ctxt->name;
3803 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003804 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3805 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003806 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003807 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003808 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003809 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003810 }
3811
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003812 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003813}
3814
3815
3816/**
3817 * htmlParseReference:
3818 * @ctxt: an HTML parser context
3819 *
3820 * parse and handle entity references in content,
3821 * this will end-up in a call to character() since this is either a
3822 * CharRef, or a predefined entity.
3823 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003824static void
Owen Taylor3473f882001-02-23 17:55:21 +00003825htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003826 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003827 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003828 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003829 if (CUR != '&') return;
3830
3831 if (NXT(1) == '#') {
3832 unsigned int c;
3833 int bits, i = 0;
3834
3835 c = htmlParseCharRef(ctxt);
3836 if (c == 0)
3837 return;
3838
3839 if (c < 0x80) { out[i++]= c; bits= -6; }
3840 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3841 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3842 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3843
3844 for ( ; bits >= 0; bits-= 6) {
3845 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3846 }
3847 out[i] = 0;
3848
3849 htmlCheckParagraph(ctxt);
3850 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3851 ctxt->sax->characters(ctxt->userData, out, i);
3852 } else {
3853 ent = htmlParseEntityRef(ctxt, &name);
3854 if (name == NULL) {
3855 htmlCheckParagraph(ctxt);
3856 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3857 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3858 return;
3859 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003860 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003861 htmlCheckParagraph(ctxt);
3862 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3863 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3864 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3865 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3866 }
3867 } else {
3868 unsigned int c;
3869 int bits, i = 0;
3870
3871 c = ent->value;
3872 if (c < 0x80)
3873 { out[i++]= c; bits= -6; }
3874 else if (c < 0x800)
3875 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3876 else if (c < 0x10000)
3877 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3878 else
3879 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3880
3881 for ( ; bits >= 0; bits-= 6) {
3882 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3883 }
3884 out[i] = 0;
3885
3886 htmlCheckParagraph(ctxt);
3887 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3888 ctxt->sax->characters(ctxt->userData, out, i);
3889 }
Owen Taylor3473f882001-02-23 17:55:21 +00003890 }
3891}
3892
3893/**
3894 * htmlParseContent:
3895 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003896 *
3897 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003898 */
3899
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003900static void
Owen Taylor3473f882001-02-23 17:55:21 +00003901htmlParseContent(htmlParserCtxtPtr ctxt) {
3902 xmlChar *currentNode;
3903 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003904 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003905
3906 currentNode = xmlStrdup(ctxt->name);
3907 depth = ctxt->nameNr;
3908 while (1) {
3909 long cons = ctxt->nbChars;
3910
3911 GROW;
3912 /*
3913 * Our tag or one of it's parent or children is ending.
3914 */
3915 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003916 if (htmlParseEndTag(ctxt) &&
3917 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3918 if (currentNode != NULL)
3919 xmlFree(currentNode);
3920 return;
3921 }
3922 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003923 }
3924
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003925 else if ((CUR == '<') &&
3926 ((IS_ASCII_LETTER(NXT(1))) ||
3927 (NXT(1) == '_') || (NXT(1) == ':'))) {
3928 name = htmlParseHTMLName_nonInvasive(ctxt);
3929 if (name == NULL) {
3930 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3931 "htmlParseStartTag: invalid element name\n",
3932 NULL, NULL);
3933 /* Dump the bogus tag like browsers do */
3934 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3935 NEXT;
3936
3937 if (currentNode != NULL)
3938 xmlFree(currentNode);
3939 return;
3940 }
3941
3942 if (ctxt->name != NULL) {
3943 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3944 htmlAutoClose(ctxt, name);
3945 continue;
3946 }
3947 }
3948 }
3949
Owen Taylor3473f882001-02-23 17:55:21 +00003950 /*
3951 * Has this node been popped out during parsing of
3952 * the next element
3953 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003954 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3955 (!xmlStrEqual(currentNode, ctxt->name)))
3956 {
Owen Taylor3473f882001-02-23 17:55:21 +00003957 if (currentNode != NULL) xmlFree(currentNode);
3958 return;
3959 }
3960
Daniel Veillardf9533d12001-03-03 10:04:57 +00003961 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3962 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003963 /*
3964 * Handle SCRIPT/STYLE separately
3965 */
3966 htmlParseScript(ctxt);
3967 } else {
3968 /*
3969 * Sometimes DOCTYPE arrives in the middle of the document
3970 */
3971 if ((CUR == '<') && (NXT(1) == '!') &&
3972 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3973 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3974 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3975 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003976 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3977 "Misplaced DOCTYPE declaration\n",
3978 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003979 htmlParseDocTypeDecl(ctxt);
3980 }
3981
3982 /*
3983 * First case : a comment
3984 */
3985 if ((CUR == '<') && (NXT(1) == '!') &&
3986 (NXT(2) == '-') && (NXT(3) == '-')) {
3987 htmlParseComment(ctxt);
3988 }
3989
3990 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003991 * Second case : a Processing Instruction.
3992 */
3993 else if ((CUR == '<') && (NXT(1) == '?')) {
3994 htmlParsePI(ctxt);
3995 }
3996
3997 /*
3998 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00003999 */
4000 else if (CUR == '<') {
4001 htmlParseElement(ctxt);
4002 }
4003
4004 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004005 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00004006 * parsing returns it's Name, create the node
4007 */
4008 else if (CUR == '&') {
4009 htmlParseReference(ctxt);
4010 }
4011
4012 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004013 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004014 */
4015 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004016 htmlAutoCloseOnEnd(ctxt);
4017 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004018 }
4019
4020 /*
4021 * Last case, text. Note that References are handled directly.
4022 */
4023 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004024 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004025 }
4026
4027 if (cons == ctxt->nbChars) {
4028 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004029 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4030 "detected an error in element content\n",
4031 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004032 }
4033 break;
4034 }
4035 }
4036 GROW;
4037 }
4038 if (currentNode != NULL) xmlFree(currentNode);
4039}
4040
4041/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004042 * htmlParseContent:
4043 * @ctxt: an HTML parser context
4044 *
4045 * Parse a content: comment, sub-element, reference or text.
4046 */
4047
4048void
4049__htmlParseContent(void *ctxt) {
4050 if (ctxt != NULL)
4051 htmlParseContent((htmlParserCtxtPtr) ctxt);
4052}
4053
4054/**
Owen Taylor3473f882001-02-23 17:55:21 +00004055 * htmlParseElement:
4056 * @ctxt: an HTML parser context
4057 *
4058 * parse an HTML element, this is highly recursive
4059 *
4060 * [39] element ::= EmptyElemTag | STag content ETag
4061 *
4062 * [41] Attribute ::= Name Eq AttValue
4063 */
4064
4065void
4066htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004067 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004068 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004069 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004070 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004071 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004072 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004073 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004074
Daniel Veillarda03e3652004-11-02 18:45:30 +00004075 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4076 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004077 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004078 return;
4079 }
Owen Taylor3473f882001-02-23 17:55:21 +00004080 /* Capture start position */
4081 if (ctxt->record_info) {
4082 node_info.begin_pos = ctxt->input->consumed +
4083 (CUR_PTR - ctxt->input->base);
4084 node_info.begin_line = ctxt->input->line;
4085 }
4086
Daniel Veillard597f1c12005-07-03 23:00:18 +00004087 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004088 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004089 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004090 if (CUR == '>')
4091 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004092 return;
4093 }
Owen Taylor3473f882001-02-23 17:55:21 +00004094
4095 /*
4096 * Lookup the info for that element.
4097 */
4098 info = htmlTagLookup(name);
4099 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004100 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4101 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004102 }
4103
4104 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004105 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004106 */
4107 if ((CUR == '/') && (NXT(1) == '>')) {
4108 SKIP(2);
4109 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4110 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004111 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004112 return;
4113 }
4114
4115 if (CUR == '>') {
4116 NEXT;
4117 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004118 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4119 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004120
4121 /*
4122 * end of parsing of this node.
4123 */
4124 if (xmlStrEqual(name, ctxt->name)) {
4125 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004126 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004127 }
4128
4129 /*
4130 * Capture end position and add node
4131 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004132 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004133 node_info.end_pos = ctxt->input->consumed +
4134 (CUR_PTR - ctxt->input->base);
4135 node_info.end_line = ctxt->input->line;
4136 node_info.node = ctxt->node;
4137 xmlParserAddNodeInfo(ctxt, &node_info);
4138 }
4139 return;
4140 }
4141
4142 /*
4143 * Check for an Empty Element from DTD definition
4144 */
4145 if ((info != NULL) && (info->empty)) {
4146 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4147 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004148 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004149 return;
4150 }
4151
4152 /*
4153 * Parse the content of the element:
4154 */
4155 currentNode = xmlStrdup(ctxt->name);
4156 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004157 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004158 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004159 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004160 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004161 if (ctxt->nameNr < depth) break;
4162 }
4163
Owen Taylor3473f882001-02-23 17:55:21 +00004164 /*
4165 * Capture end position and add node
4166 */
4167 if ( currentNode != NULL && ctxt->record_info ) {
4168 node_info.end_pos = ctxt->input->consumed +
4169 (CUR_PTR - ctxt->input->base);
4170 node_info.end_line = ctxt->input->line;
4171 node_info.node = ctxt->node;
4172 xmlParserAddNodeInfo(ctxt, &node_info);
4173 }
William M. Brack76e95df2003-10-18 16:20:14 +00004174 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004175 htmlAutoCloseOnEnd(ctxt);
4176 }
4177
Owen Taylor3473f882001-02-23 17:55:21 +00004178 if (currentNode != NULL)
4179 xmlFree(currentNode);
4180}
4181
4182/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004183 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004184 * @ctxt: an HTML parser context
4185 *
4186 * parse an HTML document (and build a tree if using the standard SAX
4187 * interface).
4188 *
4189 * Returns 0, -1 in case of error. the parser context is augmented
4190 * as a result of the parsing.
4191 */
4192
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004193int
Owen Taylor3473f882001-02-23 17:55:21 +00004194htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004195 xmlChar start[4];
4196 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004197 xmlDtdPtr dtd;
4198
Daniel Veillardd0463562001-10-13 09:15:48 +00004199 xmlInitParser();
4200
Owen Taylor3473f882001-02-23 17:55:21 +00004201 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004202
Daniel Veillarda03e3652004-11-02 18:45:30 +00004203 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4204 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4205 "htmlParseDocument: context error\n", NULL, NULL);
4206 return(XML_ERR_INTERNAL_ERROR);
4207 }
4208 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004209 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004210 GROW;
4211 /*
4212 * SAX: beginning of the document processing.
4213 */
4214 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4215 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4216
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004217 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4218 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4219 /*
4220 * Get the 4 first bytes and decode the charset
4221 * if enc != XML_CHAR_ENCODING_NONE
4222 * plug some encoding conversion routines.
4223 */
4224 start[0] = RAW;
4225 start[1] = NXT(1);
4226 start[2] = NXT(2);
4227 start[3] = NXT(3);
4228 enc = xmlDetectCharEncoding(&start[0], 4);
4229 if (enc != XML_CHAR_ENCODING_NONE) {
4230 xmlSwitchEncoding(ctxt, enc);
4231 }
4232 }
4233
Owen Taylor3473f882001-02-23 17:55:21 +00004234 /*
4235 * Wipe out everything which is before the first '<'
4236 */
4237 SKIP_BLANKS;
4238 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004239 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4240 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004241 }
4242
4243 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4244 ctxt->sax->startDocument(ctxt->userData);
4245
4246
4247 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004248 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004249 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004250 while (((CUR == '<') && (NXT(1) == '!') &&
4251 (NXT(2) == '-') && (NXT(3) == '-')) ||
4252 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004253 htmlParseComment(ctxt);
4254 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004255 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004256 }
Owen Taylor3473f882001-02-23 17:55:21 +00004257
4258
4259 /*
4260 * Then possibly doc type declaration(s) and more Misc
4261 * (doctypedecl Misc*)?
4262 */
4263 if ((CUR == '<') && (NXT(1) == '!') &&
4264 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4265 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4266 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4267 (UPP(8) == 'E')) {
4268 htmlParseDocTypeDecl(ctxt);
4269 }
4270 SKIP_BLANKS;
4271
4272 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004273 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004274 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004275 while (((CUR == '<') && (NXT(1) == '!') &&
4276 (NXT(2) == '-') && (NXT(3) == '-')) ||
4277 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004278 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004279 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004280 SKIP_BLANKS;
4281 }
4282
4283 /*
4284 * Time to start parsing the tree itself
4285 */
4286 htmlParseContent(ctxt);
4287
4288 /*
4289 * autoclose
4290 */
4291 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004292 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004293
4294
4295 /*
4296 * SAX: end of the document processing.
4297 */
4298 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4299 ctxt->sax->endDocument(ctxt->userData);
4300
4301 if (ctxt->myDoc != NULL) {
4302 dtd = xmlGetIntSubset(ctxt->myDoc);
4303 if (dtd == NULL)
4304 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004305 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004306 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4307 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4308 }
4309 if (! ctxt->wellFormed) return(-1);
4310 return(0);
4311}
4312
4313
4314/************************************************************************
4315 * *
4316 * Parser contexts handling *
4317 * *
4318 ************************************************************************/
4319
4320/**
William M. Brackedb65a72004-02-06 07:36:04 +00004321 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004322 * @ctxt: an HTML parser context
4323 *
4324 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004325 *
4326 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004327 */
4328
Daniel Veillardf403d292003-10-05 13:51:35 +00004329static int
Owen Taylor3473f882001-02-23 17:55:21 +00004330htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4331{
4332 htmlSAXHandler *sax;
4333
Daniel Veillardf403d292003-10-05 13:51:35 +00004334 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004335 memset(ctxt, 0, sizeof(htmlParserCtxt));
4336
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004337 ctxt->dict = xmlDictCreate();
4338 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004339 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4340 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004341 }
Owen Taylor3473f882001-02-23 17:55:21 +00004342 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4343 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004344 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4345 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004346 }
4347 else
4348 memset(sax, 0, sizeof(htmlSAXHandler));
4349
4350 /* Allocate the Input stack */
4351 ctxt->inputTab = (htmlParserInputPtr *)
4352 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4353 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004354 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004355 ctxt->inputNr = 0;
4356 ctxt->inputMax = 0;
4357 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004358 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004359 }
4360 ctxt->inputNr = 0;
4361 ctxt->inputMax = 5;
4362 ctxt->input = NULL;
4363 ctxt->version = NULL;
4364 ctxt->encoding = NULL;
4365 ctxt->standalone = -1;
4366 ctxt->instate = XML_PARSER_START;
4367
4368 /* Allocate the Node stack */
4369 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4370 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004371 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004372 ctxt->nodeNr = 0;
4373 ctxt->nodeMax = 0;
4374 ctxt->node = NULL;
4375 ctxt->inputNr = 0;
4376 ctxt->inputMax = 0;
4377 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004378 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004379 }
4380 ctxt->nodeNr = 0;
4381 ctxt->nodeMax = 10;
4382 ctxt->node = NULL;
4383
4384 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004385 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004386 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004387 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004388 ctxt->nameNr = 0;
4389 ctxt->nameMax = 10;
4390 ctxt->name = NULL;
4391 ctxt->nodeNr = 0;
4392 ctxt->nodeMax = 0;
4393 ctxt->node = NULL;
4394 ctxt->inputNr = 0;
4395 ctxt->inputMax = 0;
4396 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004397 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004398 }
4399 ctxt->nameNr = 0;
4400 ctxt->nameMax = 10;
4401 ctxt->name = NULL;
4402
Daniel Veillard092643b2003-09-25 14:29:29 +00004403 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004404 else {
4405 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004406 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004407 }
4408 ctxt->userData = ctxt;
4409 ctxt->myDoc = NULL;
4410 ctxt->wellFormed = 1;
4411 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004412 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004413 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004414 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004415 ctxt->vctxt.userData = ctxt;
4416 ctxt->vctxt.error = xmlParserValidityError;
4417 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004418 ctxt->record_info = 0;
4419 ctxt->validate = 0;
4420 ctxt->nbChars = 0;
4421 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004422 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004423 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004424 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004425}
4426
4427/**
4428 * htmlFreeParserCtxt:
4429 * @ctxt: an HTML parser context
4430 *
4431 * Free all the memory used by a parser context. However the parsed
4432 * document in ctxt->myDoc is not freed.
4433 */
4434
4435void
4436htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4437{
4438 xmlFreeParserCtxt(ctxt);
4439}
4440
4441/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004442 * htmlNewParserCtxt:
4443 *
4444 * Allocate and initialize a new parser context.
4445 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004446 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004447 */
4448
Daniel Veillard34c647c2006-09-21 06:53:59 +00004449htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004450htmlNewParserCtxt(void)
4451{
4452 xmlParserCtxtPtr ctxt;
4453
4454 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4455 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004456 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004457 return(NULL);
4458 }
4459 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004460 if (htmlInitParserCtxt(ctxt) < 0) {
4461 htmlFreeParserCtxt(ctxt);
4462 return(NULL);
4463 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004464 return(ctxt);
4465}
4466
4467/**
4468 * htmlCreateMemoryParserCtxt:
4469 * @buffer: a pointer to a char array
4470 * @size: the size of the array
4471 *
4472 * Create a parser context for an HTML in-memory document.
4473 *
4474 * Returns the new parser context or NULL
4475 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004476htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004477htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4478 xmlParserCtxtPtr ctxt;
4479 xmlParserInputPtr input;
4480 xmlParserInputBufferPtr buf;
4481
4482 if (buffer == NULL)
4483 return(NULL);
4484 if (size <= 0)
4485 return(NULL);
4486
4487 ctxt = htmlNewParserCtxt();
4488 if (ctxt == NULL)
4489 return(NULL);
4490
4491 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4492 if (buf == NULL) return(NULL);
4493
4494 input = xmlNewInputStream(ctxt);
4495 if (input == NULL) {
4496 xmlFreeParserCtxt(ctxt);
4497 return(NULL);
4498 }
4499
4500 input->filename = NULL;
4501 input->buf = buf;
4502 input->base = input->buf->buffer->content;
4503 input->cur = input->buf->buffer->content;
4504 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4505
4506 inputPush(ctxt, input);
4507 return(ctxt);
4508}
4509
4510/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004511 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004512 * @cur: a pointer to an array of xmlChar
4513 * @encoding: a free form C string describing the HTML document encoding, or NULL
4514 *
4515 * Create a parser context for an HTML document.
4516 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004517 * TODO: check the need to add encoding handling there
4518 *
Owen Taylor3473f882001-02-23 17:55:21 +00004519 * Returns the new parser context or NULL
4520 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004521static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004522htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004523 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004524 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004525
Daniel Veillard1d995272002-07-22 16:43:32 +00004526 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004527 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004528 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004529 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004530 if (ctxt == NULL)
4531 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004532
4533 if (encoding != NULL) {
4534 xmlCharEncoding enc;
4535 xmlCharEncodingHandlerPtr handler;
4536
4537 if (ctxt->input->encoding != NULL)
4538 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004539 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004540
4541 enc = xmlParseCharEncoding(encoding);
4542 /*
4543 * registered set of known encodings
4544 */
4545 if (enc != XML_CHAR_ENCODING_ERROR) {
4546 xmlSwitchEncoding(ctxt, enc);
4547 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004548 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4549 "Unsupported encoding %s\n",
4550 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004551 }
4552 } else {
4553 /*
4554 * fallback for unknown encodings
4555 */
4556 handler = xmlFindCharEncodingHandler((const char *) encoding);
4557 if (handler != NULL) {
4558 xmlSwitchToEncoding(ctxt, handler);
4559 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004560 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4561 "Unsupported encoding %s\n",
4562 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004563 }
4564 }
4565 }
4566 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004567}
4568
Daniel Veillard73b013f2003-09-30 12:36:01 +00004569#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004570/************************************************************************
4571 * *
4572 * Progressive parsing interfaces *
4573 * *
4574 ************************************************************************/
4575
4576/**
4577 * htmlParseLookupSequence:
4578 * @ctxt: an HTML parser context
4579 * @first: the first char to lookup
4580 * @next: the next char to lookup or zero
4581 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004582 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004583 *
4584 * Try to find if a sequence (first, next, third) or just (first next) or
4585 * (first) is available in the input stream.
4586 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4587 * to avoid rescanning sequences of bytes, it DOES change the state of the
4588 * parser, do not use liberally.
4589 * This is basically similar to xmlParseLookupSequence()
4590 *
4591 * Returns the index to the current parsing point if the full sequence
4592 * is available, -1 otherwise.
4593 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004594static int
Owen Taylor3473f882001-02-23 17:55:21 +00004595htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004596 xmlChar next, xmlChar third, int iscomment,
4597 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004598 int base, len;
4599 htmlParserInputPtr in;
4600 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004601 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004602 int invalue = 0;
4603 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004604
4605 in = ctxt->input;
4606 if (in == NULL) return(-1);
4607 base = in->cur - in->base;
4608 if (base < 0) return(-1);
4609 if (ctxt->checkIndex > base)
4610 base = ctxt->checkIndex;
4611 if (in->buf == NULL) {
4612 buf = in->base;
4613 len = in->length;
4614 } else {
4615 buf = in->buf->buffer->content;
4616 len = in->buf->buffer->use;
4617 }
4618 /* take into account the sequence length */
4619 if (third) len -= 2;
4620 else if (next) len --;
4621 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004622 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004623 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4624 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4625 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004626 /* do not increment past <! - some people use <!--> */
4627 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004628 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004629 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004630 if (ignoreattrval) {
4631 if (buf[base] == '"' || buf[base] == '\'') {
4632 if (invalue) {
4633 if (buf[base] == valdellim) {
4634 invalue = 0;
4635 continue;
4636 }
4637 } else {
4638 valdellim = buf[base];
4639 invalue = 1;
4640 continue;
4641 }
4642 } else if (invalue) {
4643 continue;
4644 }
4645 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004646 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004647 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004648 return(-1);
4649 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4650 (buf[base + 2] == '>')) {
4651 incomment = 0;
4652 base += 2;
4653 }
4654 continue;
4655 }
Owen Taylor3473f882001-02-23 17:55:21 +00004656 if (buf[base] == first) {
4657 if (third != 0) {
4658 if ((buf[base + 1] != next) ||
4659 (buf[base + 2] != third)) continue;
4660 } else if (next != 0) {
4661 if (buf[base + 1] != next) continue;
4662 }
4663 ctxt->checkIndex = 0;
4664#ifdef DEBUG_PUSH
4665 if (next == 0)
4666 xmlGenericError(xmlGenericErrorContext,
4667 "HPP: lookup '%c' found at %d\n",
4668 first, base);
4669 else if (third == 0)
4670 xmlGenericError(xmlGenericErrorContext,
4671 "HPP: lookup '%c%c' found at %d\n",
4672 first, next, base);
4673 else
4674 xmlGenericError(xmlGenericErrorContext,
4675 "HPP: lookup '%c%c%c' found at %d\n",
4676 first, next, third, base);
4677#endif
4678 return(base - (in->cur - in->base));
4679 }
4680 }
4681 ctxt->checkIndex = base;
4682#ifdef DEBUG_PUSH
4683 if (next == 0)
4684 xmlGenericError(xmlGenericErrorContext,
4685 "HPP: lookup '%c' failed\n", first);
4686 else if (third == 0)
4687 xmlGenericError(xmlGenericErrorContext,
4688 "HPP: lookup '%c%c' failed\n", first, next);
4689 else
4690 xmlGenericError(xmlGenericErrorContext,
4691 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4692#endif
4693 return(-1);
4694}
4695
4696/**
 * htmlParseTryOrFinish:
 * @ctxt: an HTML parser context
 * @terminate: last chunk indicator
 *
 * Try to progress on parsing with the data currently available in
 * ctxt->input. The function loops, dispatching on ctxt->instate; each
 * state either consumes a construct and selects the next state, or
 * jumps to the "done" label when more data is needed. When @terminate
 * is zero, most constructs are only parsed once
 * htmlParseLookupSequence() reports that their closing sequence is
 * already buffered. On exit, a default HTML 4.0 Transitional internal
 * subset is created for ctxt->myDoc if none exists and parsing is
 * terminating, at EOF or in the epilog.
 *
 * Returns the local variable ret, which is initialized to zero and never
 * modified in this function, so the result is currently always 0.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004705static int
Owen Taylor3473f882001-02-23 17:55:21 +00004706htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4707 int ret = 0;
4708 htmlParserInputPtr in;
4709 int avail = 0;
4710 xmlChar cur, next;
4711
 /* Debug trace only: report the parser state on entry. */
4712#ifdef DEBUG_PUSH
4713 switch (ctxt->instate) {
4714 case XML_PARSER_EOF:
4715 xmlGenericError(xmlGenericErrorContext,
4716 "HPP: try EOF\n"); break;
4717 case XML_PARSER_START:
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: try START\n"); break;
4720 case XML_PARSER_MISC:
4721 xmlGenericError(xmlGenericErrorContext,
4722 "HPP: try MISC\n");break;
4723 case XML_PARSER_COMMENT:
4724 xmlGenericError(xmlGenericErrorContext,
4725 "HPP: try COMMENT\n");break;
4726 case XML_PARSER_PROLOG:
4727 xmlGenericError(xmlGenericErrorContext,
4728 "HPP: try PROLOG\n");break;
4729 case XML_PARSER_START_TAG:
4730 xmlGenericError(xmlGenericErrorContext,
4731 "HPP: try START_TAG\n");break;
4732 case XML_PARSER_CONTENT:
4733 xmlGenericError(xmlGenericErrorContext,
4734 "HPP: try CONTENT\n");break;
4735 case XML_PARSER_CDATA_SECTION:
4736 xmlGenericError(xmlGenericErrorContext,
4737 "HPP: try CDATA_SECTION\n");break;
4738 case XML_PARSER_END_TAG:
4739 xmlGenericError(xmlGenericErrorContext,
4740 "HPP: try END_TAG\n");break;
4741 case XML_PARSER_ENTITY_DECL:
4742 xmlGenericError(xmlGenericErrorContext,
4743 "HPP: try ENTITY_DECL\n");break;
4744 case XML_PARSER_ENTITY_VALUE:
4745 xmlGenericError(xmlGenericErrorContext,
4746 "HPP: try ENTITY_VALUE\n");break;
4747 case XML_PARSER_ATTRIBUTE_VALUE:
4748 xmlGenericError(xmlGenericErrorContext,
4749 "HPP: try ATTRIBUTE_VALUE\n");break;
4750 case XML_PARSER_DTD:
4751 xmlGenericError(xmlGenericErrorContext,
4752 "HPP: try DTD\n");break;
4753 case XML_PARSER_EPILOG:
4754 xmlGenericError(xmlGenericErrorContext,
4755 "HPP: try EPILOG\n");break;
4756 case XML_PARSER_PI:
4757 xmlGenericError(xmlGenericErrorContext,
4758 "HPP: try PI\n");break;
4759 case XML_PARSER_SYSTEM_LITERAL:
4760 xmlGenericError(xmlGenericErrorContext,
4761 "HPP: try SYSTEM_LITERAL\n");break;
4762 }
4763#endif
4764
4765 while (1) {
4766
4767 in = ctxt->input;
4768 if (in == NULL) break;
 /*
  * Number of unread bytes: taken from the input's own length when
  * there is no input buffer, from the buffer's use count otherwise.
  */
4769 if (in->buf == NULL)
4770 avail = in->length - (in->cur - in->base);
4771 else
4772 avail = in->buf->buffer->use - (in->cur - in->base);
 /* Input exhausted on the last chunk: auto-close and end the document. */
4773 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004774 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004775 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4776 /*
4777 * SAX: end of the document processing.
4778 */
4779 ctxt->instate = XML_PARSER_EOF;
4780 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4781 ctxt->sax->endDocument(ctxt->userData);
4782 }
4783 }
4784 if (avail < 1)
4785 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004786 cur = in->cur[0];
 /*
  * A zero byte in the input is skipped rather than parsed.
  * NOTE(review): SKIP is a macro defined outside this chunk;
  * presumably it advances the input by the given count - confirm.
  */
4787 if (cur == 0) {
4788 SKIP(1);
4789 continue;
4790 }
4791
Owen Taylor3473f882001-02-23 17:55:21 +00004792 switch (ctxt->instate) {
4793 case XML_PARSER_EOF:
4794 /*
4795 * Document parsing is done !
4796 */
4797 goto done;
4798 case XML_PARSER_START:
4799 /*
4800 * Very first chars read from the document flow.
4801 */
4802 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004803 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004804 SKIP_BLANKS;
 /* input position moved: refresh the available byte count */
4805 if (in->buf == NULL)
4806 avail = in->length - (in->cur - in->base);
4807 else
4808 avail = in->buf->buffer->use - (in->cur - in->base);
4809 }
 /* SAX: hand out the locator, then signal the start of the document */
4810 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4811 ctxt->sax->setDocumentLocator(ctxt->userData,
4812 &xmlDefaultSAXLocator);
4813 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4814 (!ctxt->disableSAX))
4815 ctxt->sax->startDocument(ctxt->userData);
4816
4817 cur = in->cur[0];
4818 next = in->cur[1];
 /*
  * Look for "<!DOCTYPE".
  * NOTE(review): UPP is a macro defined outside this chunk; presumably
  * it yields the upper-cased byte at the given offset - confirm.
  */
4819 if ((cur == '<') && (next == '!') &&
4820 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4821 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4822 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4823 (UPP(8) == 'E')) {
 /* unless this is the last chunk, wait until a '>' is buffered */
4824 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004825 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004826 goto done;
4827#ifdef DEBUG_PUSH
4828 xmlGenericError(xmlGenericErrorContext,
4829 "HPP: Parsing internal subset\n");
4830#endif
4831 htmlParseDocTypeDecl(ctxt);
4832 ctxt->instate = XML_PARSER_PROLOG;
4833#ifdef DEBUG_PUSH
4834 xmlGenericError(xmlGenericErrorContext,
4835 "HPP: entering PROLOG\n");
4836#endif
4837 } else {
4838 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004839#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004840 xmlGenericError(xmlGenericErrorContext,
4841 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004842#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004843 }
Owen Taylor3473f882001-02-23 17:55:21 +00004844 break;
 /*
  * MISC: before any DOCTYPE; accepts comments, PIs and the DOCTYPE,
  * anything else switches to START_TAG.
  */
4845 case XML_PARSER_MISC:
4846 SKIP_BLANKS;
4847 if (in->buf == NULL)
4848 avail = in->length - (in->cur - in->base);
4849 else
4850 avail = in->buf->buffer->use - (in->cur - in->base);
4851 if (avail < 2)
4852 goto done;
4853 cur = in->cur[0];
4854 next = in->cur[1];
4855 if ((cur == '<') && (next == '!') &&
4856 (in->cur[2] == '-') && (in->cur[3] == '-')) {
 /* comment: unless terminating, wait until "-->" is buffered */
4857 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004858 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004859 goto done;
4860#ifdef DEBUG_PUSH
4861 xmlGenericError(xmlGenericErrorContext,
4862 "HPP: Parsing Comment\n");
4863#endif
4864 htmlParseComment(ctxt);
4865 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004866 } else if ((cur == '<') && (next == '?')) {
 /* processing instruction: unless terminating, wait for a '>' */
4867 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004868 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004869 goto done;
4870#ifdef DEBUG_PUSH
4871 xmlGenericError(xmlGenericErrorContext,
4872 "HPP: Parsing PI\n");
4873#endif
4874 htmlParsePI(ctxt);
4875 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004876 } else if ((cur == '<') && (next == '!') &&
4877 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4878 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4879 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4880 (UPP(8) == 'E')) {
4881 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004882 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004883 goto done;
4884#ifdef DEBUG_PUSH
4885 xmlGenericError(xmlGenericErrorContext,
4886 "HPP: Parsing internal subset\n");
4887#endif
4888 htmlParseDocTypeDecl(ctxt);
4889 ctxt->instate = XML_PARSER_PROLOG;
4890#ifdef DEBUG_PUSH
4891 xmlGenericError(xmlGenericErrorContext,
4892 "HPP: entering PROLOG\n");
4893#endif
4894 } else if ((cur == '<') && (next == '!') &&
4895 (avail < 9)) {
 /* "<!" seen but fewer than 9 bytes buffered: wait for more data */
4896 goto done;
4897 } else {
4898 ctxt->instate = XML_PARSER_START_TAG;
4899#ifdef DEBUG_PUSH
4900 xmlGenericError(xmlGenericErrorContext,
4901 "HPP: entering START_TAG\n");
4902#endif
4903 }
4904 break;
 /*
  * PROLOG: after the DOCTYPE; accepts comments and PIs, anything else
  * switches to START_TAG.
  */
4905 case XML_PARSER_PROLOG:
4906 SKIP_BLANKS;
4907 if (in->buf == NULL)
4908 avail = in->length - (in->cur - in->base);
4909 else
4910 avail = in->buf->buffer->use - (in->cur - in->base);
4911 if (avail < 2)
4912 goto done;
4913 cur = in->cur[0];
4914 next = in->cur[1];
4915 if ((cur == '<') && (next == '!') &&
4916 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4917 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004918 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004919 goto done;
4920#ifdef DEBUG_PUSH
4921 xmlGenericError(xmlGenericErrorContext,
4922 "HPP: Parsing Comment\n");
4923#endif
4924 htmlParseComment(ctxt);
4925 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004926 } else if ((cur == '<') && (next == '?')) {
4927 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004928 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004929 goto done;
4930#ifdef DEBUG_PUSH
4931 xmlGenericError(xmlGenericErrorContext,
4932 "HPP: Parsing PI\n");
4933#endif
4934 htmlParsePI(ctxt);
4935 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004936 } else if ((cur == '<') && (next == '!') &&
4937 (avail < 4)) {
 /* "<!" seen but fewer than 4 bytes buffered: wait for more data */
4938 goto done;
4939 } else {
4940 ctxt->instate = XML_PARSER_START_TAG;
4941#ifdef DEBUG_PUSH
4942 xmlGenericError(xmlGenericErrorContext,
4943 "HPP: entering START_TAG\n");
4944#endif
4945 }
4946 break;
 /*
  * EPILOG: entered from END_TAG when the element name stack is empty;
  * accepts blanks, comments and PIs, anything else ends the document
  * with an error.
  */
4947 case XML_PARSER_EPILOG:
4948 if (in->buf == NULL)
4949 avail = in->length - (in->cur - in->base);
4950 else
4951 avail = in->buf->buffer->use - (in->cur - in->base);
4952 if (avail < 1)
4953 goto done;
4954 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004955 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004956 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004957 goto done;
4958 }
4959 if (avail < 2)
4960 goto done;
4961 next = in->cur[1];
4962 if ((cur == '<') && (next == '!') &&
4963 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4964 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004965 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004966 goto done;
4967#ifdef DEBUG_PUSH
4968 xmlGenericError(xmlGenericErrorContext,
4969 "HPP: Parsing Comment\n");
4970#endif
4971 htmlParseComment(ctxt);
4972 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004973 } else if ((cur == '<') && (next == '?')) {
4974 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004975 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004976 goto done;
4977#ifdef DEBUG_PUSH
4978 xmlGenericError(xmlGenericErrorContext,
4979 "HPP: Parsing PI\n");
4980#endif
4981 htmlParsePI(ctxt);
4982 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004983 } else if ((cur == '<') && (next == '!') &&
4984 (avail < 4)) {
4985 goto done;
4986 } else {
 /* extra content after the end of the document: flag it and stop */
4987 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004988 ctxt->wellFormed = 0;
4989 ctxt->instate = XML_PARSER_EOF;
4990#ifdef DEBUG_PUSH
4991 xmlGenericError(xmlGenericErrorContext,
4992 "HPP: entering EOF\n");
4993#endif
4994 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4995 ctxt->sax->endDocument(ctxt->userData);
4996 goto done;
4997 }
4998 break;
4999 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005000 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005001 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005002 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005003
5004 if (avail < 2)
5005 goto done;
5006 cur = in->cur[0];
 /* not a tag at all: treat as content */
5007 if (cur != '<') {
5008 ctxt->instate = XML_PARSER_CONTENT;
5009#ifdef DEBUG_PUSH
5010 xmlGenericError(xmlGenericErrorContext,
5011 "HPP: entering CONTENT\n");
5012#endif
5013 break;
5014 }
 /* "</" starts an end tag, handled in its own state */
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005015 if (in->cur[1] == '/') {
5016 ctxt->instate = XML_PARSER_END_TAG;
5017 ctxt->checkIndex = 0;
5018#ifdef DEBUG_PUSH
5019 xmlGenericError(xmlGenericErrorContext,
5020 "HPP: entering END_TAG\n");
5021#endif
5022 break;
5023 }
 /* unless terminating, wait until the closing '>' is buffered */
Owen Taylor3473f882001-02-23 17:55:21 +00005024 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005025 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005026 goto done;
5027
Daniel Veillard597f1c12005-07-03 23:00:18 +00005028 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005029 name = ctxt->name;
 /* start tag could not be parsed: step over a pending '>' and retry */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005030 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005031 (name == NULL)) {
5032 if (CUR == '>')
5033 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005034 break;
5035 }
Owen Taylor3473f882001-02-23 17:55:21 +00005036
5037 /*
5038 * Lookup the info for that element.
5039 */
5040 info = htmlTagLookup(name);
5041 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005042 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5043 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005044 }
5045
5046 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005047 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005048 */
5049 if ((CUR == '/') && (NXT(1) == '>')) {
5050 SKIP(2);
5051 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5052 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005053 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005054 ctxt->instate = XML_PARSER_CONTENT;
5055#ifdef DEBUG_PUSH
5056 xmlGenericError(xmlGenericErrorContext,
5057 "HPP: entering CONTENT\n");
5058#endif
5059 break;
5060 }
5061
5062 if (CUR == '>') {
5063 NEXT;
5064 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005065 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5066 "Couldn't find end of Start Tag %s\n",
5067 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005068
5069 /*
5070 * end of parsing of this node.
5071 */
5072 if (xmlStrEqual(name, ctxt->name)) {
5073 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005074 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005075 }
5076
5077 ctxt->instate = XML_PARSER_CONTENT;
5078#ifdef DEBUG_PUSH
5079 xmlGenericError(xmlGenericErrorContext,
5080 "HPP: entering CONTENT\n");
5081#endif
5082 break;
5083 }
5084
5085 /*
5086 * Check for an Empty Element from DTD definition
5087 */
5088 if ((info != NULL) && (info->empty)) {
5089 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5090 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005091 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005092 }
5093 ctxt->instate = XML_PARSER_CONTENT;
5094#ifdef DEBUG_PUSH
5095 xmlGenericError(xmlGenericErrorContext,
5096 "HPP: entering CONTENT\n");
5097#endif
5098 break;
5099 }
5100 case XML_PARSER_CONTENT: {
 /* snapshot of ctxt->nbChars, compared below to detect lack of progress */
5101 long cons;
5102 /*
5103 * Handle preparsed entities and charRef
5104 */
5105 if (ctxt->token != 0) {
5106 xmlChar chr[2] = { 0 , 0 } ;
5107
5108 chr[0] = (xmlChar) ctxt->token;
5109 htmlCheckParagraph(ctxt);
5110 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5111 ctxt->sax->characters(ctxt->userData, chr, 1);
5112 ctxt->token = 0;
5113 ctxt->checkIndex = 0;
5114 }
 /*
  * Exactly one byte left on the last chunk: if it cannot start markup
  * or a reference, deliver it directly through SAX and consume it.
  */
5115 if ((avail == 1) && (terminate)) {
5116 cur = in->cur[0];
5117 if ((cur != '<') && (cur != '&')) {
5118 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005119 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005120 if (ctxt->sax->ignorableWhitespace != NULL)
5121 ctxt->sax->ignorableWhitespace(
5122 ctxt->userData, &cur, 1);
5123 } else {
5124 htmlCheckParagraph(ctxt);
5125 if (ctxt->sax->characters != NULL)
5126 ctxt->sax->characters(
5127 ctxt->userData, &cur, 1);
5128 }
5129 }
5130 ctxt->token = 0;
5131 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005132 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005133 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005134 }
Owen Taylor3473f882001-02-23 17:55:21 +00005135 }
5136 if (avail < 2)
5137 goto done;
5138 cur = in->cur[0];
5139 next = in->cur[1];
5140 cons = ctxt->nbChars;
5141 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5142 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5143 /*
5144 * Handle SCRIPT/STYLE separately
5145 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005146 if (!terminate) {
5147 int idx;
5148 xmlChar val;
5149
 /*
  * Wait until "</" is buffered and the byte following it
  * is non-zero before parsing the script/style content.
  */
Jiri Netolicky446e1262009-08-07 17:05:36 +02005150 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005151 if (idx < 0)
5152 goto done;
5153 val = in->cur[idx + 2];
5154 if (val == 0) /* bad cut of input */
5155 goto done;
5156 }
Owen Taylor3473f882001-02-23 17:55:21 +00005157 htmlParseScript(ctxt);
 /* cur/next were sampled before htmlParseScript() was called */
5158 if ((cur == '<') && (next == '/')) {
5159 ctxt->instate = XML_PARSER_END_TAG;
5160 ctxt->checkIndex = 0;
5161#ifdef DEBUG_PUSH
5162 xmlGenericError(xmlGenericErrorContext,
5163 "HPP: entering END_TAG\n");
5164#endif
5165 break;
5166 }
5167 } else {
5168 /*
5169 * Sometimes DOCTYPE arrives in the middle of the document
5170 */
5171 if ((cur == '<') && (next == '!') &&
5172 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5173 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5174 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5175 (UPP(8) == 'E')) {
5176 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005177 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005178 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005179 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5180 "Misplaced DOCTYPE declaration\n",
5181 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005182 htmlParseDocTypeDecl(ctxt);
5183 } else if ((cur == '<') && (next == '!') &&
5184 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5185 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005186 (htmlParseLookupSequence(
Jiri Netolicky446e1262009-08-07 17:05:36 +02005187 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005188 goto done;
5189#ifdef DEBUG_PUSH
5190 xmlGenericError(xmlGenericErrorContext,
5191 "HPP: Parsing Comment\n");
5192#endif
5193 htmlParseComment(ctxt);
5194 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005195 } else if ((cur == '<') && (next == '?')) {
5196 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005197 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005198 goto done;
5199#ifdef DEBUG_PUSH
5200 xmlGenericError(xmlGenericErrorContext,
5201 "HPP: Parsing PI\n");
5202#endif
5203 htmlParsePI(ctxt);
5204 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005205 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5206 goto done;
5207 } else if ((cur == '<') && (next == '/')) {
5208 ctxt->instate = XML_PARSER_END_TAG;
5209 ctxt->checkIndex = 0;
5210#ifdef DEBUG_PUSH
5211 xmlGenericError(xmlGenericErrorContext,
5212 "HPP: entering END_TAG\n");
5213#endif
5214 break;
5215 } else if (cur == '<') {
5216 ctxt->instate = XML_PARSER_START_TAG;
5217 ctxt->checkIndex = 0;
5218#ifdef DEBUG_PUSH
5219 xmlGenericError(xmlGenericErrorContext,
5220 "HPP: entering START_TAG\n");
5221#endif
5222 break;
5223 } else if (cur == '&') {
 /* reference: unless terminating, wait until a ';' is buffered */
5224 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005225 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005226 goto done;
5227#ifdef DEBUG_PUSH
5228 xmlGenericError(xmlGenericErrorContext,
5229 "HPP: Parsing Reference\n");
5230#endif
5231 /* TODO: check generation of subtrees if noent !!! */
5232 htmlParseReference(ctxt);
5233 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005234 /*
5235 * check that the text sequence is complete
5236 * before handing out the data to the parser
5237 * to avoid problems with erroneous end of
5238 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005239 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005240 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005241 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005242 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005243 ctxt->checkIndex = 0;
5244#ifdef DEBUG_PUSH
5245 xmlGenericError(xmlGenericErrorContext,
5246 "HPP: Parsing char data\n");
5247#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005248 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005249 }
5250 }
 /*
  * ctxt->nbChars unchanged since the snapshot: report an error when
  * inside a node, then invoke NEXT.
  * NOTE(review): presumably NEXT skips one character so the loop
  * cannot spin on the same input - confirm against the macro.
  */
5251 if (cons == ctxt->nbChars) {
5252 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005253 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5254 "detected an error in element content\n",
5255 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005256 }
5257 NEXT;
5258 break;
5259 }
5260
5261 break;
5262 }
5263 case XML_PARSER_END_TAG:
5264 if (avail < 2)
5265 goto done;
5266 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005267 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005268 goto done;
5269 htmlParseEndTag(ctxt);
 /* no open element left: only epilog content may follow */
5270 if (ctxt->nameNr == 0) {
5271 ctxt->instate = XML_PARSER_EPILOG;
5272 } else {
5273 ctxt->instate = XML_PARSER_CONTENT;
5274 }
5275 ctxt->checkIndex = 0;
5276#ifdef DEBUG_PUSH
5277 xmlGenericError(xmlGenericErrorContext,
5278 "HPP: entering CONTENT\n");
5279#endif
5280 break;
 /*
  * The remaining states are reported as internal errors; each one
  * recovers by resetting checkIndex and switching to CONTENT
  * (START_TAG for ATTRIBUTE_VALUE).
  */
5281 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005282 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5283 "HPP: internal error, state == CDATA\n",
5284 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005285 ctxt->instate = XML_PARSER_CONTENT;
5286 ctxt->checkIndex = 0;
5287#ifdef DEBUG_PUSH
5288 xmlGenericError(xmlGenericErrorContext,
5289 "HPP: entering CONTENT\n");
5290#endif
5291 break;
5292 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005293 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5294 "HPP: internal error, state == DTD\n",
5295 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005296 ctxt->instate = XML_PARSER_CONTENT;
5297 ctxt->checkIndex = 0;
5298#ifdef DEBUG_PUSH
5299 xmlGenericError(xmlGenericErrorContext,
5300 "HPP: entering CONTENT\n");
5301#endif
5302 break;
5303 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005304 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5305 "HPP: internal error, state == COMMENT\n",
5306 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005307 ctxt->instate = XML_PARSER_CONTENT;
5308 ctxt->checkIndex = 0;
5309#ifdef DEBUG_PUSH
5310 xmlGenericError(xmlGenericErrorContext,
5311 "HPP: entering CONTENT\n");
5312#endif
5313 break;
5314 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005315 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5316 "HPP: internal error, state == PI\n",
5317 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005318 ctxt->instate = XML_PARSER_CONTENT;
5319 ctxt->checkIndex = 0;
5320#ifdef DEBUG_PUSH
5321 xmlGenericError(xmlGenericErrorContext,
5322 "HPP: entering CONTENT\n");
5323#endif
5324 break;
5325 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005326 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5327 "HPP: internal error, state == ENTITY_DECL\n",
5328 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005329 ctxt->instate = XML_PARSER_CONTENT;
5330 ctxt->checkIndex = 0;
5331#ifdef DEBUG_PUSH
5332 xmlGenericError(xmlGenericErrorContext,
5333 "HPP: entering CONTENT\n");
5334#endif
5335 break;
5336 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005337 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5338 "HPP: internal error, state == ENTITY_VALUE\n",
5339 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005340 ctxt->instate = XML_PARSER_CONTENT;
5341 ctxt->checkIndex = 0;
 /*
  * NOTE(review): the debug trace below says "entering DTD" although
  * the state set above is XML_PARSER_CONTENT.
  */
5342#ifdef DEBUG_PUSH
5343 xmlGenericError(xmlGenericErrorContext,
5344 "HPP: entering DTD\n");
5345#endif
5346 break;
5347 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005348 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5349 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5350 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005351 ctxt->instate = XML_PARSER_START_TAG;
5352 ctxt->checkIndex = 0;
5353#ifdef DEBUG_PUSH
5354 xmlGenericError(xmlGenericErrorContext,
5355 "HPP: entering START_TAG\n");
5356#endif
5357 break;
5358 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005359 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5360 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5361 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005362 ctxt->instate = XML_PARSER_CONTENT;
5363 ctxt->checkIndex = 0;
5364#ifdef DEBUG_PUSH
5365 xmlGenericError(xmlGenericErrorContext,
5366 "HPP: entering CONTENT\n");
5367#endif
5368 break;
5369 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005370 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5371 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5372 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005373 ctxt->instate = XML_PARSER_CONTENT;
5374 ctxt->checkIndex = 0;
5375#ifdef DEBUG_PUSH
5376 xmlGenericError(xmlGenericErrorContext,
5377 "HPP: entering CONTENT\n");
5378#endif
5379 break;
 /*
  * NOTE(review): the error message below names XML_PARSER_LITERAL
  * while the state handled here is XML_PARSER_PUBLIC_LITERAL.
  */
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005380 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005381 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5382 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5383 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005384 ctxt->instate = XML_PARSER_CONTENT;
5385 ctxt->checkIndex = 0;
5386#ifdef DEBUG_PUSH
5387 xmlGenericError(xmlGenericErrorContext,
5388 "HPP: entering CONTENT\n");
5389#endif
5390 break;
5391
Owen Taylor3473f882001-02-23 17:55:21 +00005392 }
5393 }
5394done:
 /* same end-of-input handling as at the top of the loop */
5395 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005396 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005397 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5398 /*
5399 * SAX: end of the document processing.
5400 */
5401 ctxt->instate = XML_PARSER_EOF;
5402 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5403 ctxt->sax->endDocument(ctxt->userData);
5404 }
5405 }
 /*
  * When terminating, at EOF or in the epilog, give the document a
  * default HTML 4.0 Transitional internal subset if it has none.
  */
5406 if ((ctxt->myDoc != NULL) &&
5407 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5408 (ctxt->instate == XML_PARSER_EPILOG))) {
5409 xmlDtdPtr dtd;
5410 dtd = xmlGetIntSubset(ctxt->myDoc);
5411 if (dtd == NULL)
5412 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005413 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005414 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5415 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5416 }
5417#ifdef DEBUG_PUSH
5418 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5419#endif
 /* ret is never assigned after its initialization: always 0 */
5420 return(ret);
5421}
5422
5423/**
Owen Taylor3473f882001-02-23 17:55:21 +00005424 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005425 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005426 * @chunk: an char array
5427 * @size: the size in byte of the chunk
5428 * @terminate: last chunk indicator
5429 *
5430 * Parse a Chunk of memory
5431 *
5432 * Returns zero if no error, the xmlParserErrors otherwise.
5433 */
5434int
5435htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5436 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005437 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5438 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5439 "htmlParseChunk: context error\n", NULL, NULL);
5440 return(XML_ERR_INTERNAL_ERROR);
5441 }
Owen Taylor3473f882001-02-23 17:55:21 +00005442 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5443 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5444 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5445 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005446 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005447
Daniel Veillardd2755a82005-08-07 23:42:39 +00005448 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5449 if (res < 0) {
5450 ctxt->errNo = XML_PARSER_EOF;
5451 ctxt->disableSAX = 1;
5452 return (XML_PARSER_EOF);
5453 }
Owen Taylor3473f882001-02-23 17:55:21 +00005454 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5455 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005456 ctxt->input->end =
5457 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005458#ifdef DEBUG_PUSH
5459 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5460#endif
5461
Daniel Veillard14f752c2003-08-09 11:44:50 +00005462#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005463 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5464 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005465#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005466 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005467 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5468 xmlParserInputBufferPtr in = ctxt->input->buf;
5469 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5470 (in->raw != NULL)) {
5471 int nbchars;
5472
5473 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5474 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005475 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5476 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005477 return(XML_ERR_INVALID_ENCODING);
5478 }
5479 }
5480 }
Owen Taylor3473f882001-02-23 17:55:21 +00005481 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005482 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005483 if (terminate) {
5484 if ((ctxt->instate != XML_PARSER_EOF) &&
5485 (ctxt->instate != XML_PARSER_EPILOG) &&
5486 (ctxt->instate != XML_PARSER_MISC)) {
5487 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005488 ctxt->wellFormed = 0;
5489 }
5490 if (ctxt->instate != XML_PARSER_EOF) {
5491 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5492 ctxt->sax->endDocument(ctxt->userData);
5493 }
5494 ctxt->instate = XML_PARSER_EOF;
5495 }
5496 return((xmlParserErrors) ctxt->errNo);
5497}
5498
5499/************************************************************************
5500 * *
5501 * User entry points *
5502 * *
5503 ************************************************************************/
5504
5505/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005506 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005507 * @sax: a SAX handler
5508 * @user_data: The user data returned on SAX callbacks
5509 * @chunk: a pointer to an array of chars
5510 * @size: number of chars in the array
5511 * @filename: an optional file name or URI
5512 * @enc: an optional encoding
5513 *
5514 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005515 * The value of @filename is used for fetching external entities
5516 * and error/warning reports.
5517 *
5518 * Returns the new parser context or NULL
5519 */
5520htmlParserCtxtPtr
5521htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5522 const char *chunk, int size, const char *filename,
5523 xmlCharEncoding enc) {
5524 htmlParserCtxtPtr ctxt;
5525 htmlParserInputPtr inputStream;
5526 xmlParserInputBufferPtr buf;
5527
Daniel Veillardd0463562001-10-13 09:15:48 +00005528 xmlInitParser();
5529
Owen Taylor3473f882001-02-23 17:55:21 +00005530 buf = xmlAllocParserInputBuffer(enc);
5531 if (buf == NULL) return(NULL);
5532
Daniel Veillardf403d292003-10-05 13:51:35 +00005533 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005534 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005535 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005536 return(NULL);
5537 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005538 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5539 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005540 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005541 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005542 xmlFree(ctxt->sax);
5543 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5544 if (ctxt->sax == NULL) {
5545 xmlFree(buf);
5546 xmlFree(ctxt);
5547 return(NULL);
5548 }
5549 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5550 if (user_data != NULL)
5551 ctxt->userData = user_data;
5552 }
5553 if (filename == NULL) {
5554 ctxt->directory = NULL;
5555 } else {
5556 ctxt->directory = xmlParserGetDirectory(filename);
5557 }
5558
5559 inputStream = htmlNewInputStream(ctxt);
5560 if (inputStream == NULL) {
5561 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005562 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005563 return(NULL);
5564 }
5565
5566 if (filename == NULL)
5567 inputStream->filename = NULL;
5568 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005569 inputStream->filename = (char *)
5570 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005571 inputStream->buf = buf;
5572 inputStream->base = inputStream->buf->buffer->content;
5573 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005574 inputStream->end =
5575 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005576
5577 inputPush(ctxt, inputStream);
5578
5579 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5580 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005581 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5582 int cur = ctxt->input->cur - ctxt->input->base;
5583
Owen Taylor3473f882001-02-23 17:55:21 +00005584 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005585
5586 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5587 ctxt->input->cur = ctxt->input->base + cur;
5588 ctxt->input->end =
5589 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005590#ifdef DEBUG_PUSH
5591 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5592#endif
5593 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005594 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005595
5596 return(ctxt);
5597}
William M. Brack21e4ef22005-01-02 09:53:13 +00005598#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005599
5600/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005601 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005602 * @cur: a pointer to an array of xmlChar
5603 * @encoding: a free form C string describing the HTML document encoding, or NULL
5604 * @sax: the SAX handler block
5605 * @userData: if using SAX, this pointer will be provided on callbacks.
5606 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005607 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5608 * to handle parse events. If sax is NULL, fallback to the default DOM
5609 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005610 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005611 * Returns the resulting document tree unless SAX is NULL or the document is
5612 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005613 */
5614
5615htmlDocPtr
5616htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5617 htmlDocPtr ret;
5618 htmlParserCtxtPtr ctxt;
5619
Daniel Veillardd0463562001-10-13 09:15:48 +00005620 xmlInitParser();
5621
Owen Taylor3473f882001-02-23 17:55:21 +00005622 if (cur == NULL) return(NULL);
5623
5624
5625 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5626 if (ctxt == NULL) return(NULL);
5627 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005628 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005629 ctxt->sax = sax;
5630 ctxt->userData = userData;
5631 }
5632
5633 htmlParseDocument(ctxt);
5634 ret = ctxt->myDoc;
5635 if (sax != NULL) {
5636 ctxt->sax = NULL;
5637 ctxt->userData = NULL;
5638 }
5639 htmlFreeParserCtxt(ctxt);
5640
5641 return(ret);
5642}
5643
5644/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005645 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005646 * @cur: a pointer to an array of xmlChar
5647 * @encoding: a free form C string describing the HTML document encoding, or NULL
5648 *
5649 * parse an HTML in-memory document and build a tree.
5650 *
5651 * Returns the resulting document tree
5652 */
5653
5654htmlDocPtr
5655htmlParseDoc(xmlChar *cur, const char *encoding) {
5656 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5657}
5658
5659
5660/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005661 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005662 * @filename: the filename
5663 * @encoding: a free form C string describing the HTML document encoding, or NULL
5664 *
5665 * Create a parser context for a file content.
5666 * Automatic support for ZLIB/Compress compressed document is provided
5667 * by default if found at compile-time.
5668 *
5669 * Returns the new parser context or NULL
5670 */
5671htmlParserCtxtPtr
5672htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5673{
5674 htmlParserCtxtPtr ctxt;
5675 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005676 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005677 /* htmlCharEncoding enc; */
5678 xmlChar *content, *content_line = (xmlChar *) "charset=";
5679
Daniel Veillarda03e3652004-11-02 18:45:30 +00005680 if (filename == NULL)
5681 return(NULL);
5682
Daniel Veillardf403d292003-10-05 13:51:35 +00005683 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005684 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005685 return(NULL);
5686 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005687 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5688 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005689#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005690 if (xmlDefaultSAXHandler.error != NULL) {
5691 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5692 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005693#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005694 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005695 return(NULL);
5696 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005697
5698 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5699 xmlFree(canonicFilename);
5700 if (inputStream == NULL) {
5701 xmlFreeParserCtxt(ctxt);
5702 return(NULL);
5703 }
Owen Taylor3473f882001-02-23 17:55:21 +00005704
5705 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005706
Owen Taylor3473f882001-02-23 17:55:21 +00005707 /* set encoding */
5708 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005709 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005710 if (content) {
5711 strcpy ((char *)content, (char *)content_line);
5712 strcat ((char *)content, (char *)encoding);
5713 htmlCheckEncoding (ctxt, content);
5714 xmlFree (content);
5715 }
5716 }
5717
5718 return(ctxt);
5719}
5720
5721/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005722 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005723 * @filename: the filename
5724 * @encoding: a free form C string describing the HTML document encoding, or NULL
5725 * @sax: the SAX handler block
5726 * @userData: if using SAX, this pointer will be provided on callbacks.
5727 *
5728 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5729 * compressed document is provided by default if found at compile-time.
5730 * It use the given SAX function block to handle the parsing callback.
5731 * If sax is NULL, fallback to the default DOM tree building routines.
5732 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005733 * Returns the resulting document tree unless SAX is NULL or the document is
5734 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005735 */
5736
5737htmlDocPtr
5738htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5739 void *userData) {
5740 htmlDocPtr ret;
5741 htmlParserCtxtPtr ctxt;
5742 htmlSAXHandlerPtr oldsax = NULL;
5743
Daniel Veillardd0463562001-10-13 09:15:48 +00005744 xmlInitParser();
5745
Owen Taylor3473f882001-02-23 17:55:21 +00005746 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5747 if (ctxt == NULL) return(NULL);
5748 if (sax != NULL) {
5749 oldsax = ctxt->sax;
5750 ctxt->sax = sax;
5751 ctxt->userData = userData;
5752 }
5753
5754 htmlParseDocument(ctxt);
5755
5756 ret = ctxt->myDoc;
5757 if (sax != NULL) {
5758 ctxt->sax = oldsax;
5759 ctxt->userData = NULL;
5760 }
5761 htmlFreeParserCtxt(ctxt);
5762
5763 return(ret);
5764}
5765
5766/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005767 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005768 * @filename: the filename
5769 * @encoding: a free form C string describing the HTML document encoding, or NULL
5770 *
5771 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5772 * compressed document is provided by default if found at compile-time.
5773 *
5774 * Returns the resulting document tree
5775 */
5776
5777htmlDocPtr
5778htmlParseFile(const char *filename, const char *encoding) {
5779 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5780}
5781
5782/**
5783 * htmlHandleOmittedElem:
5784 * @val: int 0 or 1
5785 *
5786 * Set and return the previous value for handling HTML omitted tags.
5787 *
5788 * Returns the last value for 0 for no handling, 1 for auto insertion.
5789 */
5790
5791int
5792htmlHandleOmittedElem(int val) {
5793 int old = htmlOmittedDefaultValue;
5794
5795 htmlOmittedDefaultValue = val;
5796 return(old);
5797}
5798
Daniel Veillard930dfb62003-02-05 10:17:38 +00005799/**
5800 * htmlElementAllowedHere:
5801 * @parent: HTML parent element
5802 * @elt: HTML element
5803 *
5804 * Checks whether an HTML element may be a direct child of a parent element.
5805 * Note - doesn't check for deprecated elements
5806 *
5807 * Returns 1 if allowed; 0 otherwise.
5808 */
5809int
5810htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5811 const char** p ;
5812
5813 if ( ! elt || ! parent || ! parent->subelts )
5814 return 0 ;
5815
5816 for ( p = parent->subelts; *p; ++p )
5817 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5818 return 1 ;
5819
5820 return 0 ;
5821}
5822/**
5823 * htmlElementStatusHere:
5824 * @parent: HTML parent element
5825 * @elt: HTML element
5826 *
5827 * Checks whether an HTML element may be a direct child of a parent element.
5828 * and if so whether it is valid or deprecated.
5829 *
5830 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5831 */
5832htmlStatus
5833htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5834 if ( ! parent || ! elt )
5835 return HTML_INVALID ;
5836 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5837 return HTML_INVALID ;
5838
5839 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5840}
5841/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005842 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005843 * @elt: HTML element
5844 * @attr: HTML attribute
5845 * @legacy: whether to allow deprecated attributes
5846 *
5847 * Checks whether an attribute is valid for an element
5848 * Has full knowledge of Required and Deprecated attributes
5849 *
5850 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5851 */
5852htmlStatus
5853htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5854 const char** p ;
5855
5856 if ( !elt || ! attr )
5857 return HTML_INVALID ;
5858
5859 if ( elt->attrs_req )
5860 for ( p = elt->attrs_req; *p; ++p)
5861 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5862 return HTML_REQUIRED ;
5863
5864 if ( elt->attrs_opt )
5865 for ( p = elt->attrs_opt; *p; ++p)
5866 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5867 return HTML_VALID ;
5868
5869 if ( legacy && elt->attrs_depr )
5870 for ( p = elt->attrs_depr; *p; ++p)
5871 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5872 return HTML_DEPRECATED ;
5873
5874 return HTML_INVALID ;
5875}
5876/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005877 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005878 * @node: an htmlNodePtr in a tree
5879 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005880 * for Element nodes)
5881 *
5882 * Checks whether the tree node is valid. Experimental (the author
5883 * only uses the HTML enhancements in a SAX parser)
5884 *
5885 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5886 * legacy allowed) or htmlElementStatusHere (otherwise).
5887 * for Attribute nodes, a return from htmlAttrAllowed
5888 * for other nodes, HTML_NA (no checks performed)
5889 */
5890htmlStatus
5891htmlNodeStatus(const htmlNodePtr node, int legacy) {
5892 if ( ! node )
5893 return HTML_INVALID ;
5894
5895 switch ( node->type ) {
5896 case XML_ELEMENT_NODE:
5897 return legacy
5898 ? ( htmlElementAllowedHere (
5899 htmlTagLookup(node->parent->name) , node->name
5900 ) ? HTML_VALID : HTML_INVALID )
5901 : htmlElementStatusHere(
5902 htmlTagLookup(node->parent->name) ,
5903 htmlTagLookup(node->name) )
5904 ;
5905 case XML_ATTRIBUTE_NODE:
5906 return htmlAttrAllowed(
5907 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5908 default: return HTML_NA ;
5909 }
5910}
Daniel Veillard9475a352003-09-26 12:47:50 +00005911/************************************************************************
5912 * *
5913 * New set (2.6.0) of simpler and more flexible APIs *
5914 * *
5915 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. The expansion is a single statement, so the macro is safe
 * inside unbraced if/else bodies; write it followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5927
5928/**
5929 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005930 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005931 *
5932 * Reset a parser context
5933 */
5934void
5935htmlCtxtReset(htmlParserCtxtPtr ctxt)
5936{
5937 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005938 xmlDictPtr dict;
5939
5940 if (ctxt == NULL)
5941 return;
5942
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005943 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005944 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005945
5946 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5947 xmlFreeInputStream(input);
5948 }
5949 ctxt->inputNr = 0;
5950 ctxt->input = NULL;
5951
5952 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005953 if (ctxt->spaceTab != NULL) {
5954 ctxt->spaceTab[0] = -1;
5955 ctxt->space = &ctxt->spaceTab[0];
5956 } else {
5957 ctxt->space = NULL;
5958 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005959
5960
5961 ctxt->nodeNr = 0;
5962 ctxt->node = NULL;
5963
5964 ctxt->nameNr = 0;
5965 ctxt->name = NULL;
5966
5967 DICT_FREE(ctxt->version);
5968 ctxt->version = NULL;
5969 DICT_FREE(ctxt->encoding);
5970 ctxt->encoding = NULL;
5971 DICT_FREE(ctxt->directory);
5972 ctxt->directory = NULL;
5973 DICT_FREE(ctxt->extSubURI);
5974 ctxt->extSubURI = NULL;
5975 DICT_FREE(ctxt->extSubSystem);
5976 ctxt->extSubSystem = NULL;
5977 if (ctxt->myDoc != NULL)
5978 xmlFreeDoc(ctxt->myDoc);
5979 ctxt->myDoc = NULL;
5980
5981 ctxt->standalone = -1;
5982 ctxt->hasExternalSubset = 0;
5983 ctxt->hasPErefs = 0;
5984 ctxt->html = 1;
5985 ctxt->external = 0;
5986 ctxt->instate = XML_PARSER_START;
5987 ctxt->token = 0;
5988
5989 ctxt->wellFormed = 1;
5990 ctxt->nsWellFormed = 1;
5991 ctxt->valid = 1;
5992 ctxt->vctxt.userData = ctxt;
5993 ctxt->vctxt.error = xmlParserValidityError;
5994 ctxt->vctxt.warning = xmlParserValidityWarning;
5995 ctxt->record_info = 0;
5996 ctxt->nbChars = 0;
5997 ctxt->checkIndex = 0;
5998 ctxt->inSubset = 0;
5999 ctxt->errNo = XML_ERR_OK;
6000 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006001 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006002 ctxt->catalogs = NULL;
6003 xmlInitNodeInfoSeq(&ctxt->node_seq);
6004
6005 if (ctxt->attsDefault != NULL) {
6006 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6007 ctxt->attsDefault = NULL;
6008 }
6009 if (ctxt->attsSpecial != NULL) {
6010 xmlHashFree(ctxt->attsSpecial, NULL);
6011 ctxt->attsSpecial = NULL;
6012 }
6013}
6014
6015/**
6016 * htmlCtxtUseOptions:
6017 * @ctxt: an HTML parser context
6018 * @options: a combination of htmlParserOption(s)
6019 *
6020 * Applies the options to the parser context
6021 *
6022 * Returns 0 in case of success, the set of unknown or unimplemented options
6023 * in case of error.
6024 */
6025int
6026htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6027{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006028 if (ctxt == NULL)
6029 return(-1);
6030
Daniel Veillard9475a352003-09-26 12:47:50 +00006031 if (options & HTML_PARSE_NOWARNING) {
6032 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006033 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006034 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006035 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006036 }
6037 if (options & HTML_PARSE_NOERROR) {
6038 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006039 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006040 ctxt->sax->fatalError = NULL;
6041 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006042 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006043 }
6044 if (options & HTML_PARSE_PEDANTIC) {
6045 ctxt->pedantic = 1;
6046 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006047 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006048 } else
6049 ctxt->pedantic = 0;
6050 if (options & XML_PARSE_NOBLANKS) {
6051 ctxt->keepBlanks = 0;
6052 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6053 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006054 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006055 } else
6056 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006057 if (options & HTML_PARSE_RECOVER) {
6058 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006059 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006060 } else
6061 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006062 if (options & HTML_PARSE_COMPACT) {
6063 ctxt->options |= HTML_PARSE_COMPACT;
6064 options -= HTML_PARSE_COMPACT;
6065 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006066 ctxt->dictNames = 0;
6067 return (options);
6068}
6069
6070/**
6071 * htmlDoRead:
6072 * @ctxt: an HTML parser context
6073 * @URL: the base URL to use for the document
6074 * @encoding: the document encoding, or NULL
6075 * @options: a combination of htmlParserOption(s)
6076 * @reuse: keep the context for reuse
6077 *
6078 * Common front-end for the htmlRead functions
6079 *
6080 * Returns the resulting document tree or NULL
6081 */
6082static htmlDocPtr
6083htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6084 int options, int reuse)
6085{
6086 htmlDocPtr ret;
6087
6088 htmlCtxtUseOptions(ctxt, options);
6089 ctxt->html = 1;
6090 if (encoding != NULL) {
6091 xmlCharEncodingHandlerPtr hdlr;
6092
6093 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006094 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006095 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006096 if (ctxt->input->encoding != NULL)
6097 xmlFree((xmlChar *) ctxt->input->encoding);
6098 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6099 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006100 }
6101 if ((URL != NULL) && (ctxt->input != NULL) &&
6102 (ctxt->input->filename == NULL))
6103 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6104 htmlParseDocument(ctxt);
6105 ret = ctxt->myDoc;
6106 ctxt->myDoc = NULL;
6107 if (!reuse) {
6108 if ((ctxt->dictNames) &&
6109 (ret != NULL) &&
6110 (ret->dict == ctxt->dict))
6111 ctxt->dict = NULL;
6112 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006113 }
6114 return (ret);
6115}
6116
6117/**
6118 * htmlReadDoc:
6119 * @cur: a pointer to a zero terminated string
6120 * @URL: the base URL to use for the document
6121 * @encoding: the document encoding, or NULL
6122 * @options: a combination of htmlParserOption(s)
6123 *
6124 * parse an XML in-memory document and build a tree.
6125 *
6126 * Returns the resulting document tree
6127 */
6128htmlDocPtr
6129htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6130{
6131 htmlParserCtxtPtr ctxt;
6132
6133 if (cur == NULL)
6134 return (NULL);
6135
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006136 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006137 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006138 if (ctxt == NULL)
6139 return (NULL);
6140 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6141}
6142
6143/**
6144 * htmlReadFile:
6145 * @filename: a file or URL
6146 * @encoding: the document encoding, or NULL
6147 * @options: a combination of htmlParserOption(s)
6148 *
6149 * parse an XML file from the filesystem or the network.
6150 *
6151 * Returns the resulting document tree
6152 */
6153htmlDocPtr
6154htmlReadFile(const char *filename, const char *encoding, int options)
6155{
6156 htmlParserCtxtPtr ctxt;
6157
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006158 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006159 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6160 if (ctxt == NULL)
6161 return (NULL);
6162 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6163}
6164
6165/**
6166 * htmlReadMemory:
6167 * @buffer: a pointer to a char array
6168 * @size: the size of the array
6169 * @URL: the base URL to use for the document
6170 * @encoding: the document encoding, or NULL
6171 * @options: a combination of htmlParserOption(s)
6172 *
6173 * parse an XML in-memory document and build a tree.
6174 *
6175 * Returns the resulting document tree
6176 */
6177htmlDocPtr
6178htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6179{
6180 htmlParserCtxtPtr ctxt;
6181
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006182 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006183 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6184 if (ctxt == NULL)
6185 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006186 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006187 if (ctxt->sax != NULL)
6188 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006189 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6190}
6191
6192/**
6193 * htmlReadFd:
6194 * @fd: an open file descriptor
6195 * @URL: the base URL to use for the document
6196 * @encoding: the document encoding, or NULL
6197 * @options: a combination of htmlParserOption(s)
6198 *
6199 * parse an XML from a file descriptor and build a tree.
6200 *
6201 * Returns the resulting document tree
6202 */
6203htmlDocPtr
6204htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6205{
6206 htmlParserCtxtPtr ctxt;
6207 xmlParserInputBufferPtr input;
6208 xmlParserInputPtr stream;
6209
6210 if (fd < 0)
6211 return (NULL);
6212
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006213 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006214 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6215 if (input == NULL)
6216 return (NULL);
6217 ctxt = xmlNewParserCtxt();
6218 if (ctxt == NULL) {
6219 xmlFreeParserInputBuffer(input);
6220 return (NULL);
6221 }
6222 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6223 if (stream == NULL) {
6224 xmlFreeParserInputBuffer(input);
6225 xmlFreeParserCtxt(ctxt);
6226 return (NULL);
6227 }
6228 inputPush(ctxt, stream);
6229 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6230}
6231
6232/**
6233 * htmlReadIO:
6234 * @ioread: an I/O read function
6235 * @ioclose: an I/O close function
6236 * @ioctx: an I/O handler
6237 * @URL: the base URL to use for the document
6238 * @encoding: the document encoding, or NULL
6239 * @options: a combination of htmlParserOption(s)
6240 *
6241 * parse an HTML document from I/O functions and source and build a tree.
6242 *
6243 * Returns the resulting document tree
6244 */
6245htmlDocPtr
6246htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6247 void *ioctx, const char *URL, const char *encoding, int options)
6248{
6249 htmlParserCtxtPtr ctxt;
6250 xmlParserInputBufferPtr input;
6251 xmlParserInputPtr stream;
6252
6253 if (ioread == NULL)
6254 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006255 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006256
6257 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6258 XML_CHAR_ENCODING_NONE);
6259 if (input == NULL)
6260 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006261 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006262 if (ctxt == NULL) {
6263 xmlFreeParserInputBuffer(input);
6264 return (NULL);
6265 }
6266 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6267 if (stream == NULL) {
6268 xmlFreeParserInputBuffer(input);
6269 xmlFreeParserCtxt(ctxt);
6270 return (NULL);
6271 }
6272 inputPush(ctxt, stream);
6273 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6274}
6275
6276/**
6277 * htmlCtxtReadDoc:
6278 * @ctxt: an HTML parser context
6279 * @cur: a pointer to a zero terminated string
6280 * @URL: the base URL to use for the document
6281 * @encoding: the document encoding, or NULL
6282 * @options: a combination of htmlParserOption(s)
6283 *
6284 * parse an XML in-memory document and build a tree.
6285 * This reuses the existing @ctxt parser context
6286 *
6287 * Returns the resulting document tree
6288 */
6289htmlDocPtr
6290htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6291 const char *URL, const char *encoding, int options)
6292{
6293 xmlParserInputPtr stream;
6294
6295 if (cur == NULL)
6296 return (NULL);
6297 if (ctxt == NULL)
6298 return (NULL);
6299
6300 htmlCtxtReset(ctxt);
6301
6302 stream = xmlNewStringInputStream(ctxt, cur);
6303 if (stream == NULL) {
6304 return (NULL);
6305 }
6306 inputPush(ctxt, stream);
6307 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6308}
6309
6310/**
6311 * htmlCtxtReadFile:
6312 * @ctxt: an HTML parser context
6313 * @filename: a file or URL
6314 * @encoding: the document encoding, or NULL
6315 * @options: a combination of htmlParserOption(s)
6316 *
6317 * parse an XML file from the filesystem or the network.
6318 * This reuses the existing @ctxt parser context
6319 *
6320 * Returns the resulting document tree
6321 */
6322htmlDocPtr
6323htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6324 const char *encoding, int options)
6325{
6326 xmlParserInputPtr stream;
6327
6328 if (filename == NULL)
6329 return (NULL);
6330 if (ctxt == NULL)
6331 return (NULL);
6332
6333 htmlCtxtReset(ctxt);
6334
Daniel Veillard29614c72004-11-26 10:47:26 +00006335 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006336 if (stream == NULL) {
6337 return (NULL);
6338 }
6339 inputPush(ctxt, stream);
6340 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6341}
6342
6343/**
6344 * htmlCtxtReadMemory:
6345 * @ctxt: an HTML parser context
6346 * @buffer: a pointer to a char array
6347 * @size: the size of the array
6348 * @URL: the base URL to use for the document
6349 * @encoding: the document encoding, or NULL
6350 * @options: a combination of htmlParserOption(s)
6351 *
6352 * parse an XML in-memory document and build a tree.
6353 * This reuses the existing @ctxt parser context
6354 *
6355 * Returns the resulting document tree
6356 */
6357htmlDocPtr
6358htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6359 const char *URL, const char *encoding, int options)
6360{
6361 xmlParserInputBufferPtr input;
6362 xmlParserInputPtr stream;
6363
6364 if (ctxt == NULL)
6365 return (NULL);
6366 if (buffer == NULL)
6367 return (NULL);
6368
6369 htmlCtxtReset(ctxt);
6370
6371 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6372 if (input == NULL) {
6373 return(NULL);
6374 }
6375
6376 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6377 if (stream == NULL) {
6378 xmlFreeParserInputBuffer(input);
6379 return(NULL);
6380 }
6381
6382 inputPush(ctxt, stream);
6383 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6384}
6385
6386/**
6387 * htmlCtxtReadFd:
6388 * @ctxt: an HTML parser context
6389 * @fd: an open file descriptor
6390 * @URL: the base URL to use for the document
6391 * @encoding: the document encoding, or NULL
6392 * @options: a combination of htmlParserOption(s)
6393 *
6394 * parse an XML from a file descriptor and build a tree.
6395 * This reuses the existing @ctxt parser context
6396 *
6397 * Returns the resulting document tree
6398 */
6399htmlDocPtr
6400htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6401 const char *URL, const char *encoding, int options)
6402{
6403 xmlParserInputBufferPtr input;
6404 xmlParserInputPtr stream;
6405
6406 if (fd < 0)
6407 return (NULL);
6408 if (ctxt == NULL)
6409 return (NULL);
6410
6411 htmlCtxtReset(ctxt);
6412
6413
6414 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6415 if (input == NULL)
6416 return (NULL);
6417 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6418 if (stream == NULL) {
6419 xmlFreeParserInputBuffer(input);
6420 return (NULL);
6421 }
6422 inputPush(ctxt, stream);
6423 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6424}
6425
6426/**
6427 * htmlCtxtReadIO:
6428 * @ctxt: an HTML parser context
6429 * @ioread: an I/O read function
6430 * @ioclose: an I/O close function
6431 * @ioctx: an I/O handler
6432 * @URL: the base URL to use for the document
6433 * @encoding: the document encoding, or NULL
6434 * @options: a combination of htmlParserOption(s)
6435 *
6436 * parse an HTML document from I/O functions and source and build a tree.
6437 * This reuses the existing @ctxt parser context
6438 *
6439 * Returns the resulting document tree
6440 */
6441htmlDocPtr
6442htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6443 xmlInputCloseCallback ioclose, void *ioctx,
6444 const char *URL,
6445 const char *encoding, int options)
6446{
6447 xmlParserInputBufferPtr input;
6448 xmlParserInputPtr stream;
6449
6450 if (ioread == NULL)
6451 return (NULL);
6452 if (ctxt == NULL)
6453 return (NULL);
6454
6455 htmlCtxtReset(ctxt);
6456
6457 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6458 XML_CHAR_ENCODING_NONE);
6459 if (input == NULL)
6460 return (NULL);
6461 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6462 if (stream == NULL) {
6463 xmlFreeParserInputBuffer(input);
6464 return (NULL);
6465 }
6466 inputPush(ctxt, stream);
6467 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6468}
6469
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006470#define bottom_HTMLparser
6471#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006472#endif /* LIBXML_HTML_ENABLED */